No OneTemporary
Actions

Size

6 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: vendor/llvm/dist-release_60/cmake/modules/LLVMConfig.cmake.in
	===================================================================
	--- vendor/llvm/dist-release_60/cmake/modules/LLVMConfig.cmake.in (revision 328361)
	+++ vendor/llvm/dist-release_60/cmake/modules/LLVMConfig.cmake.in (revision 328362)
	@@ -1,91 +1,93 @@
	# This file provides information and services to the final user.
	#
	# NOTE(review): every @NAME@ token below is a placeholder; this is a ".in"
	# template, so the values are presumably substituted when the file is
	# configured (configure_file-style) -- confirm against the generating
	# CMakeLists.

	@LLVM_CONFIG_CODE@

	# Version of the LLVM package described by this config file.
	set(LLVM_VERSION_MAJOR @LLVM_VERSION_MAJOR@)
	set(LLVM_VERSION_MINOR @LLVM_VERSION_MINOR@)
	set(LLVM_VERSION_PATCH @LLVM_VERSION_PATCH@)
	set(LLVM_PACKAGE_VERSION @PACKAGE_VERSION@)

	set(LLVM_BUILD_TYPE @CMAKE_BUILD_TYPE@)

	set(LLVM_COMMON_DEPENDS @LLVM_COMMON_DEPENDS@)

	set(LLVM_AVAILABLE_LIBS @LLVM_AVAILABLE_LIBS@)

	set(LLVM_ALL_TARGETS @LLVM_ALL_TARGETS@)

	set(LLVM_TARGETS_TO_BUILD @LLVM_TARGETS_TO_BUILD@)

	set(LLVM_TARGETS_WITH_JIT @LLVM_TARGETS_WITH_JIT@)

	@all_llvm_lib_deps@

	set(TARGET_TRIPLE "@TARGET_TRIPLE@")

	# Feature switches recorded from the LLVM build, one variable per option.
	set(LLVM_ABI_BREAKING_CHECKS @LLVM_ABI_BREAKING_CHECKS@)

	set(LLVM_ENABLE_ASSERTIONS @LLVM_ENABLE_ASSERTIONS@)

	set(LLVM_ENABLE_EH @LLVM_ENABLE_EH@)

	set(LLVM_ENABLE_RTTI @LLVM_ENABLE_RTTI@)

	set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@)

	set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)

	set(LLVM_ENABLE_ZLIB @LLVM_ENABLE_ZLIB@)

	+set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@)
	+
	set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@)

	set(LLVM_NATIVE_ARCH @LLVM_NATIVE_ARCH@)

	set(LLVM_ENABLE_PIC @LLVM_ENABLE_PIC@)

	set(LLVM_BUILD_32_BITS @LLVM_BUILD_32_BITS@)

	# Only define LLVM_PTHREAD_LIB when the substituted value is non-empty.
	if (NOT "@LLVM_PTHREAD_LIB@" STREQUAL "")
	set(LLVM_PTHREAD_LIB "@LLVM_PTHREAD_LIB@")
	endif()

	set(LLVM_ENABLE_PLUGINS @LLVM_ENABLE_PLUGINS@)
	set(LLVM_EXPORT_SYMBOLS_FOR_PLUGINS @LLVM_EXPORT_SYMBOLS_FOR_PLUGINS@)
	set(LLVM_PLUGIN_EXT @LLVM_PLUGIN_EXT@)

	set(LLVM_ON_UNIX @LLVM_ON_UNIX@)
	set(LLVM_ON_WIN32 @LLVM_ON_WIN32@)

	set(LLVM_LIBDIR_SUFFIX @LLVM_LIBDIR_SUFFIX@)

	set(LLVM_INCLUDE_DIRS "@LLVM_CONFIG_INCLUDE_DIRS@")
	set(LLVM_LIBRARY_DIRS "@LLVM_CONFIG_LIBRARY_DIRS@")

	# These variables are duplicated, but they must match the LLVM variables of the
	# same name. The variables ending in "S" could some day become lists, and are
	# preserved for convention and compatibility.
	set(LLVM_INCLUDE_DIR "@LLVM_CONFIG_INCLUDE_DIRS@")
	set(LLVM_LIBRARY_DIR "@LLVM_CONFIG_LIBRARY_DIRS@")

	set(LLVM_DEFINITIONS "@LLVM_DEFINITIONS@")
	set(LLVM_CMAKE_DIR "@LLVM_CONFIG_CMAKE_DIR@")
	set(LLVM_BINARY_DIR "@LLVM_CONFIG_BINARY_DIR@")
	set(LLVM_TOOLS_BINARY_DIR "@LLVM_CONFIG_TOOLS_BINARY_DIR@")
	set(LLVM_TOOLS_INSTALL_DIR "@LLVM_TOOLS_INSTALL_DIR@")
	set(LLVM_HAVE_OPT_VIEWER_MODULES @LLVM_HAVE_OPT_VIEWER_MODULES@)

	# Load the exports file only when the LLVMSupport target is not already
	# known, so the include below is skipped if the targets already exist.
	if(NOT TARGET LLVMSupport)
	set(LLVM_EXPORTED_TARGETS "@LLVM_CONFIG_EXPORTS@")
	include("@LLVM_CONFIG_EXPORTS_FILE@")
	@llvm_config_include_buildtree_only_exports@
	endif()

	# By creating intrinsics_gen here, subprojects that depend on LLVM's
	# tablegen-generated headers can always depend on this target whether building
	# in-tree with LLVM or not.
	if(NOT TARGET intrinsics_gen)
	add_custom_target(intrinsics_gen)
	endif()

	# Record that targets are configured (global property), then pull in the
	# LLVM-Config.cmake module from LLVM_CMAKE_DIR.
	set_property(GLOBAL PROPERTY LLVM_TARGETS_CONFIGURED On)
	include(${LLVM_CMAKE_DIR}/LLVM-Config.cmake)
	Index: vendor/llvm/dist-release_60/docs/ReleaseNotes.rst
	===================================================================
	--- vendor/llvm/dist-release_60/docs/ReleaseNotes.rst (revision 328361)
	+++ vendor/llvm/dist-release_60/docs/ReleaseNotes.rst (revision 328362)
	@@ -1,133 +1,179 @@
	========================
	LLVM 6.0.0 Release Notes
	========================

	.. contents::
	:local:

	.. warning::
	These are in-progress notes for the upcoming LLVM 6 release.
	Release notes for previous releases can be found on
	`the Download Page <http://releases.llvm.org/download.html>`_.


	Introduction
	============

	This document contains the release notes for the LLVM Compiler Infrastructure,
	release 6.0.0. Here we describe the status of LLVM, including major improvements
	from the previous release, improvements in various subprojects of LLVM, and
	some of the current users of the code. All LLVM releases may be downloaded
	from the `LLVM releases web site <http://llvm.org/releases/>`_.

	For more information about LLVM, including information about the latest
	release, please check out the `main LLVM web site <http://llvm.org/>`_. If you
	have questions or comments, the `LLVM Developer's Mailing List
	<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
	them.

	Note that if you are reading this file from a Subversion checkout or the main
	LLVM web page, this document applies to the next release, not the current
	one. To see the release notes for a specific release, please see the `releases
	page <http://llvm.org/releases/>`_.

	Non-comprehensive list of changes in this release
	=================================================
	.. NOTE
	For small 1-3 sentence descriptions, just add an entry at the end of
	this list. If your description won't fit comfortably in one bullet
	point (e.g. maybe you would like to give an example of the
	functionality, or simply have a lot to talk about), see the `NOTE` below
	for adding a new subsection.

	* The ``Redirects`` argument of ``llvm::sys::ExecuteAndWait`` and
	``llvm::sys::ExecuteNoWait`` was changed to an ``ArrayRef`` of optional
	``StringRef``'s to make it safer and more convenient to use.

	* The backend name was added to the Target Registry to allow run-time
	information to be fed back into TableGen. Out-of-tree targets will need to add
	the name used in the `def X : Target` definition to the call to
	`RegisterTarget`.

	* The ``Debugify`` pass was added to ``opt`` to facilitate testing of debug
	info preservation. This pass attaches synthetic ``DILocations`` and
	``DIVariables`` to the instructions in a ``Module``. The ``CheckDebugify``
	pass determines how much of the metadata is lost.

	+* Significantly improved quality of CodeView debug info for Windows.
	+
	* Note..

	.. NOTE
	If you would like to document a larger change, then you can add a
	subsection about it right here. You can copy the following boilerplate
	and un-indent it (the indentation causes it to be inside this comment).

	Special New Feature
	-------------------

	Makes programs 10x faster by doing Special New Thing.

	Changes to the LLVM IR
	----------------------

	-Changes to the ARM Backend
	---------------------------
	+Changes to the ARM Target
	+-------------------------

	- During this release ...
	+During this release the ARM target has:

	+* Got support for enabling SjLj exception handling on platforms where it
	+ isn't the default.

	+
	Changes to the MIPS Target
	--------------------------

	During this release ...


	Changes to the PowerPC Target
	-----------------------------

	During this release ...

	Changes to the X86 Target
	-------------------------

	- During this release ...
	+During this release the X86 target has:

	+* Got support for enabling SjLj exception handling on platforms where it
	+ isn't the default.
	+
	Changes to the AMDGPU Target
	-----------------------------

	During this release ...

	Changes to the AVR Target
	-----------------------------

	During this release ...

	Changes to the OCaml bindings
	-----------------------------

	During this release ...


	Changes to the C API
	--------------------

	During this release ...


	External Open Source Projects Using LLVM 6
	==========================================

	-* A project...
	+JFS - JIT Fuzzing Solver
	+------------------------

	+`JFS <https://github.com/delcypher/jfs>`_ is an experimental constraint solver
	+designed to investigate using coverage guided fuzzing as an incomplete strategy
	+for solving boolean, BitVector, and floating-point constraints.
	+It is built on top of LLVM, Clang, LibFuzzer, and Z3.
	+
	+The solver works by generating a C++ program where the reachability of an
	+`abort()` statement is equivalent to finding a satisfying assignment to the
	+constraints. This program is then compiled by Clang with `SanitizerCoverage
	+<https://releases.llvm.org/6.0.0/tools/clang/docs/SanitizerCoverage.html>`_
	+instrumentation and then fuzzed using :doc:`LibFuzzer <LibFuzzer>`.
	+
	+Zig Programming Language
	+------------------------
	+
	+`Zig <http://ziglang.org>`_ is an open-source programming language designed
	+for robustness, optimality, and clarity. It is intended to replace C. It
	+provides high level features such as Generics,
	+Compile Time Function Execution, and Partial Evaluation, yet exposes low level
	+LLVM IR features such as Aliases. Zig uses Clang to provide automatic
	+import of .h symbols - even inline functions and macros. Zig uses LLD combined
	+with lazily building compiler-rt to provide out-of-the-box cross-compiling for
	+all supported targets.
	+
	+LDC - the LLVM-based D compiler
	+-------------------------------
	+
	+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
	+pragmatically combines efficiency, control, and modeling power, with safety and
	+programmer productivity. D supports powerful concepts like Compile-Time Function
	+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
	+to concurrency and offers many classical paradigms.
	+
	+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
	+combined with LLVM as backend to produce efficient native code. LDC targets
	+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
	+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
	+are underway.

	Additional Information
	======================

	A wide variety of additional information is available on the `LLVM web page
	<http://llvm.org/>`_, in particular in the `documentation
	<http://llvm.org/docs/>`_ section. The web page also contains versions of the
	API documentation which is up-to-date with the Subversion version of the source
	code. You can access versions of these documents specific to this release by
	going into the ``llvm/docs/`` directory in the LLVM tree.

	If you have any questions or comments about LLVM, please feel free to contact
	us via the `mailing lists <http://llvm.org/docs/#maillist>`_.
	Index: vendor/llvm/dist-release_60/include/llvm/Analysis/RegionInfoImpl.h
	===================================================================
	--- vendor/llvm/dist-release_60/include/llvm/Analysis/RegionInfoImpl.h (revision 328361)
	+++ vendor/llvm/dist-release_60/include/llvm/Analysis/RegionInfoImpl.h (revision 328362)
	@@ -1,931 +1,931 @@
	//===- RegionInfoImpl.h - SESE region detection analysis --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// Detects single entry single exit regions in the control flow graph.
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_ANALYSIS_REGIONINFOIMPL_H
	#define LLVM_ANALYSIS_REGIONINFOIMPL_H

	#include "llvm/ADT/GraphTraits.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/DominanceFrontier.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/PostDominators.h"
	#include "llvm/Analysis/RegionInfo.h"
	#include "llvm/Analysis/RegionIterator.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <iterator>
	#include <memory>
	#include <set>
	#include <string>
	#include <type_traits>
	#include <vector>

	#define DEBUG_TYPE "region"

	namespace llvm {

	//===----------------------------------------------------------------------===//
	/// RegionBase Implementation
	/// Construct a region from \p Entry to \p Exit.
	///
	/// The RegionInfo (\p RInfo) and dominator tree (\p dt) pointers are stored
	/// as given; \p Parent is forwarded to the RegionNodeBase base class.
	template <class Tr>
	RegionBase<Tr>::RegionBase(BlockT *Entry, BlockT *Exit,
	                           typename Tr::RegionInfoT *RInfo, DomTreeT *dt,
	                           RegionT *Parent)
	    : RegionNodeBase<Tr>(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {}

	/// Destructor: drops the block-node cache owned by this region.
	template <class Tr>
	RegionBase<Tr>::~RegionBase() {
	// Only clean the cache for this Region. Caches of child Regions will be
	// cleaned when the child Regions are deleted.
	BBNodeMap.clear();
	}

	/// Make \p BB the entry block of this region only; children are not
	/// touched.
	template <class Tr>
	void RegionBase<Tr>::replaceEntry(BlockT *BB) {
	this->entry.setPointer(BB);
	}

	/// Make \p BB the exit block of this region only.
	///
	/// Asserts that the region currently has an exit.
	template <class Tr>
	void RegionBase<Tr>::replaceExit(BlockT *BB) {
	assert(exit && "No exit to replace!");
	exit = BB;
	}

	/// Replace the entry of this region with \p NewEntry, and do the same for
	/// every nested region that is reached through children sharing the old
	/// entry block.
	template <class Tr>
	void RegionBase<Tr>::replaceEntryRecursive(BlockT *NewEntry) {
	// Explicit worklist (used as a stack) instead of recursion.
	std::vector<RegionT *> RegionQueue;
	// Remember the old entry before it is overwritten below.
	BlockT *OldEntry = getEntry();

	RegionQueue.push_back(static_cast<RegionT *>(this));
	while (!RegionQueue.empty()) {
	RegionT *R = RegionQueue.back();
	RegionQueue.pop_back();

	R->replaceEntry(NewEntry);
	// Only descend into children that started at the old entry.
	for (std::unique_ptr<RegionT> &Child : *R) {
	if (Child->getEntry() == OldEntry)
	RegionQueue.push_back(Child.get());
	}
	}
	}

	/// Replace the exit of this region with \p NewExit, and do the same for
	/// every nested region that is reached through children sharing the old
	/// exit block.
	template <class Tr>
	void RegionBase<Tr>::replaceExitRecursive(BlockT *NewExit) {
	// Explicit worklist (used as a stack) instead of recursion.
	std::vector<RegionT *> RegionQueue;
	// Remember the old exit before it is overwritten below.
	BlockT *OldExit = getExit();

	RegionQueue.push_back(static_cast<RegionT *>(this));
	while (!RegionQueue.empty()) {
	RegionT *R = RegionQueue.back();
	RegionQueue.pop_back();

	R->replaceExit(NewExit);
	// Only descend into children that ended at the old exit.
	for (std::unique_ptr<RegionT> &Child : *R) {
	if (Child->getExit() == OldExit)
	RegionQueue.push_back(Child.get());
	}
	}
	}

	/// Return true if block \p B belongs to this region.
	///
	/// A block without a dominator-tree node is never contained. A region
	/// without an exit (the top-level region) contains every remaining block.
	/// Otherwise \p B must be dominated by the entry, and must not be dominated
	/// by the exit when the entry also dominates the exit.
	template <class Tr>
	bool RegionBase<Tr>::contains(const BlockT *B) const {
	  // The dominator tree queries below take non-const block pointers.
	  BlockT *BB = const_cast<BlockT *>(B);

	  if (!DT->getNode(BB))
	    return false;

	  BlockT *entry = getEntry(), *exit = getExit();

	  // Toplevel region.
	  if (!exit)
	    return true;

	  return (DT->dominates(entry, BB) &&
	          !(DT->dominates(exit, BB) && DT->dominates(entry, exit)));
	}

	/// Return true if loop \p L lies inside this region.
	///
	/// A null \p L is contained only by a region without an exit. Otherwise
	/// the loop header and every exiting block of the loop must be contained.
	template <class Tr>
	bool RegionBase<Tr>::contains(const LoopT *L) const {
	// BBs that are not part of any loop are elements of the loop
	// described by the NULL pointer. This loop is not part of any region,
	// except if the region describes the whole function.
	if (!L)
	return getExit() == nullptr;

	if (!contains(L->getHeader()))
	return false;

	SmallVector<BlockT *, 8> ExitingBlocks;
	L->getExitingBlocks(ExitingBlocks);

	for (BlockT *BB : ExitingBlocks) {
	if (!contains(BB))
	return false;
	}

	return true;
	}

	/// Return the outermost loop enclosing \p L that is still contained in this
	/// region, or nullptr if \p L itself is not contained.
	template <class Tr>
	typename Tr::LoopT *RegionBase<Tr>::outermostLoopInRegion(LoopT *L) const {
	  if (!contains(L))
	    return nullptr;

	  // Climb the loop nest while the parent loop is still inside the region.
	  while (L && contains(L->getParentLoop())) {
	    L = L->getParentLoop();
	  }

	  return L;
	}

	/// Convenience overload: look up the loop of \p BB in \p LI and return the
	/// outermost enclosing loop contained in this region.
	///
	/// Both \p LI and \p BB must be non-null (asserted).
	template <class Tr>
	typename Tr::LoopT *RegionBase<Tr>::outermostLoopInRegion(LoopInfoT *LI,
	                                                          BlockT *BB) const {
	  assert(LI && BB && "LI and BB cannot be null!");
	  LoopT *L = LI->getLoopFor(BB);
	  return outermostLoopInRegion(L);
	}

	/// Return the single block outside the region that branches to the entry.
	///
	/// Predecessors without a dominator-tree node are ignored. Returns nullptr
	/// when there is no such predecessor or when there is more than one.
	template <class Tr>
	typename RegionBase<Tr>::BlockT *RegionBase<Tr>::getEnteringBlock() const {
	BlockT *entry = getEntry();
	BlockT *enteringBlock = nullptr;

	for (BlockT *Pred : make_range(InvBlockTraits::child_begin(entry),
	InvBlockTraits::child_end(entry))) {
	if (DT->getNode(Pred) && !contains(Pred)) {
	// A second outside predecessor means the entering block is not unique.
	if (enteringBlock)
	return nullptr;

	enteringBlock = Pred;
	}
	}

	return enteringBlock;
	}

	/// Append to \p Exitings every predecessor of the exit block that lies
	/// inside this region.
	///
	/// \returns true if all predecessors of the exit are inside the region (or
	/// the region has no exit), false if at least one predecessor is outside.
	template <class Tr>
	bool RegionBase<Tr>::getExitingBlocks(
	    SmallVectorImpl<BlockT *> &Exitings) const {
	  bool CoverAll = true;

	  if (!exit)
	    return CoverAll;

	  for (PredIterTy PI = InvBlockTraits::child_begin(exit),
	                  PE = InvBlockTraits::child_end(exit);
	       PI != PE; ++PI) {
	    BlockT *Pred = *PI;
	    if (contains(Pred)) {
	      Exitings.push_back(Pred);
	      continue;
	    }

	    // This predecessor reaches the exit from outside the region.
	    CoverAll = false;
	  }

	  return CoverAll;
	}

	/// Return the single block inside the region that branches to the exit.
	///
	/// Returns nullptr when the region has no exit, when no predecessor of the
	/// exit is inside the region, or when more than one is.
	template <class Tr>
	typename RegionBase<Tr>::BlockT *RegionBase<Tr>::getExitingBlock() const {
	BlockT *exit = getExit();
	BlockT *exitingBlock = nullptr;

	if (!exit)
	return nullptr;

	for (BlockT *Pred : make_range(InvBlockTraits::child_begin(exit),
	InvBlockTraits::child_end(exit))) {
	if (contains(Pred)) {
	// A second inside predecessor means the exiting block is not unique.
	if (exitingBlock)
	return nullptr;

	exitingBlock = Pred;
	}
	}

	return exitingBlock;
	}

	/// A region is simple when it is not the top-level region and has both a
	/// unique entering block and a unique exiting block.
	template <class Tr>
	bool RegionBase<Tr>::isSimple() const {
	return !isTopLevelRegion() && getEnteringBlock() && getExitingBlock();
	}

	/// Build a printable name of the form "<entry> => <exit>".
	///
	/// Unnamed blocks are rendered through printAsOperand(); a region without
	/// an exit uses the text "<Function Return>" for the exit part.
	template <class Tr>
	std::string RegionBase<Tr>::getNameStr() const {
	std::string exitName;
	std::string entryName;

	if (getEntry()->getName().empty()) {
	raw_string_ostream OS(entryName);

	getEntry()->printAsOperand(OS, false);
	} else
	entryName = getEntry()->getName();

	if (getExit()) {
	if (getExit()->getName().empty()) {
	raw_string_ostream OS(exitName);

	getExit()->printAsOperand(OS, false);
	} else
	exitName = getExit()->getName();
	} else
	exitName = "<Function Return>";

	return entryName + " => " + exitName;
	}

	/// Check the region invariants for a single block \p BB.
	///
	/// \p BB must be contained in the region, every successor must be either
	/// contained or the exit block, and (unless \p BB is the entry) every
	/// predecessor must be contained. The lines marked '-'/'+' are the change
	/// carried by this diff: a violation is reported with report_fatal_error
	/// instead of llvm_unreachable.
	template <class Tr>
	void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
	if (!contains(BB))
	- llvm_unreachable("Broken region found: enumerated BB not in region!");
	+ report_fatal_error("Broken region found: enumerated BB not in region!");

	BlockT *entry = getEntry(), *exit = getExit();

	for (BlockT *Succ :
	make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
	if (!contains(Succ) && exit != Succ)
	- llvm_unreachable("Broken region found: edges leaving the region must go "
	- "to the exit node!");
	+ report_fatal_error("Broken region found: edges leaving the region must go "
	+ "to the exit node!");
	}

	if (entry != BB) {
	for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB),
	InvBlockTraits::child_end(BB))) {
	if (!contains(Pred))
	- llvm_unreachable("Broken region found: edges entering the region must "
	- "go to the entry node!");
	+ report_fatal_error("Broken region found: edges entering the region must "
	+ "go to the entry node!");
	}
	}
	}

	/// Depth-first walk from \p BB that verifies every block reachable without
	/// passing through the exit block.
	///
	/// \p visited collects the blocks already checked so each one is verified
	/// once.
	template <class Tr>
	void RegionBase<Tr>::verifyWalk(BlockT *BB, std::set<BlockT *> *visited) const {
	  BlockT *exit = getExit();

	  visited->insert(BB);

	  verifyBBInRegion(BB);

	  for (BlockT *Succ :
	       make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
	    // Stop at the exit and do not revisit blocks.
	    if (Succ != exit && visited->find(Succ) == visited->end())
	      verifyWalk(Succ, visited);
	  }
	}

	/// Verify this region by walking its blocks from the entry.
	///
	/// Does nothing unless RegionInfoBase<Tr>::VerifyRegionInfo is set.
	template <class Tr>
	void RegionBase<Tr>::verifyRegion() const {
	// Only do verification when the user wants to, otherwise this expensive
	// check will be invoked by PMDataManager::verifyPreservedAnalysis whenever
	// a region pass (marked PreservedAll) finishes.
	if (!RegionInfoBase<Tr>::VerifyRegionInfo)
	return;

	std::set<BlockT *> visited;
	verifyWalk(getEntry(), &visited);
	}

	/// Verify all child regions recursively, then this region itself.
	template <class Tr>
	void RegionBase<Tr>::verifyRegionNest() const {
	for (const std::unique_ptr<RegionT> &R : *this)
	R->verifyRegionNest();

	verifyRegion();
	}

	/// Begin iterator over the elements of this region, obtained from
	/// GraphTraits<RegionT *>.
	template <class Tr>
	typename RegionBase<Tr>::element_iterator RegionBase<Tr>::element_begin() {
	  return GraphTraits<RegionT *>::nodes_begin(static_cast<RegionT *>(this));
	}

	/// End iterator over the elements of this region, obtained from
	/// GraphTraits<RegionT *>.
	template <class Tr>
	typename RegionBase<Tr>::element_iterator RegionBase<Tr>::element_end() {
	  return GraphTraits<RegionT *>::nodes_end(static_cast<RegionT *>(this));
	}

	/// Const begin iterator over the elements of this region, obtained from
	/// GraphTraits<const RegionT *>.
	template <class Tr>
	typename RegionBase<Tr>::const_element_iterator
	RegionBase<Tr>::element_begin() const {
	return GraphTraits<const RegionT *>::nodes_begin(
	static_cast<const RegionT *>(this));
	}

	/// Const end iterator over the elements of this region, obtained from
	/// GraphTraits<const RegionT *>.
	template <class Tr>
	typename RegionBase<Tr>::const_element_iterator
	RegionBase<Tr>::element_end() const {
	return GraphTraits<const RegionT *>::nodes_end(
	static_cast<const RegionT *>(this));
	}

	/// Return the subregion of this region that starts at \p BB, if any.
	///
	/// Starting from the region RegionInfo records for \p BB, climb towards
	/// this region; the result is returned only when its entry block is \p BB.
	/// Returns nullptr when \p BB maps to no region, maps to this region, or is
	/// not the entry of the subregion found.
	template <class Tr>
	typename Tr::RegionT *RegionBase<Tr>::getSubRegionNode(BlockT *BB) const {
	  using RegionT = typename Tr::RegionT;

	  RegionT *R = RI->getRegionFor(BB);

	  if (!R || R == this)
	    return nullptr;

	  // If we pass the BB out of this region, that means our code is broken.
	  assert(contains(R) && "BB not in current region!");

	  // Walk up until R's parent is this region (or is no longer contained).
	  while (contains(R->getParent()) && R->getParent() != this)
	    R = R->getParent();

	  if (R->getEntry() != BB)
	    return nullptr;

	  return R;
	}

	/// Return the RegionNode that represents block \p BB in this region.
	///
	/// Nodes are created on first request and cached in BBNodeMap, which is why
	/// this const member casts away constness of `this` when building a node.
	/// \p BB must be contained in the region (asserted).
	template <class Tr>
	typename Tr::RegionNodeT *RegionBase<Tr>::getBBNode(BlockT *BB) const {
	  assert(contains(BB) && "Can get BB node out of this region!");

	  typename BBNodeMapT::const_iterator at = BBNodeMap.find(BB);

	  if (at == BBNodeMap.end()) {
	    // Cache miss: build the node and remember it.
	    auto Deconst = const_cast<RegionBase<Tr> *>(this);
	    typename BBNodeMapT::value_type V = {
	        BB,
	        llvm::make_unique<RegionNodeT>(static_cast<RegionT *>(Deconst), BB)};
	    at = BBNodeMap.insert(std::move(V)).first;
	  }
	  return at->second.get();
	}

	/// Return the node for \p BB: the node of the subregion that starts at
	/// \p BB when there is one, otherwise the plain block node.
	///
	/// \p BB must be contained in the region (asserted).
	template <class Tr>
	typename Tr::RegionNodeT *RegionBase<Tr>::getNode(BlockT *BB) const {
	  assert(contains(BB) && "Can get BB node out of this region!");
	  if (RegionT *Child = getSubRegionNode(BB))
	    return Child->getNode();

	  return getBBNode(BB);
	}

	/// Move every child region of this region to \p To.
	///
	/// Each child gets \p To as its new parent and ownership is moved into
	/// To's child list; this region ends up with no children.
	template <class Tr>
	void RegionBase<Tr>::transferChildrenTo(RegionT *To) {
	for (std::unique_ptr<RegionT> &R : *this) {
	R->parent = To;
	To->children.push_back(std::move(R));
	}
	// Only moved-from (empty) owners remain; drop them.
	children.clear();
	}

	/// Add \p SubRegion as a child of this region and take ownership of it.
	///
	/// \p SubRegion must not have a parent yet and must not already be a child
	/// (both asserted). When \p moveChildren is true, \p SubRegion must have no
	/// children of its own (asserted); blocks of this region that fall inside
	/// \p SubRegion are re-mapped to it in RegionInfo, and existing children
	/// contained in \p SubRegion are re-parented under it.
	template <class Tr>
	void RegionBase<Tr>::addSubRegion(RegionT *SubRegion, bool moveChildren) {
	assert(!SubRegion->parent && "SubRegion already has a parent!");
	assert(llvm::find_if(*this,
	[&](const std::unique_ptr<RegionT> &R) {
	return R.get() == SubRegion;
	}) == children.end() &&
	"Subregion already exists!");

	SubRegion->parent = static_cast<RegionT *>(this);
	children.push_back(std::unique_ptr<RegionT>(SubRegion));

	if (!moveChildren)
	return;

	assert(SubRegion->children.empty() &&
	"SubRegions that contain children are not supported");

	// Re-map plain block elements that now belong to the new subregion.
	for (RegionNodeT *Element : elements()) {
	if (!Element->isSubRegion()) {
	BlockT *BB = Element->template getNodeAs<BlockT>();

	if (SubRegion->contains(BB))
	RI->setRegionFor(BB, SubRegion);
	}
	}

	// Split the current children: those inside SubRegion move under it, the
	// rest (including SubRegion itself) are kept.
	std::vector<std::unique_ptr<RegionT>> Keep;
	for (std::unique_ptr<RegionT> &R : *this) {
	if (SubRegion->contains(R.get()) && R.get() != SubRegion) {
	R->parent = SubRegion;
	SubRegion->children.push_back(std::move(R));
	} else
	Keep.push_back(std::move(R));
	}

	// Rebuild the child list from the kept regions.
	children.clear();
	children.insert(
	children.begin(),
	std::move_iterator<typename RegionSet::iterator>(Keep.begin()),
	std::move_iterator<typename RegionSet::iterator>(Keep.end()));
	}

	/// Detach \p Child from this region and hand it back to the caller.
	///
	/// \p Child must be a direct child of this region (asserted). Its parent
	/// link is cleared and it is removed from the child list without being
	/// destroyed.
	///
	/// \returns \p Child, which the caller now owns.
	template <class Tr>
	typename Tr::RegionT *RegionBase<Tr>::removeSubRegion(RegionT *Child) {
	  assert(Child->parent == this && "Child is not a child of this region!");
	  Child->parent = nullptr;
	  typename RegionSet::iterator I =
	      llvm::find_if(children, [&](const std::unique_ptr<RegionT> &R) {
	        return R.get() == Child;
	      });
	  assert(I != children.end() && "Region does not exist. Unable to remove.");
	  // The child list owns its regions through std::unique_ptr. Give up
	  // ownership before erasing the slot; otherwise erase() would delete the
	  // region and the pointer returned below would dangle.
	  I->release();
	  children.erase(children.begin() + (I - begin()));
	  return Child;
	}

	/// Return the nesting depth of this region: the number of parent links
	/// followed until a region without a parent is reached (0 for such a
	/// region itself).
	template <class Tr>
	unsigned RegionBase<Tr>::getDepth() const {
	unsigned Depth = 0;

	for (RegionT *R = getParent(); R != nullptr; R = R->getParent())
	++Depth;

	return Depth;
	}

	/// Try to build a larger region that starts at this region's entry and
	/// extends past its exit.
	///
	/// \returns a newly allocated region (not inserted anywhere; the caller
	/// receives the raw pointer from `new`), or nullptr when the exit has no
	/// successors or some predecessor of the exit is covered by neither this
	/// region nor the region that starts at the exit.
	template <class Tr>
	typename Tr::RegionT *RegionBase<Tr>::getExpandedRegion() const {
	  unsigned NumSuccessors = Tr::getNumSuccessors(exit);

	  if (NumSuccessors == 0)
	    return nullptr;

	  RegionT *R = RI->getRegionFor(exit);

	  // No region starts at the exit: expansion is only possible across a
	  // single-successor exit whose predecessors all lie in this region.
	  if (R->getEntry() != exit) {
	    for (BlockT *Pred : make_range(InvBlockTraits::child_begin(getExit()),
	                                   InvBlockTraits::child_end(getExit())))
	      if (!contains(Pred))
	        return nullptr;
	    if (Tr::getNumSuccessors(exit) == 1)
	      return new RegionT(getEntry(), *BlockTraits::child_begin(exit), RI, DT);
	    return nullptr;
	  }

	  // Take the largest region that starts at the exit.
	  while (R->getParent() && R->getParent()->getEntry() == exit)
	    R = R->getParent();

	  for (BlockT *Pred : make_range(InvBlockTraits::child_begin(getExit()),
	                                 InvBlockTraits::child_end(getExit()))) {
	    if (!(contains(Pred) || R->contains(Pred)))
	      return nullptr;
	  }

	  return new RegionT(getEntry(), R->getExit(), RI, DT);
	}

	/// Print this region to \p OS.
	///
	/// The name is indented by two spaces per \p level and, when \p print_tree
	/// is set, prefixed with "[level]"; in that mode child regions are printed
	/// recursively one level deeper. Unless \p Style is PrintNone, a braced
	/// list of the region's blocks (PrintBB) or elements (PrintRN) follows.
	template <class Tr>
	void RegionBase<Tr>::print(raw_ostream &OS, bool print_tree, unsigned level,
	PrintStyle Style) const {
	if (print_tree)
	OS.indent(level * 2) << '[' << level << "] " << getNameStr();
	else
	OS.indent(level * 2) << getNameStr();

	OS << '\n';

	if (Style != PrintNone) {
	OS.indent(level * 2) << "{\n";
	OS.indent(level * 2 + 2);

	if (Style == PrintBB) {
	for (const auto *BB : blocks())
	OS << BB->getName() << ", "; // TODO: remove the last ","
	} else if (Style == PrintRN) {
	for (const RegionNodeT *Element : elements()) {
	OS << *Element << ", "; // TODO: remove the last ","
	}
	}

	OS << '\n';
	}

	if (print_tree) {
	for (const std::unique_ptr<RegionT> &R : *this)
	R->print(OS, print_tree, level + 1, Style);
	}

	if (Style != PrintNone)
	OS.indent(level * 2) << "} \n";
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	template <class Tr>
	void RegionBase<Tr>::dump() const {
	print(dbgs(), true, getDepth(), RegionInfoBase<Tr>::printStyle);
	}
	#endif

	/// Clear the cached block nodes of this region and, recursively, of all
	/// its child regions.
	template <class Tr>
	void RegionBase<Tr>::clearNodeCache() {
	BBNodeMap.clear();
	for (std::unique_ptr<RegionT> &R : *this)
	R->clearNodeCache();
	}

	//===----------------------------------------------------------------------===//
	// RegionInfoBase implementation
	//

	/// Default constructor; compiler-generated.
	template <class Tr>
	RegionInfoBase<Tr>::RegionInfoBase() = default;

	/// Destructor: releases the region tree via releaseMemory().
	template <class Tr>
	RegionInfoBase<Tr>::~RegionInfoBase() {
	releaseMemory();
	}

	/// Check that the block-to-region map agrees with the nesting of \p R.
	///
	/// Subregion elements are checked recursively; for every plain block
	/// element, getRegionFor() must return \p R. The '-'/'+' lines are the
	/// change carried by this diff: a mismatch is reported with
	/// report_fatal_error instead of llvm_unreachable.
	template <class Tr>
	void RegionInfoBase<Tr>::verifyBBMap(const RegionT *R) const {
	assert(R && "Re must be non-null");
	for (const typename Tr::RegionNodeT *Element : R->elements()) {
	if (Element->isSubRegion()) {
	const RegionT *SR = Element->template getNodeAs<RegionT>();
	verifyBBMap(SR);
	} else {
	BlockT *BB = Element->template getNodeAs<BlockT>();
	if (getRegionFor(BB) != R)
	- llvm_unreachable("BB map does not match region nesting");
	+ report_fatal_error("BB map does not match region nesting");
	}
	}
	}

	/// Return false if some predecessor of \p BB is dominated by \p entry but
	/// not by \p exit; return true otherwise.
	template <class Tr>
	bool RegionInfoBase<Tr>::isCommonDomFrontier(BlockT *BB, BlockT *entry,
	                                             BlockT *exit) const {
	  for (BlockT *P : make_range(InvBlockTraits::child_begin(BB),
	                              InvBlockTraits::child_end(BB))) {
	    if (DT->dominates(entry, P) && !DT->dominates(exit, P))
	      return false;
	  }

	  return true;
	}

	/// Decide whether the pair (\p entry, \p exit) forms a region, using the
	/// dominance frontiers of both blocks.
	///
	/// Both blocks must be non-null (asserted).
	template <class Tr>
	bool RegionInfoBase<Tr>::isRegion(BlockT *entry, BlockT *exit) const {
	  assert(entry && exit && "entry and exit must not be null!");

	  using DST = typename DomFrontierT::DomSetType;

	  DST *entrySuccs = &DF->find(entry)->second;

	  // Exit is the header of a loop that contains the entry. In this case,
	  // the dominance frontier must only contain the exit.
	  if (!DT->dominates(entry, exit)) {
	    for (typename DST::iterator SI = entrySuccs->begin(),
	                                SE = entrySuccs->end();
	         SI != SE; ++SI) {
	      if (*SI != exit && *SI != entry)
	        return false;
	    }

	    return true;
	  }

	  DST *exitSuccs = &DF->find(exit)->second;

	  // Do not allow edges leaving the region.
	  for (BlockT *Succ : *entrySuccs) {
	    if (Succ == exit || Succ == entry)
	      continue;
	    if (exitSuccs->find(Succ) == exitSuccs->end())
	      return false;
	    if (!isCommonDomFrontier(Succ, entry, exit))
	      return false;
	  }

	  // Do not allow edges pointing into the region.
	  for (BlockT *Succ : *exitSuccs) {
	    if (DT->properlyDominates(entry, Succ) && Succ != exit)
	      return false;
	  }

	  return true;
	}

	/// Record in \p ShortCut that the search from \p entry may jump ahead.
	///
	/// If \p exit already has a shortcut, \p entry is mapped to that shortcut's
	/// target; otherwise \p entry is mapped to \p exit. Both blocks must be
	/// non-null (asserted).
	template <class Tr>
	void RegionInfoBase<Tr>::insertShortCut(BlockT *entry, BlockT *exit,
	                                        BBtoBBMap *ShortCut) const {
	  assert(entry && exit && "entry and exit must not be null!");

	  typename BBtoBBMap::iterator e = ShortCut->find(exit);

	  if (e == ShortCut->end())
	    // No further region at exit available.
	    (*ShortCut)[entry] = exit;
	  else {
	    // We found a region e that starts at exit. Therefore (entry, e->second)
	    // is also a region, that is larger than (entry, exit). Insert the
	    // larger one.
	    BlockT *BB = e->second;
	    (*ShortCut)[entry] = BB;
	  }
	}

	/// Return the next post-dominator tree node to visit after \p N.
	///
	/// When \p ShortCut has an entry for N's block, the immediate dominator of
	/// the shortcut target's post-dominator node is returned; otherwise the
	/// immediate dominator of \p N.
	template <class Tr>
	typename Tr::DomTreeNodeT *
	RegionInfoBase<Tr>::getNextPostDom(DomTreeNodeT *N, BBtoBBMap *ShortCut) const {
	  typename BBtoBBMap::iterator e = ShortCut->find(N->getBlock());

	  if (e == ShortCut->end())
	    return N->getIDom();

	  return PDT->getNode(e->second)->getIDom();
	}

	/// Return true when \p entry has at most one successor and that successor
	/// is \p exit.
	///
	/// Both blocks must be non-null (asserted).
	template <class Tr>
	bool RegionInfoBase<Tr>::isTrivialRegion(BlockT *entry, BlockT *exit) const {
	  assert(entry && exit && "entry and exit must not be null!");

	  unsigned num_successors =
	      BlockTraits::child_end(entry) - BlockTraits::child_begin(entry);

	  if (num_successors <= 1 && exit == *(BlockTraits::child_begin(entry)))
	    return true;

	  return false;
	}

	/// Create the region (\p entry, \p exit) and register it for \p entry.
	///
	/// \returns the newly allocated region, or nullptr when the pair forms a
	/// trivial region. Both blocks must be non-null (asserted).
	template <class Tr>
	typename Tr::RegionT *RegionInfoBase<Tr>::createRegion(BlockT *entry,
	                                                       BlockT *exit) {
	  assert(entry && exit && "entry and exit must not be null!");

	  if (isTrivialRegion(entry, exit))
	    return nullptr;

	  RegionT *region =
	      new RegionT(entry, exit, static_cast<RegionInfoT *>(this), DT);
	  BBtoRegion.insert({entry, region});

	  // Verify unconditionally with EXPENSIVE_CHECKS, otherwise only inside the
	  // DEBUG() macro.
	#ifdef EXPENSIVE_CHECKS
	  region->verifyRegion();
	#else
	  DEBUG(region->verifyRegion());
	#endif

	  updateStatistics(region);
	  return region;
	}

	/// Find all regions that start at \p entry and chain them together.
	///
	/// Candidate exits are taken by walking up the post-dominator tree (using
	/// \p ShortCut to skip ahead). Each region found becomes the parent of the
	/// previously found one. Finally a shortcut from \p entry to the last exit
	/// found is recorded in \p ShortCut.
	template <class Tr>
	void RegionInfoBase<Tr>::findRegionsWithEntry(BlockT *entry,
	BBtoBBMap *ShortCut) {
	assert(entry);

	DomTreeNodeT *N = PDT->getNode(entry);
	// No post-dominator tree node for the entry: nothing to search.
	if (!N)
	return;

	RegionT *lastRegion = nullptr;
	BlockT *lastExit = entry;

	// As only a BasicBlock that postdominates entry can finish a region, walk the
	// post dominance tree upwards.
	while ((N = getNextPostDom(N, ShortCut))) {
	BlockT *exit = N->getBlock();

	if (!exit)
	break;

	if (isRegion(entry, exit)) {
	RegionT *newRegion = createRegion(entry, exit);

	// Nest the smaller region found earlier inside the new one.
	if (lastRegion)
	newRegion->addSubRegion(lastRegion);

	lastRegion = newRegion;
	lastExit = exit;
	}

	// This can never be a region, so stop the search.
	if (!DT->dominates(entry, exit))
	break;
	}

	// Tried to create regions from entry to lastExit. Next time take a
	// shortcut from entry to lastExit.
	if (lastExit != entry)
	insertShortCut(entry, lastExit, ShortCut);
	}

	/// Run findRegionsWithEntry() for every block of \p F, visiting the
	/// dominator tree in post order starting from the function's entry node.
	template <class Tr>
	void RegionInfoBase<Tr>::scanForRegions(FuncT &F, BBtoBBMap *ShortCut) {
	using FuncPtrT = typename std::add_pointer<FuncT>::type;

	BlockT *entry = GraphTraits<FuncPtrT>::getEntryNode(&F);
	DomTreeNodeT *N = DT->getNode(entry);

	// Iterate over the dominance tree in post order to start with the small
	// regions from the bottom of the dominance tree. If the small regions are
	// detected first, detection of bigger regions is faster, as we can jump
	// over the small regions.
	for (auto DomNode : post_order(N))
	findRegionsWithEntry(DomNode->getBlock(), ShortCut);
	}

	/// Follow parent links from \p region and return the ancestor that has no
	/// parent (which is \p region itself when it has none).
	template <class Tr>
	typename Tr::RegionT *RegionInfoBase<Tr>::getTopMostParent(RegionT *region) {
	  while (region->getParent())
	    region = region->getParent();

	  return region;
	}

	/// Build the region tree below dominator-tree node \p N, with \p region
	/// as the innermost region known so far.
	///
	/// Blocks that start a region have that region's topmost ancestor attached
	/// under the current region; all other blocks are mapped to the current
	/// region. The dominator-tree children of \p N are then processed
	/// recursively.
	template <class Tr>
	void RegionInfoBase<Tr>::buildRegionsTree(DomTreeNodeT *N, RegionT *region) {
	  BlockT *BB = N->getBlock();

	  // Passed region exit
	  while (BB == region->getExit())
	    region = region->getParent();

	  typename BBtoRegionMap::iterator it = BBtoRegion.find(BB);

	  // This basic block is a start block of a region. It is already in the
	  // BBtoRegion relation. Only the child basic blocks have to be updated.
	  if (it != BBtoRegion.end()) {
	    RegionT *newRegion = it->second;
	    region->addSubRegion(getTopMostParent(newRegion));
	    region = newRegion;
	  } else {
	    BBtoRegion[BB] = region;
	  }

	  for (DomTreeNodeBase<BlockT> *C : *N) {
	    buildRegionsTree(C, region);
	  }
	}

	// Definition of the static VerifyRegionInfo flag: initialized to true when
	// EXPENSIVE_CHECKS is defined, false otherwise.
	#ifdef EXPENSIVE_CHECKS
	template <class Tr>
	bool RegionInfoBase<Tr>::VerifyRegionInfo = true;
	#else
	template <class Tr>
	bool RegionInfoBase<Tr>::VerifyRegionInfo = false;
	#endif

	// Definition of the static printStyle setting; initialized to PrintNone.
	template <class Tr>
	typename Tr::RegionT::PrintStyle RegionInfoBase<Tr>::printStyle =
	RegionBase<Tr>::PrintNone;

	/// Print the whole region tree to \p OS, starting at the top-level region
	/// and using the configured printStyle, framed by header and footer lines.
	template <class Tr>
	void RegionInfoBase<Tr>::print(raw_ostream &OS) const {
	OS << "Region tree:\n";
	TopLevelRegion->print(OS, true, 0, printStyle);
	OS << "End region tree\n";
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	template <class Tr>
	void RegionInfoBase<Tr>::dump() const { print(dbgs()); }
	#endif

	/// Drop all region information: clear the block-to-region map, delete the
	/// top-level region and reset the pointer to null.
	template <class Tr>
	void RegionInfoBase<Tr>::releaseMemory() {
	BBtoRegion.clear();
	if (TopLevelRegion)
	delete TopLevelRegion;
	TopLevelRegion = nullptr;
	}

	/// Verify the complete region tree and the block-to-region map.
	///
	/// Does nothing unless VerifyRegionInfo is set.
	template <class Tr>
	void RegionInfoBase<Tr>::verifyAnalysis() const {
	// Only verify regions if explicitly activated using EXPENSIVE_CHECKS or
	// -verify-region-info
	if (!RegionInfoBase<Tr>::VerifyRegionInfo)
	return;

	TopLevelRegion->verifyRegionNest();

	verifyBBMap(TopLevelRegion);
	}

	// Region pass manager support.

	/// Return the region recorded for \p BB in BBtoRegion, or nullptr when the
	/// block has no entry in the map.
	template <class Tr>
	typename Tr::RegionT *RegionInfoBase<Tr>::getRegionFor(BlockT *BB) const {
	  typename BBtoRegionMap::const_iterator I = BBtoRegion.find(BB);
	  return I != BBtoRegion.end() ? I->second : nullptr;
	}

	/// Record \p R as the region of \p BB, replacing any previous entry.
	template <class Tr>
	void RegionInfoBase<Tr>::setRegionFor(BlockT *BB, RegionT *R) {
	  BBtoRegion[BB] = R;
	}

	/// Subscript shorthand for getRegionFor(\p BB).
	template <class Tr>
	typename Tr::RegionT *RegionInfoBase<Tr>::operator[](BlockT *BB) const {
	  return getRegionFor(BB);
	}

	/// Starting at \p BB, repeatedly step to the exit of the largest region
	/// whose entry is the current block (or, when no such region exists, to the
	/// block's only successor) and return the last exit reached.
	///
	/// Returns nullptr when the very first step finds neither a region entered
	/// at \p BB nor a single successor.
	template <class Tr>
	typename RegionInfoBase<Tr>::BlockT *
	RegionInfoBase<Tr>::getMaxRegionExit(BlockT *BB) const {
	BlockT *Exit = nullptr;

	while (true) {
	// Get largest region that starts at BB.
	RegionT *R = getRegionFor(BB);
	while (R && R->getParent() && R->getParent()->getEntry() == BB)
	R = R->getParent();

	// Get the single exit of BB.
	if (R && R->getEntry() == BB)
	Exit = R->getExit();
	// Otherwise accept BB's successor if advancing once from the first child
	// already reaches the end of the child range.
	// NOTE(review): this assumes BB has at least one child -- confirm.
	else if (++BlockTraits::child_begin(BB) == BlockTraits::child_end(BB))
	Exit = *BlockTraits::child_begin(BB);
	else // No single exit exists.
	return Exit;

	// Get largest region that starts at Exit.
	RegionT *ExitR = getRegionFor(Exit);
	while (ExitR && ExitR->getParent() &&
	ExitR->getParent()->getEntry() == Exit)
	ExitR = ExitR->getParent();

	// NOTE(review): the `break` below only leaves this for-loop and nothing
	// after the loop depends on it, so as written the loop does not influence
	// the result. R and ExitR are dereferenced without a null check although
	// both lookups above may yield nullptr -- confirm the intent.
	for (BlockT *Pred : make_range(InvBlockTraits::child_begin(Exit),
	InvBlockTraits::child_end(Exit))) {
	if (!R->contains(Pred) && !ExitR->contains(Pred))
	break;
	}

	// This stops infinite cycles.
	if (DT->dominates(Exit, BB))
	break;

	// Continue the walk from the exit just found.
	BB = Exit;
	}

	return Exit;
	}

	/// Return the smallest region that contains both \p A and \p B.
	///
	/// If \p A already contains \p B, \p A is returned; otherwise the parent
	/// chain of \p B is climbed until a region containing \p A is found.
	/// Both arguments must be non-null.
	template <class Tr>
	typename Tr::RegionT *RegionInfoBase<Tr>::getCommonRegion(RegionT *A,
	                                                          RegionT *B) const {
	  assert(A && B && "One of the Regions is NULL");

	  if (A->contains(B))
	    return A;

	  while (!B->contains(A))
	    B = B->getParent();

	  return B;
	}

	/// Return the common region of all regions in \p Regions.
	///
	/// The last element is removed from \p Regions and used as the starting
	/// point, so the caller's vector is one element shorter afterwards.
	template <class Tr>
	typename Tr::RegionT *
	RegionInfoBase<Tr>::getCommonRegion(SmallVectorImpl<RegionT *> &Regions) const {
	  RegionT *Common = Regions.back();
	  Regions.pop_back();

	  // Fold the remaining regions into the running common region.
	  for (auto It = Regions.begin(), End = Regions.end(); It != End; ++It)
	    Common = getCommonRegion(Common, *It);

	  return Common;
	}

	/// Return the common region of the regions that the blocks in \p BBs map
	/// to.
	///
	/// The last block is removed from \p BBs and its region is used as the
	/// starting point, so the caller's vector is one element shorter afterwards.
	template <class Tr>
	typename Tr::RegionT *
	RegionInfoBase<Tr>::getCommonRegion(SmallVectorImpl<BlockT *> &BBs) const {
	  RegionT *Common = getRegionFor(BBs.back());
	  BBs.pop_back();

	  // Fold the region of every remaining block into the running result.
	  for (auto It = BBs.begin(), End = BBs.end(); It != End; ++It) {
	    RegionT *BlockRegion = getRegionFor(*It);
	    Common = getCommonRegion(Common, BlockRegion);
	  }

	  return Common;
	}

	/// Compute the region information for \p F: scan for regions, then build
	/// the region tree starting at the dominator-tree node of the entry block.
	template <class Tr>
	void RegionInfoBase<Tr>::calculate(FuncT &F) {
	  using FuncPtrT = typename std::add_pointer<FuncT>::type;

	  // ShortCut stores, for every BB, the exit of the largest region starting
	  // at that BB. Such regions can be treated as single BBs, which improves
	  // performance on linear CFGs.
	  BBtoBBMap ShortCut;
	  scanForRegions(F, &ShortCut);

	  BlockT *EntryBB = GraphTraits<FuncPtrT>::getEntryNode(&F);
	  buildRegionsTree(DT->getNode(EntryBB), TopLevelRegion);
	}

	} // end namespace llvm

	#undef DEBUG_TYPE

	#endif // LLVM_ANALYSIS_REGIONINFOIMPL_H
	Index: vendor/llvm/dist-release_60/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
	===================================================================
	--- vendor/llvm/dist-release_60/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h (revision 328361)
	+++ vendor/llvm/dist-release_60/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h (revision 328362)
	@@ -1,64 +1,64 @@
	//===- SelectionDAGAddressAnalysis.h - DAG Address Analysis -----*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
	#define LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H

	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include <cstdint>

	namespace llvm {

	class SelectionDAG;

	/// Helper struct to parse and store a memory address as base + index + offset.
	/// We ignore sign extensions when it is safe to do so.
	/// The following two expressions are not equivalent. To differentiate we need
	/// to store whether there was a sign extension involved in the index
	/// computation.
	/// (load (i64 add (i64 copyfromreg %c)
	/// (i64 signextend (add (i8 load %index)
	/// (i8 1))))
	/// vs
	///
	/// (load (i64 add (i64 copyfromreg %c)
	/// (i64 signextend (i32 add (i32 signextend (i8 load %index))
	/// (i32 1)))))
	class BaseIndexOffset {
	private:
	// Decomposed address components: base and index values plus a constant
	// offset. A default-constructed object has offset 0 and no sign extension.
	SDValue Base;
	SDValue Index;
	int64_t Offset = 0;
	// Records whether a sign extension was involved in the index computation.
	bool IsIndexSignExt = false;

	public:
	BaseIndexOffset() = default;
	// Member-wise constructor; stores the four components as given.
	BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset,
	bool IsIndexSignExt)
	: Base(Base), Index(Index), Offset(Offset),
	IsIndexSignExt(IsIndexSignExt) {}

	// Accessors returning copies of the stored base and index values.
	SDValue getBase() { return Base; }
	SDValue getIndex() { return Index; }

	// Convenience overload: forwards to the three-argument form and discards
	// the value written to its Off output parameter.
	bool equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG) {
	int64_t Off;
	return equalBaseIndex(Other, DAG, Off);
	}

	// Defined out of line. Presumably writes the offset difference between the
	// two addresses to Off when base and index match -- confirm against the
	// definition.
	bool equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG,
	int64_t &Off);

	/// Parses tree in Ptr for base, index, offset addresses.
	- static BaseIndexOffset match(SDValue Ptr, const SelectionDAG &DAG);
	+ static BaseIndexOffset match(LSBaseSDNode *N, const SelectionDAG &DAG);
	};

	} // end namespace llvm

	#endif // LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
	Index: vendor/llvm/dist-release_60/include/llvm/MC/MCCodeView.h
	===================================================================
	--- vendor/llvm/dist-release_60/include/llvm/MC/MCCodeView.h (revision 328361)
	+++ vendor/llvm/dist-release_60/include/llvm/MC/MCCodeView.h (revision 328362)
	@@ -1,335 +1,301 @@
	//===- MCCodeView.h - Machine Code CodeView support -------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Holds state from .cv_file and .cv_loc directives for later emission.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_MC_MCCODEVIEW_H
	#define LLVM_MC_MCCODEVIEW_H

	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/MC/MCFragment.h"
	#include "llvm/MC/MCObjectStreamer.h"
	#include <map>
	#include <vector>

	namespace llvm {
	class MCContext;
	class MCObjectStreamer;
	class MCStreamer;
	class CodeViewContext;

	/// \brief Instances of this class represent the information from a
	/// .cv_loc directive.
	class MCCVLoc {
	// Location fields. Column is stored in 16 bits; PrologueEnd and IsStmt are
	// one-bit flags.
	uint32_t FunctionId;
	uint32_t FileNum;
	uint32_t Line;
	uint16_t Column;
	uint16_t PrologueEnd : 1;
	uint16_t IsStmt : 1;

	private: // CodeViewContext manages these
	friend class CodeViewContext;
	// NOTE(review): unlike setColumn(), this constructor does not assert that
	// `column` fits in 16 bits before storing it.
	MCCVLoc(unsigned functionid, unsigned fileNum, unsigned line, unsigned column,
	bool prologueend, bool isstmt)
	: FunctionId(functionid), FileNum(fileNum), Line(line), Column(column),
	PrologueEnd(prologueend), IsStmt(isstmt) {}

	// Allow the default copy constructor and assignment operator to be used
	// for an MCCVLoc object.

	public:
	/// \brief Get the FunctionId of this MCCVLoc.
	unsigned getFunctionId() const { return FunctionId; }

	/// \brief Get the FileNum of this MCCVLoc.
	unsigned getFileNum() const { return FileNum; }

	/// \brief Get the Line of this MCCVLoc.
	unsigned getLine() const { return Line; }

	/// \brief Get the Column of this MCCVLoc.
	unsigned getColumn() const { return Column; }

	/// \brief Get the PrologueEnd and IsStmt flags of this MCCVLoc.
	bool isPrologueEnd() const { return PrologueEnd; }
	bool isStmt() const { return IsStmt; }

	/// \brief Set the FunctionId of this MCCVLoc.
	void setFunctionId(unsigned FID) { FunctionId = FID; }

	/// \brief Set the FileNum of this MCCVLoc.
	void setFileNum(unsigned fileNum) { FileNum = fileNum; }

	/// \brief Set the Line of this MCCVLoc.
	void setLine(unsigned line) { Line = line; }

	/// \brief Set the Column of this MCCVLoc.
	/// Asserts that the value fits in the 16-bit Column field.
	void setColumn(unsigned column) {
	assert(column <= UINT16_MAX);
	Column = column;
	}

	/// \brief Set the PrologueEnd and IsStmt flags of this MCCVLoc.
	void setPrologueEnd(bool PE) { PrologueEnd = PE; }
	void setIsStmt(bool IS) { IsStmt = IS; }
	};

	/// \brief Instances of this class represent the line information for
	/// the CodeView line table entries. Which is created after a machine
	/// instruction is assembled and uses an address from a temporary label
	/// created at the current address in the current section and the info from
	/// the last .cv_loc directive seen as stored in the context.
	class MCCVLineEntry : public MCCVLoc {
	// Symbol this line entry is attached to.
	const MCSymbol *Label;

	private:
	// Allow the default copy constructor and assignment operator to be used
	// for an MCCVLineEntry object.

	public:
	// Constructor to create an MCCVLineEntry given a symbol and an MCCVLoc; the
	// location is copied into the MCCVLoc base.
	MCCVLineEntry(const MCSymbol *Label, const MCCVLoc loc)
	: MCCVLoc(loc), Label(Label) {}

	// Returns the symbol passed at construction.
	const MCSymbol *getLabel() const { return Label; }

	// This is called when an instruction is assembled into the specified
	// section. If there is information from the last .cv_loc directive that
	// has yet to have a line entry made for it, one is made.
	static void Make(MCObjectStreamer *MCOS);
	};

	/// Information describing a function or inlined call site introduced by
	/// .cv_func_id or .cv_inline_site_id. Accumulates information from .cv_loc
	/// directives used with this function's id or the id of an inlined call site
	/// within this function or inlined call site.
	struct MCCVFunctionInfo {
	/// If this represents an inlined call site, then ParentFuncIdPlusOne will be
	/// the parent function id plus one. If this represents a normal function,
	/// then there is no parent, and ParentFuncIdPlusOne will be FunctionSentinel.
	/// If this struct is an unallocated slot in the function info vector, then
	/// ParentFuncIdPlusOne will be zero.
	unsigned ParentFuncIdPlusOne = 0;

	/// Marker value of ParentFuncIdPlusOne for a normal (non-inlined) function.
	enum : unsigned { FunctionSentinel = ~0U };

	/// A source position: file, line and column.
	struct LineInfo {
	unsigned File;
	unsigned Line;
	unsigned Col;
	};

	/// NOTE(review): not given an initializer here; presumably only meaningful
	/// for inlined call sites -- confirm against the code that fills it in.
	LineInfo InlinedAt;

	/// The section of the first .cv_loc directive used for this function, or null
	/// if none has been seen yet.
	MCSection *Section = nullptr;

	/// Map from inlined call site id to the inlined at location to use for that
	/// call site. Call chains are collapsed, so for the call chain 'f -> g -> h',
	/// the InlinedAtMap of 'f' will contain entries for 'g' and 'h' that both
	/// list the line info for the 'g' call site.
	DenseMap<unsigned, LineInfo> InlinedAtMap;

	/// Returns true if this function info has not yet been used in a
	/// .cv_func_id or .cv_inline_site_id directive.
	bool isUnallocatedFunctionInfo() const { return ParentFuncIdPlusOne == 0; }

	/// Returns true if this represents an inlined call site, meaning
	/// ParentFuncIdPlusOne is neither zero nor ~0U.
	bool isInlinedCallSite() const {
	return !isUnallocatedFunctionInfo() &&
	ParentFuncIdPlusOne != FunctionSentinel;
	}

	/// Returns the id of the parent function. Only valid for inlined call
	/// sites, which is asserted.
	unsigned getParentFuncId() const {
	assert(isInlinedCallSite());
	return ParentFuncIdPlusOne - 1;
	}
	};

	/// Holds state from .cv_file and .cv_loc directives for later emission.
	class CodeViewContext {
	public:
	CodeViewContext();
	~CodeViewContext();

	bool isValidFileNumber(unsigned FileNumber) const;
	bool addFile(MCStreamer &OS, unsigned FileNumber, StringRef Filename,
	ArrayRef<uint8_t> ChecksumBytes, uint8_t ChecksumKind);

	/// Records the function id of a normal function. Returns false if the
	/// function id has already been used, and true otherwise.
	bool recordFunctionId(unsigned FuncId);

	/// Records the function id of an inlined call site. Records the "inlined at"
	/// location info of the call site, including what function or inlined call
	/// site it was inlined into. Returns false if the function id has already
	/// been used, and true otherwise.
	bool recordInlinedCallSiteId(unsigned FuncId, unsigned IAFunc,
	unsigned IAFile, unsigned IALine,
	unsigned IACol);

	/// Retrieve the function info if this is a valid function id, or nullptr.
	- MCCVFunctionInfo *getCVFunctionInfo(unsigned FuncId) {
	- if (FuncId >= Functions.size())
	- return nullptr;
	- if (Functions[FuncId].isUnallocatedFunctionInfo())
	- return nullptr;
	- return &Functions[FuncId];
	- }
	+ MCCVFunctionInfo *getCVFunctionInfo(unsigned FuncId);

	/// Saves the information from the currently parsed .cv_loc directive
	/// and sets CVLocSeen. When the next instruction is assembled an entry
	/// in the line number table with this information and the address of the
	/// instruction will be created.
	///
	/// Each field is written through the matching MCCVLoc setter, so \p Column
	/// is subject to setColumn()'s assertion that it fits in 16 bits.
	void setCurrentCVLoc(unsigned FunctionId, unsigned FileNo, unsigned Line,
	unsigned Column, bool PrologueEnd, bool IsStmt) {
	CurrentCVLoc.setFunctionId(FunctionId);
	CurrentCVLoc.setFileNum(FileNo);
	CurrentCVLoc.setLine(Line);
	CurrentCVLoc.setColumn(Column);
	CurrentCVLoc.setPrologueEnd(PrologueEnd);
	CurrentCVLoc.setIsStmt(IsStmt);
	// Mark that a .cv_loc is pending.
	CVLocSeen = true;
	}
	- void clearCVLocSeen() { CVLocSeen = false; }

	bool getCVLocSeen() { return CVLocSeen; }
	+ void clearCVLocSeen() { CVLocSeen = false; }
	+
	const MCCVLoc &getCurrentCVLoc() { return CurrentCVLoc; }

	bool isValidCVFileNumber(unsigned FileNumber);

	/// \brief Add a line entry.
	- void addLineEntry(const MCCVLineEntry &LineEntry) {
	- size_t Offset = MCCVLines.size();
	- auto I = MCCVLineStartStop.insert(
	- {LineEntry.getFunctionId(), {Offset, Offset + 1}});
	- if (!I.second)
	- I.first->second.second = Offset + 1;
	- MCCVLines.push_back(LineEntry);
	- }
	+ void addLineEntry(const MCCVLineEntry &LineEntry);

	- std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId) {
	- std::vector<MCCVLineEntry> FilteredLines;
	+ std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId);

	- auto I = MCCVLineStartStop.find(FuncId);
	- if (I != MCCVLineStartStop.end())
	- for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
	- ++Idx)
	- if (MCCVLines[Idx].getFunctionId() == FuncId)
	- FilteredLines.push_back(MCCVLines[Idx]);
	- return FilteredLines;
	- }
	+ std::pair<size_t, size_t> getLineExtent(unsigned FuncId);

	- std::pair<size_t, size_t> getLineExtent(unsigned FuncId) {
	- auto I = MCCVLineStartStop.find(FuncId);
	- // Return an empty extent if there are no cv_locs for this function id.
	- if (I == MCCVLineStartStop.end())
	- return {~0ULL, 0};
	- return I->second;
	- }
	-
	- ArrayRef<MCCVLineEntry> getLinesForExtent(size_t L, size_t R) {
	- if (R <= L)
	- return None;
	- if (L >= MCCVLines.size())
	- return None;
	- return makeArrayRef(&MCCVLines[L], R - L);
	- }
	+ ArrayRef<MCCVLineEntry> getLinesForExtent(size_t L, size_t R);

	/// Emits a line table substream.
	void emitLineTableForFunction(MCObjectStreamer &OS, unsigned FuncId,
	const MCSymbol *FuncBegin,
	const MCSymbol *FuncEnd);

	void emitInlineLineTableForFunction(MCObjectStreamer &OS,
	unsigned PrimaryFunctionId,
	unsigned SourceFileId,
	unsigned SourceLineNum,
	const MCSymbol *FnStartSym,
	const MCSymbol *FnEndSym);

	/// Encodes the binary annotations once we have a layout.
	void encodeInlineLineTable(MCAsmLayout &Layout,
	MCCVInlineLineTableFragment &F);

	/// Declared here, defined out of line.
	/// NOTE(review): presumably each pair in \p Ranges holds the begin and end
	/// symbol of one range -- confirm against the definition.
	void
	emitDefRange(MCObjectStreamer &OS,
	             ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
	             StringRef FixedSizePortion);

	void encodeDefRange(MCAsmLayout &Layout, MCCVDefRangeFragment &F);

	/// Emits the string table substream.
	void emitStringTable(MCObjectStreamer &OS);

	/// Emits the file checksum substream.
	void emitFileChecksums(MCObjectStreamer &OS);

	/// Emits the offset into the checksum table of the given file number.
	void emitFileChecksumOffset(MCObjectStreamer &OS, unsigned FileNo);

	/// Add something to the string table. Returns the final string as well as
	/// offset into the string table.
	std::pair<StringRef, unsigned> addToStringTable(StringRef S);

	private:
	/// The current CodeView line information from the last .cv_loc directive.
	MCCVLoc CurrentCVLoc = MCCVLoc(0, 0, 0, 0, false, true);
	bool CVLocSeen = false;

	/// Map from string to string table offset.
	StringMap<unsigned> StringTable;

	/// The fragment that ultimately holds our strings.
	MCDataFragment *StrTabFragment = nullptr;
	bool InsertedStrTabFragment = false;

	MCDataFragment *getStringTableFragment();

	/// Get a string table offset.
	unsigned getStringTableOffset(StringRef S);

	// Per-file record kept in the Files array.
	// NOTE(review): only Assigned has a default initializer; the other scalar
	// members are left uninitialized until set -- confirm callers always check
	// Assigned before reading them.
	struct FileInfo {
	// Offset of the file name in the string table.
	unsigned StringTableOffset;

	// Indicates if this FileInfo corresponds to an actual file, or hasn't been
	// set yet.
	bool Assigned = false;

	// Kind of checksum stored in Checksum.
	uint8_t ChecksumKind;

	// Checksum bytes. ArrayRef does not own its storage, so the bytes must
	// outlive this record.
	ArrayRef<uint8_t> Checksum;

	// Checksum offset stored as a symbol because it might be requested
	// before it has been calculated, so a fixup may be needed.
	MCSymbol *ChecksumTableOffset;
	};

	/// Array storing added file information.
	SmallVector<FileInfo, 4> Files;

	/// The offset of the first and last .cv_loc directive for a given function
	/// id.
	std::map<unsigned, std::pair<size_t, size_t>> MCCVLineStartStop;

	/// A collection of MCCVLineEntry for each section.
	std::vector<MCCVLineEntry> MCCVLines;

	/// All known functions and inlined call sites, indexed by function id.
	std::vector<MCCVFunctionInfo> Functions;

	/// Indicate whether we have already laid out the checksum table addresses or
	/// not.
	bool ChecksumOffsetsAssigned = false;
	};

	} // end namespace llvm
	#endif
	Index: vendor/llvm/dist-release_60/include/llvm/Support/GenericDomTreeConstruction.h
	===================================================================
	--- vendor/llvm/dist-release_60/include/llvm/Support/GenericDomTreeConstruction.h (revision 328361)
	+++ vendor/llvm/dist-release_60/include/llvm/Support/GenericDomTreeConstruction.h (revision 328362)
	@@ -1,1610 +1,1630 @@
	//===- GenericDomTreeConstruction.h - Dominator Calculation ------*- C++ -*-==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	///
	/// Generic dominator tree construction - This file provides routines to
	/// construct immediate dominator information for a flow-graph based on the
	/// Semi-NCA algorithm described in this dissertation:
	///
	/// Linear-Time Algorithms for Dominators and Related Problems
	/// Loukas Georgiadis, Princeton University, November 2005, pp. 21-23:
	/// ftp://ftp.cs.princeton.edu/reports/2005/737.pdf
	///
	/// This implements the O(n*log(n)) versions of EVAL and LINK, because it turns
	/// out that the theoretically slower O(n*log(n)) implementation is actually
	/// faster than the almost-linear O(n*alpha(n)) version, even for large CFGs.
	///
	/// The file uses the Depth Based Search algorithm to perform incremental
	/// updates (insertion and deletions). The implemented algorithm is based on
	/// this publication:
	///
	/// An Experimental Study of Dynamic Dominators
	/// Loukas Georgiadis, et al., April 12 2016, pp. 5-7, 9-10:
	/// https://arxiv.org/pdf/1604.02711.pdf
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
	#define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H

	#include <queue>
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/GenericDomTree.h"

	#define DEBUG_TYPE "dom-tree-builder"

	namespace llvm {
	namespace DomTreeBuilder {

	template <typename DomTreeT>
	struct SemiNCAInfo {
	using NodePtr = typename DomTreeT::NodePtr;
	using NodeT = typename DomTreeT::NodeType;
	using TreeNodePtr = DomTreeNodeBase<NodeT> *;
	using RootsT = decltype(DomTreeT::Roots);
	static constexpr bool IsPostDom = DomTreeT::IsPostDominator;

	// Information record used by Semi-NCA during tree construction.
	struct InfoRec {
	// DFS number assigned by runDFS; 0 means the node has not been visited.
	unsigned DFSNum = 0;
	// DFS number of the node's parent in the DFS spanning tree.
	unsigned Parent = 0;
	// Starts as the node's own DFS number; runSemiNCA lowers it to the
	// semidominator number.
	unsigned Semi = 0;
	// Starts as the node itself; updated by eval() during path compression.
	NodePtr Label = nullptr;
	// Immediate dominator candidate, finalized by runSemiNCA.
	NodePtr IDom = nullptr;
	// Nodes from which runDFS saw an edge into this node.
	SmallVector<NodePtr, 2> ReverseChildren;
	};

	// Number to node mapping is 1-based. Initialize the mapping to start with
	// a dummy element.
	std::vector<NodePtr> NumToNode = {nullptr};
	DenseMap<NodePtr, InfoRec> NodeToInfo;

	using UpdateT = typename DomTreeT::UpdateType;
	// State shared across one batch of CFG updates.
	struct BatchUpdateInfo {
	// The updates belonging to this batch.
	SmallVector<UpdateT, 4> Updates;
	// A node pointer tagged with one bit holding the update kind.
	using NodePtrAndKind = PointerIntPair<NodePtr, 1, UpdateKind>;

	// In order to be able to walk a CFG that is out of sync with the CFG
	// DominatorTree last knew about, use the list of updates to reconstruct
	// previous CFG versions of the current CFG. For each node, we store a set
	// of its virtually added/deleted future successors and predecessors.
	// Note that these children are from the future relative to what the
	// DominatorTree knows about -- using them gets us some snapshot of the
	// CFG from the past (relative to the state of the CFG).
	DenseMap<NodePtr, SmallDenseSet<NodePtrAndKind, 4>> FutureSuccessors;
	DenseMap<NodePtr, SmallDenseSet<NodePtrAndKind, 4>> FuturePredecessors;
	// Remembers if the whole tree was recalculated at some point during the
	// current batch update.
	bool IsRecalculated = false;
	};

	BatchUpdateInfo *BatchUpdates;
	using BatchUpdatePtr = BatchUpdateInfo *;

	// If BUI is a nullptr, then there's no batch update in progress.
	SemiNCAInfo(BatchUpdatePtr BUI) : BatchUpdates(BUI) {}

	// Reset the per-run DFS bookkeeping. The BatchUpdateInfo pointer is left
	// untouched on purpose: if an update is in progress, that information is
	// still needed to continue it.
	void clear() {
	  NodeToInfo.clear();
	  NumToNode = {nullptr}; // Back to the initial state: only the dummy node.
	}

	// Returns the children of a node as a small vector. With Inverse == false
	// the graph's children are returned in reversed order; with Inverse == true
	// the inverse children are returned. The BatchUpdatePtr overload further
	// adjusts the result for pending batch updates.
	template <bool Inverse>
	struct ChildrenGetter {
	using ResultTy = SmallVector<NodePtr, 8>;

	// Forward direction: children of N, in reverse iteration order.
	static ResultTy Get(NodePtr N, std::integral_constant<bool, false>) {
	auto RChildren = reverse(children<NodePtr>(N));
	return ResultTy(RChildren.begin(), RChildren.end());
	}

	// Inverse direction: inverse children of N.
	static ResultTy Get(NodePtr N, std::integral_constant<bool, true>) {
	auto IChildren = inverse_children<NodePtr>(N);
	return ResultTy(IChildren.begin(), IChildren.end());
	}

	// Tag type that selects one of the two overloads above.
	using Tag = std::integral_constant<bool, Inverse>;

	// The function below is the core part of the batch updater. It allows the
	// Depth Based Search algorithm to perform incremental updates in lockstep
	// with updates to the CFG. We emulated lockstep CFG updates by getting its
	// next snapshots by reverse-applying future updates.
	static ResultTy Get(NodePtr N, BatchUpdatePtr BUI) {
	ResultTy Res = Get(N, Tag());
	// If there's no batch update in progress, simply return node's children.
	if (!BUI) return Res;

	// CFG children are actually its most current children, and we have to
	// reverse-apply the future updates to get the node's children at the
	// point in time the update was performed.
	auto &FutureChildren = (Inverse != IsPostDom) ? BUI->FuturePredecessors
	: BUI->FutureSuccessors;
	auto FCIt = FutureChildren.find(N);
	// No pending updates touch N: the current children are already right.
	if (FCIt == FutureChildren.end()) return Res;

	for (auto ChildAndKind : FCIt->second) {
	const NodePtr Child = ChildAndKind.getPointer();
	const UpdateKind UK = ChildAndKind.getInt();

	// Reverse-apply the future update.
	if (UK == UpdateKind::Insert) {
	// If there's an insertion in the future, it means that the edge must
	// exist in the current CFG, but was not present in it before.
	assert(llvm::find(Res, Child) != Res.end()
	&& "Expected child not found in the CFG");
	Res.erase(std::remove(Res.begin(), Res.end(), Child), Res.end());
	DEBUG(dbgs() << "\tHiding edge " << BlockNamePrinter(N) << " -> "
	<< BlockNamePrinter(Child) << "\n");
	} else {
	// If there's a deletion in the future, it means that the edge cannot
	// exist in the current CFG, but existed in it before.
	assert(llvm::find(Res, Child) == Res.end() &&
	"Unexpected child found in the CFG");
	DEBUG(dbgs() << "\tShowing virtual edge " << BlockNamePrinter(N)
	<< " -> " << BlockNamePrinter(Child) << "\n");
	Res.push_back(Child);
	}
	}

	return Res;
	}
	};

	// Look up the immediate dominator recorded for BB; yields nullptr when BB
	// has no entry in NodeToInfo.
	NodePtr getIDom(NodePtr BB) const {
	  const auto Found = NodeToInfo.find(BB);
	  if (Found != NodeToInfo.end())
	    return Found->second.IDom;

	  return nullptr;
	}

	// Return the tree node for BB, creating it if DT does not have one yet.
	// Creation recurses on BB's immediate dominator first, so the new node can
	// be linked as a child of its dominator's node.
	TreeNodePtr getNodeForBlock(NodePtr BB, DomTreeT &DT) {
	  if (TreeNodePtr Node = DT.getNode(BB)) return Node;

	  // Haven't calculated this node yet? Get or calculate the node for the
	  // immediate dominator.
	  NodePtr IDom = getIDom(BB);

	  // A null immediate dominator is only acceptable when the tree has a node
	  // registered for nullptr.
	  assert(IDom || DT.DomTreeNodes[nullptr]);
	  TreeNodePtr IDomNode = getNodeForBlock(IDom, DT);

	  // Add a new tree node for this NodeT, and link it as a child of
	  // IDomNode
	  return (DT.DomTreeNodes[BB] = IDomNode->addChild(
	              llvm::make_unique<DomTreeNodeBase<NodeT>>(BB, IDomNode)))
	      .get();
	}

	// Descend predicate for runDFS that accepts every edge.
	static bool AlwaysDescend(NodePtr, NodePtr) { return true; }

	// Stream helper for debug output: prints a block through printAsOperand(),
	// or the text "nullptr" when there is no block.
	struct BlockNamePrinter {
	  NodePtr N;

	  BlockNamePrinter(NodePtr Block) : N(Block) {}
	  // A null tree node is treated like a null block.
	  BlockNamePrinter(TreeNodePtr TN) : N(TN ? TN->getBlock() : nullptr) {}

	  friend raw_ostream &operator<<(raw_ostream &O, const BlockNamePrinter &BP) {
	    if (BP.N) {
	      BP.N->printAsOperand(O, false);
	      return O;
	    }

	    O << "nullptr";
	    return O;
	  }
	};

	// Custom DFS implementation which can skip nodes based on a provided
	// predicate. It also collects ReverseChildren so that we don't have to spend
	// time getting predecessors in SemiNCA.
	//
	// If IsReverse is set to true, the DFS walk will be performed backwards
	// relative to IsPostDom -- using reverse edges for dominators and forward
	// edges for postdominators.
	//
	// V is the start node and must be non-null. LastNum is the last DFS number
	// handed out so far; numbering continues from LastNum + 1 and the last
	// number assigned is returned. Condition(BB, Succ) decides whether the walk
	// may descend along the edge BB -> Succ. AttachToNum is stored as V's
	// Parent, but only if V already has an entry in NodeToInfo.
	template <bool IsReverse = false, typename DescendCondition>
	unsigned runDFS(NodePtr V, unsigned LastNum, DescendCondition Condition,
	unsigned AttachToNum) {
	assert(V);
	SmallVector<NodePtr, 64> WorkList = {V};
	if (NodeToInfo.count(V) != 0) NodeToInfo[V].Parent = AttachToNum;

	while (!WorkList.empty()) {
	const NodePtr BB = WorkList.pop_back_val();
	auto &BBInfo = NodeToInfo[BB];

	// Visited nodes always have positive DFS numbers.
	if (BBInfo.DFSNum != 0) continue;
	BBInfo.DFSNum = BBInfo.Semi = ++LastNum;
	BBInfo.Label = BB;
	NumToNode.push_back(BB);

	constexpr bool Direction = IsReverse != IsPostDom; // XOR.
	for (const NodePtr Succ :
	ChildrenGetter<Direction>::Get(BB, BatchUpdates)) {
	const auto SIT = NodeToInfo.find(Succ);
	// Don't visit nodes more than once but remember to collect
	// ReverseChildren.
	if (SIT != NodeToInfo.end() && SIT->second.DFSNum != 0) {
	// Self-loops are not recorded as reverse children.
	if (Succ != BB) SIT->second.ReverseChildren.push_back(BB);
	continue;
	}

	if (!Condition(BB, Succ)) continue;

	// It's fine to add Succ to the map, because we know that it will be
	// visited later.
	auto &SuccInfo = NodeToInfo[Succ];
	WorkList.push_back(Succ);
	// LastNum is BB's DFS number at this point, so BB becomes Succ's parent
	// (a later push of Succ from another node overwrites this).
	SuccInfo.Parent = LastNum;
	SuccInfo.ReverseChildren.push_back(BB);
	}
	}

	return LastNum;
	}

	// Returns VIn itself when its DFS number is below LastLinked. Otherwise
	// compresses the Parent chain of VIn for all links whose Parent number is
	// at least LastLinked -- propagating the Label with the smaller Semi value
	// down the chain -- and returns VIn's resulting Label.
	// The walk uses an explicit work stack rather than recursion.
	NodePtr eval(NodePtr VIn, unsigned LastLinked) {
	auto &VInInfo = NodeToInfo[VIn];
	if (VInInfo.DFSNum < LastLinked)
	return VIn;

	SmallVector<NodePtr, 32> Work;
	SmallPtrSet<NodePtr, 32> Visited;

	if (VInInfo.Parent >= LastLinked)
	Work.push_back(VIn);

	while (!Work.empty()) {
	NodePtr V = Work.back();
	auto &VInfo = NodeToInfo[V];
	NodePtr VAncestor = NumToNode[VInfo.Parent];

	// Process Ancestor first
	if (Visited.insert(VAncestor).second && VInfo.Parent >= LastLinked) {
	Work.push_back(VAncestor);
	continue;
	}
	Work.pop_back();

	// Update VInfo based on Ancestor info
	if (VInfo.Parent < LastLinked)
	continue;

	auto &VAInfo = NodeToInfo[VAncestor];
	NodePtr VAncestorLabel = VAInfo.Label;
	NodePtr VLabel = VInfo.Label;
	// Keep whichever label has the smaller semidominator number.
	if (NodeToInfo[VAncestorLabel].Semi < NodeToInfo[VLabel].Semi)
	VInfo.Label = VAncestorLabel;
	// Path compression: skip over the ancestor from now on.
	VInfo.Parent = VAInfo.Parent;
	}

	return VInInfo.Label;
	}

	// This function requires DFS to be run before calling it.
	//
	// Computes semidominators and then immediate dominators for every node
	// numbered in NumToNode, storing the results in NodeToInfo. Predecessors
	// that already have a tree node in DT with a level below MinLevel are
	// ignored when computing semidominators.
	void runSemiNCA(DomTreeT &DT, const unsigned MinLevel = 0) {
	const unsigned NextDFSNum(NumToNode.size());
	// Initialize IDoms to spanning tree parents.
	for (unsigned i = 1; i < NextDFSNum; ++i) {
	const NodePtr V = NumToNode[i];
	auto &VInfo = NodeToInfo[V];
	VInfo.IDom = NumToNode[VInfo.Parent];
	}

	// Step #1: Calculate the semidominators of all vertices.
	// Vertices are processed in decreasing DFS order, stopping before number 1.
	for (unsigned i = NextDFSNum - 1; i >= 2; --i) {
	NodePtr W = NumToNode[i];
	auto &WInfo = NodeToInfo[W];

	// Initialize the semi dominator to point to the parent node.
	WInfo.Semi = WInfo.Parent;
	for (const auto &N : WInfo.ReverseChildren) {
	if (NodeToInfo.count(N) == 0) // Skip unreachable predecessors.
	continue;

	const TreeNodePtr TN = DT.getNode(N);
	// Skip predecessors whose level is above the subtree we are processing.
	if (TN && TN->getLevel() < MinLevel)
	continue;

	unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi;
	if (SemiU < WInfo.Semi) WInfo.Semi = SemiU;
	}
	}

	// Step #2: Explicitly define the immediate dominator of each vertex.
	// IDom[i] = NCA(SDom[i], SpanningTreeParent(i)).
	// Note that the parents were stored in IDoms and later got invalidated
	// during path compression in Eval.
	for (unsigned i = 2; i < NextDFSNum; ++i) {
	const NodePtr W = NumToNode[i];
	auto &WInfo = NodeToInfo[W];
	const unsigned SDomNum = NodeToInfo[NumToNode[WInfo.Semi]].DFSNum;
	NodePtr WIDomCandidate = WInfo.IDom;
	// Climb the IDom chain until reaching a node numbered no higher than the
	// semidominator.
	while (NodeToInfo[WIDomCandidate].DFSNum > SDomNum)
	WIDomCandidate = NodeToInfo[WIDomCandidate].IDom;

	WInfo.IDom = WIDomCandidate;
	}
	}

	// PostDominatorTree always has a virtual root that represents a virtual CFG
	// node that serves as a single exit from the function. All the other exits
	// (CFG nodes with terminators and nodes in infinite loops) are logically
	// connected to this virtual CFG exit node.
	// This function maps a nullptr CFG node to the virtual root tree node.
	//
	// Must be called on a freshly constructed SemiNCAInfo (only the dummy
	// element in NumToNode) and only for postdominators; both are asserted.
	void addVirtualRoot() {
	assert(IsPostDom && "Only postdominators have a virtual root");
	assert(NumToNode.size() == 1 && "SNCAInfo must be freshly constructed");

	// The virtual root gets DFS number 1 and is keyed by nullptr.
	auto &BBInfo = NodeToInfo[nullptr];
	BBInfo.DFSNum = BBInfo.Semi = 1;
	BBInfo.Label = nullptr;

	NumToNode.push_back(nullptr); // NumToNode[1] = nullptr;
	}

	// For postdominators, nodes with no forward successors are trivial roots that
	// are always selected as tree roots. Roots with forward successors correspond
	// to CFG nodes within infinite loops.
	//
	// Returns true when N has at least one forward child, taking any pending
	// batch updates in BUI into account.
	static bool HasForwardSuccessors(const NodePtr N, BatchUpdatePtr BUI) {
	  assert(N && "N must be a valid node");
	  const auto ForwardChildren = ChildrenGetter<false>::Get(N, BUI);
	  return !ForwardChildren.empty();
	}

	// Returns the entry node of the graph that DT was built for. DT.Parent must
	// be set.
	static NodePtr GetEntryNode(const DomTreeT &DT) {
	  assert(DT.Parent && "Parent not set");
	  using ParentTraits = GraphTraits<typename DomTreeT::ParentPtr>;
	  return ParentTraits::getEntryNode(DT.Parent);
	}

	// Finds all roots without relying on the set of roots already stored in the
	// tree.
	// We define roots to be some non-redundant set of the CFG nodes.
	static RootsT FindRoots(const DomTreeT &DT, BatchUpdatePtr BUI) {
	assert(DT.Parent && "Parent pointer is not set");
	RootsT Roots;

	// For dominators, function entry CFG node is always a tree root node.
	if (!IsPostDom) {
	Roots.push_back(GetEntryNode(DT));
	return Roots;
	}

	SemiNCAInfo SNCA(BUI);

	// PostDominatorTree always has a virtual root.
	SNCA.addVirtualRoot();
	unsigned Num = 1;

	DEBUG(dbgs() << "\t\tLooking for trivial roots\n");

	// Step #1: Find all the trivial roots that will definitely remain tree
	// roots.
	unsigned Total = 0;
	// It may happen that there are some new nodes in the CFG that are result of
	// the ongoing batch update, but we cannot really pretend that they don't
	// exist -- we won't see any outgoing or incoming edges to them, so it's
	// fine to discover them here, as they would end up appearing in the CFG at
	// some point anyway.
	for (const NodePtr N : nodes(DT.Parent)) {
	++Total;
	// If it has no successors, it is definitely a root.
	if (!HasForwardSuccessors(N, BUI)) {
	Roots.push_back(N);
	// Run DFS not to walk this part of CFG later.
	Num = SNCA.runDFS(N, Num, AlwaysDescend, 1);
	DEBUG(dbgs() << "Found a new trivial root: " << BlockNamePrinter(N)
	<< "\n");
	DEBUG(dbgs() << "Last visited node: "
	<< BlockNamePrinter(SNCA.NumToNode[Num]) << "\n");
	}
	}

	DEBUG(dbgs() << "\t\tLooking for non-trivial roots\n");

	// Step #2: Find all non-trivial root candidates. Those are CFG nodes that
	// are reverse-unreachable and were not visited by previous DFS walks (i.e.
	// CFG nodes in infinite loops).
	bool HasNonTrivialRoots = false;
	// Accounting for the virtual exit, see if we had any reverse-unreachable
	// nodes.
	if (Total + 1 != Num) {
	HasNonTrivialRoots = true;
	// Make another DFS pass over all other nodes to find the
	// reverse-unreachable blocks, and find the furthest paths we'll be able
	// to make.
	// Note that this looks N^2, but it's really 2N worst case, if every node
	// is unreachable. This is because we are still going to only visit each
	// unreachable node once, we may just visit it in two directions,
	// depending on how lucky we get.
	SmallPtrSet<NodePtr, 4> ConnectToExitBlock;
	for (const NodePtr I : nodes(DT.Parent)) {
	if (SNCA.NodeToInfo.count(I) == 0) {
	DEBUG(dbgs() << "\t\t\tVisiting node " << BlockNamePrinter(I)
	<< "\n");
	// Find the furthest away we can get by following successors, then
	// follow them in reverse. This gives us some reasonable answer about
	// the post-dom tree inside any infinite loop. In particular, it
	// guarantees we get to the farthest away point along some
	// path. This also matches the GCC's behavior.
	// If we really wanted a totally complete picture of dominance inside
	// this infinite loop, we could do it with SCC-like algorithms to find
	// the lowest and highest points in the infinite loop. In theory, it
	// would be nice to give the canonical backedge for the loop, but it's
	// expensive and does not always lead to a minimal set of roots.
	DEBUG(dbgs() << "\t\t\tRunning forward DFS\n");

	const unsigned NewNum = SNCA.runDFS<true>(I, Num, AlwaysDescend, Num);
	const NodePtr FurthestAway = SNCA.NumToNode[NewNum];
	DEBUG(dbgs() << "\t\t\tFound a new furthest away node "
	<< "(non-trivial root): "
	<< BlockNamePrinter(FurthestAway) << "\n");
	ConnectToExitBlock.insert(FurthestAway);
	Roots.push_back(FurthestAway);
	DEBUG(dbgs() << "\t\t\tPrev DFSNum: " << Num << ", new DFSNum: "
	<< NewNum << "\n\t\t\tRemoving DFS info\n");
	for (unsigned i = NewNum; i > Num; --i) {
	const NodePtr N = SNCA.NumToNode[i];
	DEBUG(dbgs() << "\t\t\t\tRemoving DFS info for "
	<< BlockNamePrinter(N) << "\n");
	SNCA.NodeToInfo.erase(N);
	SNCA.NumToNode.pop_back();
	}
	const unsigned PrevNum = Num;
	DEBUG(dbgs() << "\t\t\tRunning reverse DFS\n");
	Num = SNCA.runDFS(FurthestAway, Num, AlwaysDescend, 1);
	for (unsigned i = PrevNum + 1; i <= Num; ++i)
	DEBUG(dbgs() << "\t\t\t\tfound node "
	<< BlockNamePrinter(SNCA.NumToNode[i]) << "\n");
	}
	}
	}

	DEBUG(dbgs() << "Total: " << Total << ", Num: " << Num << "\n");
	DEBUG(dbgs() << "Discovered CFG nodes:\n");
	DEBUG(for (size_t i = 0; i <= Num; ++i) dbgs()
	<< i << ": " << BlockNamePrinter(SNCA.NumToNode[i]) << "\n");

	assert((Total + 1 == Num) && "Everything should have been visited");

	// Step #3: If we found some non-trivial roots, make them non-redundant.
	if (HasNonTrivialRoots) RemoveRedundantRoots(DT, BUI, Roots);

	DEBUG(dbgs() << "Found roots: ");
	DEBUG(for (auto *Root : Roots) dbgs() << BlockNamePrinter(Root) << " ");
	DEBUG(dbgs() << "\n");

	return Roots;
	}

	// This function only makes sense for postdominators.
	// We define roots to be some set of CFG nodes where (reverse) DFS walks have
	// to start in order to visit all the CFG nodes (including the
	// reverse-unreachable ones).
	// When the search for non-trivial roots is done it may happen that some of
	// the non-trivial roots are reverse-reachable from other non-trivial roots,
	// which makes them redundant. This function removes them from the set of
	// input roots.
	//
	// Roots is modified in place. A removed root is swapped with the last
	// element before being popped, so the order of the remaining roots is not
	// preserved.
	static void RemoveRedundantRoots(const DomTreeT &DT, BatchUpdatePtr BUI,
	RootsT &Roots) {
	assert(IsPostDom && "This function is for postdominators only");
	DEBUG(dbgs() << "Removing redundant roots\n");

	SemiNCAInfo SNCA(BUI);

	for (unsigned i = 0; i < Roots.size(); ++i) {
	auto &Root = Roots[i];
	// Trivial roots are always non-redundant.
	if (!HasForwardSuccessors(Root, BUI)) continue;
	DEBUG(dbgs() << "\tChecking if " << BlockNamePrinter(Root)
	<< " remains a root\n");
	// Each candidate gets a fresh walk; discard the state of the previous one.
	SNCA.clear();
	// Do a forward walk looking for the other roots.
	const unsigned Num = SNCA.runDFS<true>(Root, 0, AlwaysDescend, 0);
	// Skip the start node and begin from the second one (note that DFS uses
	// 1-based indexing).
	for (unsigned x = 2; x <= Num; ++x) {
	const NodePtr N = SNCA.NumToNode[x];
	// If we found another root in a (forward) DFS walk, remove the current
	// root from the set of roots, as it is reverse-reachable from the other
	// one.
	if (llvm::find(Roots, N) != Roots.end()) {
	DEBUG(dbgs() << "\tForward DFS walk found another root "
	<< BlockNamePrinter(N) << "\n\tRemoving root "
	<< BlockNamePrinter(Root) << "\n");
	std::swap(Root, Roots.back());
	Roots.pop_back();

	// Root at the back takes the current root's place.
	// Start the next loop iteration with the same index.
	// (For i == 0 the unsigned value wraps around here and the loop's ++i
	// brings it back to 0.)
	--i;
	break;
	}
	}
	}
	}

	template <typename DescendCondition>
	void doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) {
	if (!IsPostDom) {
	assert(DT.Roots.size() == 1 && "Dominators should have a singe root");
	runDFS(DT.Roots[0], 0, DC, 0);
	return;
	}

	addVirtualRoot();
	unsigned Num = 1;
	for (const NodePtr Root : DT.Roots) Num = runDFS(Root, Num, DC, 0);
	}

	// Rebuilds DT from nothing. Roots and the DFS numbering are computed with a
	// null batch-update pointer; BUI, when non-null, is only flagged as
	// recalculated.
	static void CalculateFromScratch(DomTreeT &DT, BatchUpdatePtr BUI) {
	  // Parent is saved before reset() and written back afterwards.
	  auto *SavedParent = DT.Parent;
	  DT.reset();
	  DT.Parent = SavedParent;

	  // Since we are rebuilding the whole tree, there's no point doing it
	  // incrementally.
	  SemiNCAInfo SNCA(nullptr);

	  // Step #0: Number blocks in depth-first order and initialize variables
	  // used in later stages of the algorithm.
	  DT.Roots = FindRoots(DT, nullptr);
	  SNCA.doFullDFSWalk(DT, AlwaysDescend);

	  SNCA.runSemiNCA(DT);
	  if (BUI) {
	    BUI->IsRecalculated = true;
	    DEBUG(dbgs() << "DomTree recalculated, skipping future batch updates\n");
	  }

	  if (DT.Roots.empty())
	    return;

	  // Add a node for the root. If the tree is a PostDominatorTree it will be
	  // the virtual exit (denoted by (BasicBlock *) nullptr) which postdominates
	  // all real exits (including multiple exit blocks, infinite loops).
	  NodePtr RootBlock = IsPostDom ? nullptr : DT.Roots[0];

	  auto RootOwner =
	      llvm::make_unique<DomTreeNodeBase<NodeT>>(RootBlock, nullptr);
	  DT.RootNode = RootOwner.get();
	  DT.DomTreeNodes[RootBlock] = std::move(RootOwner);
	  SNCA.attachNewSubtree(DT, DT.RootNode);
	}

	// Creates tree nodes in DT for every block numbered by the preceding DFS
	// (indices 1 and up in NumToNode) that does not have one yet. The first
	// numbered block gets AttachTo's block as its immediate dominator; every
	// new node is linked as a child of the node of its immediate dominator.
	void attachNewSubtree(DomTreeT& DT, const TreeNodePtr AttachTo) {
	  // Attach the first unreachable block to AttachTo.
	  NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock();

	  // Loop over all of the discovered blocks in the function...
	  for (size_t Idx = 1, End = NumToNode.size(); Idx != End; ++Idx) {
	    NodePtr Block = NumToNode[Idx];
	    DEBUG(dbgs() << "\tdiscovered a new reachable node "
	                 << BlockNamePrinter(Block) << "\n");

	    // Don't replace this with 'count', the insertion side effect is
	    // important. A non-null entry means the node already exists.
	    if (DT.DomTreeNodes[Block])
	      continue;

	    // Get or calculate the node for the immediate dominator.
	    NodePtr IDomBlock = getIDom(Block);
	    TreeNodePtr IDomTN = getNodeForBlock(IDomBlock, DT);

	    // Add a new tree node for this block, and link it as a child of IDomTN.
	    DT.DomTreeNodes[Block] = IDomTN->addChild(
	        llvm::make_unique<DomTreeNodeBase<NodeT>>(Block, IDomTN));
	  }
	}

	// Re-links tree nodes that already exist in DT according to the immediate
	// dominators recorded in NodeToInfo. The first numbered block gets AttachTo's
	// block as its immediate dominator. Every block numbered by the preceding
	// DFS is expected to have a tree node.
	void reattachExistingSubtree(DomTreeT &DT, const TreeNodePtr AttachTo) {
	  const NodePtr SubtreeTop = NumToNode[1];
	  NodeToInfo[SubtreeTop].IDom = AttachTo->getBlock();

	  const size_t NumNumbered = NumToNode.size();
	  for (size_t Idx = 1; Idx != NumNumbered; ++Idx) {
	    const NodePtr Block = NumToNode[Idx];
	    const TreeNodePtr Node = DT.getNode(Block);
	    assert(Node);
	    Node->setIDom(DT.getNode(NodeToInfo[Block].IDom));
	  }
	}

	// Helper struct used during edge insertions.
	// It carries the work list (Bucket), the bookkeeping sets and the result
	// queues shared by InsertReachable, VisitInsertion and UpdateInsertion.
	struct InsertionInfo {
	// Bucket entries pair a tree level with the tree node found at that level.
	using BucketElementTy = std::pair<unsigned, TreeNodePtr>;
	// Comparator for Bucket; it looks only at the level stored in each entry.
	// NOTE(review): std::priority_queue::top() returns the element that orders
	// last under the comparator, which with this '>' comparison is the entry
	// with the smallest level -- confirm this matches the "descending order"
	// intent stated on Bucket below.
	struct DecreasingLevel {
	bool operator()(const BucketElementTy &First,
	const BucketElementTy &Second) const {
	return First.first > Second.first;
	}
	};

	std::priority_queue<BucketElementTy, SmallVector<BucketElementTy, 8>,
	DecreasingLevel>
	Bucket; // Queue of tree nodes sorted by level in descending order.
	// Nodes already marked as affected (used to avoid queueing them twice).
	SmallDenseSet<TreeNodePtr, 8> Affected;
	// Visited nodes. The '+' side of this change maps each node to the level
	// that was recorded when the node was inserted.
	- SmallDenseSet<TreeNodePtr, 8> Visited;
	+ SmallDenseMap<TreeNodePtr, unsigned, 8> Visited;
	// Nodes popped from Bucket; UpdateInsertion gives them a new IDom.
	SmallVector<TreeNodePtr, 8> AffectedQueue;
	// Nodes visited but not affected; their levels are refreshed afterwards.
	SmallVector<TreeNodePtr, 8> VisitedNotAffectedQueue;
	};

	// Inserts the edge From -> To into DT.
	// A null From is accepted only for postdominators (see the first assert).
	// If From has no tree node, forward dominators ignore the edge, while
	// postdominators first create a node for From as a child of the virtual
	// root and record From in DT.Roots. The insertion is then dispatched on
	// whether To already has a tree node.
	static void InsertEdge(DomTreeT &DT, const BatchUpdatePtr BUI,
	const NodePtr From, const NodePtr To) {
	assert((From \|\| IsPostDom) &&
	"From has to be a valid CFG node or a virtual root");
	assert(To && "Cannot be a nullptr");
	DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> "
	<< BlockNamePrinter(To) << "\n");
	TreeNodePtr FromTN = DT.getNode(From);

	if (!FromTN) {
	// Ignore edges from unreachable nodes for (forward) dominators.
	if (!IsPostDom) return;

	// The unreachable node becomes a new root -- a tree node for it.
	TreeNodePtr VirtualRoot = DT.getNode(nullptr);
	FromTN =
	(DT.DomTreeNodes[From] = VirtualRoot->addChild(
	llvm::make_unique<DomTreeNodeBase<NodeT>>(From, VirtualRoot)))
	.get();
	DT.Roots.push_back(From);
	}

	// The tree is about to change, so cached DFS info is marked stale.
	DT.DFSInfoValid = false;

	const TreeNodePtr ToTN = DT.getNode(To);
	if (!ToTN)
	InsertUnreachable(DT, BUI, FromTN, To);
	else
	InsertReachable(DT, BUI, FromTN, ToTN);
	}

	// Postdominators only. Checks whether To -- the destination of an edge that
	// is about to be inserted -- is currently one of the stored roots. If so,
	// the whole tree is recalculated and true is returned; otherwise nothing is
	// changed and false is returned.
	static bool UpdateRootsBeforeInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
	                                       const TreeNodePtr From,
	                                       const TreeNodePtr To) {
	  assert(IsPostDom && "This function is only for postdominators");

	  // A node that is not attached to the virtual root cannot be a root, so the
	  // lookup in DT.Roots is only done for children of the virtual root.
	  const bool IsStoredRoot =
	      DT.isVirtualRoot(To->getIDom()) &&
	      llvm::find(DT.Roots, To->getBlock()) != DT.Roots.end();
	  if (!IsStoredRoot)
	    return false;

	  DEBUG(dbgs() << "\t\tAfter the insertion, " << BlockNamePrinter(To)
	               << " is no longer a root\n\t\tRebuilding the tree!!!\n");

	  CalculateFromScratch(DT, BUI);
	  return true;
	}

	// Updates the set of roots after insertion or deletion. This ensures that
	// the roots after a series of updates are the same as they would be if the
	// tree were built from scratch.
	// Postdominators only. May trigger a full recalculation of DT.
	static void UpdateRootsAfterUpdate(DomTreeT &DT, const BatchUpdatePtr BUI) {
	assert(IsPostDom && "This function is only for postdominators");

	// The tree has only trivial roots -- nothing to update.
	if (std::none_of(DT.Roots.begin(), DT.Roots.end(), [BUI](const NodePtr N) {
	return HasForwardSuccessors(N, BUI);
	}))
	return;

	// Recalculate the set of roots.
	DT.Roots = FindRoots(DT, BUI);
	for (const NodePtr R : DT.Roots) {
	const TreeNodePtr TN = DT.getNode(R);
	// A CFG node was selected as a tree root, but the corresponding tree node
	// is not connected to the virtual root. This is because the incremental
	// algorithm does not really know or use the set of roots and can make a
	// different (implicit) decision about which nodes within an infinite loop
	// becomes a root.
	// The '+' side of this change also tolerates a root without a tree node
	// and negates the virtual-root test so that it matches the comment above.
	- if (DT.isVirtualRoot(TN->getIDom())) {
	+ if (TN && !DT.isVirtualRoot(TN->getIDom())) {
	DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
	<< " is not virtual root's child\n"
	<< "The entire tree needs to be rebuilt\n");
	// It should be possible to rotate the subtree instead of recalculating
	// the whole tree, but this situation happens extremely rarely in
	// practice.
	CalculateFromScratch(DT, BUI);
	return;
	}
	}
	}

	// Handles insertion to a node already in the dominator tree.
	// From and To are the tree nodes of the inserted edge's endpoints. The
	// function returns early when the postdominator roots check rebuilt the
	// tree or when the nearest common dominator shows that nothing changes;
	// otherwise it collects affected nodes through a level-ordered bucket and
	// lets UpdateInsertion rewrite immediate dominators and levels.
	static void InsertReachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr From, const TreeNodePtr To) {
	DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock())
	<< " -> " << BlockNamePrinter(To->getBlock()) << "\n");
	if (IsPostDom && UpdateRootsBeforeInsertion(DT, BUI, From, To)) return;
	// DT.findNCD expects both pointers to be valid. When From is a virtual
	// root, then its CFG block pointer is a nullptr, so we have to 'compute'
	// the NCD manually.
	const NodePtr NCDBlock =
	(From->getBlock() && To->getBlock())
	? DT.findNearestCommonDominator(From->getBlock(), To->getBlock())
	: nullptr;
	assert(NCDBlock \|\| DT.isPostDominator());
	const TreeNodePtr NCD = DT.getNode(NCDBlock);
	assert(NCD);

	DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n");
	const TreeNodePtr ToIDom = To->getIDom();

	// Nothing affected -- NCA property holds.
	// (Based on the lemma 2.5 from the second paper.)
	if (NCD == To \|\| NCD == ToIDom) return;

	// Identify and collect affected nodes.
	InsertionInfo II;
	DEBUG(dbgs() << "Marking " << BlockNamePrinter(To) << " as affected\n");
	II.Affected.insert(To);
	const unsigned ToLevel = To->getLevel();
	DEBUG(dbgs() << "Putting " << BlockNamePrinter(To) << " into a Bucket\n");
	II.Bucket.push({ToLevel, To});

	// VisitInsertion may push further nodes into the bucket, so the loop runs
	// until the bucket drains.
	while (!II.Bucket.empty()) {
	const TreeNodePtr CurrentNode = II.Bucket.top().second;
	+ const unsigned CurrentLevel = CurrentNode->getLevel();
	II.Bucket.pop();
	DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: "
	<< BlockNamePrinter(CurrentNode) << "\n");
	- II.Visited.insert(CurrentNode);
	+
	+ II.Visited.insert({CurrentNode, CurrentLevel});
	II.AffectedQueue.push_back(CurrentNode);

	// Discover and collect affected successors of the current node.
	- VisitInsertion(DT, BUI, CurrentNode, CurrentNode->getLevel(), NCD, II);
	+ VisitInsertion(DT, BUI, CurrentNode, CurrentLevel, NCD, II);
	}

	// Finish by updating immediate dominators and levels.
	UpdateInsertion(DT, BUI, NCD, II);
	}

	// Visits an affected node and collect its affected successors.
	// TN is the affected node, RootLevel the level it was popped from the bucket
	// with, and NCD the nearest common dominator of the inserted edge. The walk
	// is an explicit-stack traversal over CFG successors: successors deeper than
	// RootLevel are recorded as visited-but-not-affected and explored further,
	// while shallower ones that lie more than one level below NCD are marked
	// affected and pushed into II.Bucket.
	static void VisitInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr TN, const unsigned RootLevel,
	const TreeNodePtr NCD, InsertionInfo &II) {
	const unsigned NCDLevel = NCD->getLevel();
	- DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << "\n");
	+ DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << ", RootLevel "
	+ << RootLevel << "\n");

	SmallVector<TreeNodePtr, 8> Stack = {TN};
	assert(TN->getBlock() && II.Visited.count(TN) && "Preconditions!");

	+ SmallPtrSet<TreeNodePtr, 8> Processed;
	+
	do {
	TreeNodePtr Next = Stack.pop_back_val();
	+ DEBUG(dbgs() << " Next: " << BlockNamePrinter(Next) << "\n");

	for (const NodePtr Succ :
	ChildrenGetter<IsPostDom>::Get(Next->getBlock(), BUI)) {
	const TreeNodePtr SuccTN = DT.getNode(Succ);
	assert(SuccTN && "Unreachable successor found at reachable insertion");
	const unsigned SuccLevel = SuccTN->getLevel();

	- DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ)
	- << ", level = " << SuccLevel << "\n");
	+ DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) << ", level = "
	+ << SuccLevel << "\n");

	// NOTE(review): the check below depends only on Next, not on Succ, so it
	// gives the same answer for every successor of one Next -- confirm that
	// testing it inside the successor loop is intended.
	+ // Do not process the same node multiple times.
	+ if (Processed.count(Next) > 0)
	+ continue;
	+
	// Succ dominated by subtree From -- not affected.
	// (Based on the lemma 2.5 from the second paper.)
	if (SuccLevel > RootLevel) {
	DEBUG(dbgs() << "\t\tDominated by subtree From\n");
	- if (II.Visited.count(SuccTN) != 0)
	- continue;
	+ if (II.Visited.count(SuccTN) != 0) {
	+ DEBUG(dbgs() << "\t\t\talready visited at level "
	+ << II.Visited[SuccTN] << "\n\t\t\tcurrent level "
	+ << RootLevel << ")\n");

	+ // A node can be necessary to visit again if we see it again at
	+ // a lower level than before.
	+ if (II.Visited[SuccTN] >= RootLevel)
	+ continue;
	+ }
	+
	DEBUG(dbgs() << "\t\tMarking visited not affected "
	<< BlockNamePrinter(Succ) << "\n");
	// NOTE(review): if insert() follows the usual map semantics of leaving an
	// existing key untouched, the level stored for a revisited node is not
	// updated here -- confirm.
	- II.Visited.insert(SuccTN);
	+ II.Visited.insert({SuccTN, RootLevel});
	II.VisitedNotAffectedQueue.push_back(SuccTN);
	Stack.push_back(SuccTN);
	} else if ((SuccLevel > NCDLevel + 1) &&
	II.Affected.count(SuccTN) == 0) {
	DEBUG(dbgs() << "\t\tMarking affected and adding "
	<< BlockNamePrinter(Succ) << " to a Bucket\n");
	II.Affected.insert(SuccTN);
	II.Bucket.push({SuccLevel, SuccTN});
	}
	}
	+
	+ Processed.insert(Next);
	} while (!Stack.empty());
	}

	// Final step of an insertion: every node collected in II.AffectedQueue gets
	// NCD as its new immediate dominator, then the levels of the visited but
	// unaffected nodes are refreshed. For postdominators the set of roots is
	// re-checked afterwards.
	static void UpdateInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
	                            const TreeNodePtr NCD, InsertionInfo &II) {
	  DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n");

	  for (const TreeNodePtr AffectedTN : II.AffectedQueue) {
	    DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(AffectedTN)
	                 << ") = " << BlockNamePrinter(NCD) << "\n");
	    AffectedTN->setIDom(NCD);
	  }

	  UpdateLevelsAfterInsertion(II);

	  if (!IsPostDom)
	    return;
	  UpdateRootsAfterUpdate(DT, BUI);
	}

	// Calls UpdateLevel() on every node in II.VisitedNotAffectedQueue, in queue
	// order.
	static void UpdateLevelsAfterInsertion(InsertionInfo &II) {
	  DEBUG(dbgs() << "Updating levels for visited but not affected nodes\n");

	  for (const TreeNodePtr Unaffected : II.VisitedNotAffectedQueue) {
	    DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(Unaffected) << ") = ("
	                 << BlockNamePrinter(Unaffected->getIDom()) << ") "
	                 << Unaffected->getIDom()->getLevel() << " + 1\n");
	    Unaffected->UpdateLevel();
	  }
	}

	// Handles insertion of an edge whose destination To has no tree node yet.
	// First the newly reachable region is attached under From; then every edge
	// found leading from that region to an already reachable node is inserted
	// as a regular reachable edge.
	static void InsertUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	                              const TreeNodePtr From, const NodePtr To) {
	  DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From)
	               << " -> (unreachable) " << BlockNamePrinter(To) << "\n");

	  // Edges (source block, destination tree node) into reachable nodes that
	  // are discovered while connecting the new region.
	  using ConnectingEdge = std::pair<NodePtr, TreeNodePtr>;
	  SmallVector<ConnectingEdge, 8> ConnectingEdges;
	  ComputeUnreachableDominators(DT, BUI, To, From, ConnectingEdges);

	  DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From)
	               << " -> (prev unreachable) " << BlockNamePrinter(To) << "\n");

	  // Use the discovered edges and insert the connecting (incoming) edges.
	  for (const ConnectingEdge &E : ConnectingEdges) {
	    DEBUG(dbgs() << "\tInserting discovered connecting edge "
	                 << BlockNamePrinter(E.first) << " -> "
	                 << BlockNamePrinter(E.second) << "\n");
	    InsertReachable(DT, BUI, DT.getNode(E.first), E.second);
	  }
	}

	// Connects nodes that become reachable with an insertion.
	// Starting at Root (which must not have a tree node), a DFS descends only
	// into nodes without a tree node, Semi-NCA is run on what was discovered and
	// the result is attached under Incoming. Edges that hit a node which already
	// has a tree node are appended to DiscoveredConnectingEdges.
	static void ComputeUnreachableDominators(
	    DomTreeT &DT, const BatchUpdatePtr BUI, const NodePtr Root,
	    const TreeNodePtr Incoming,
	    SmallVectorImpl<std::pair<NodePtr, TreeNodePtr>>
	        &DiscoveredConnectingEdges) {
	  assert(!DT.getNode(Root) && "Root must not be reachable");

	  // Descend condition: keep walking through nodes that have no tree node;
	  // stop at nodes that have one and remember the edge that reached them.
	  auto DescendIfUnreachable = [&DT, &DiscoveredConnectingEdges](NodePtr Src,
	                                                                NodePtr Dst) {
	    if (const TreeNodePtr DstTN = DT.getNode(Dst)) {
	      DiscoveredConnectingEdges.push_back({Src, DstTN});
	      return false;
	    }
	    return true;
	  };

	  SemiNCAInfo Info(BUI);
	  Info.runDFS(Root, 0, DescendIfUnreachable, 0);
	  Info.runSemiNCA(DT);
	  Info.attachNewSubtree(DT, Incoming);

	  DEBUG(dbgs() << "After adding unreachable nodes\n");
	}

	// Removes the edge From -> To from DT. Both endpoints must be non-null, and
	// in debug builds the edge must already be gone from the CFG.
	// Nothing happens when either endpoint has no tree node. Otherwise the
	// deletion is dispatched to DeleteReachable or DeleteUnreachable unless To
	// is the nearest common dominator of From and To.
	// On the '+' side of this change the postdominator roots update at the end
	// is reached even when To is that nearest common dominator; on the '-' side
	// that case returned early.
	static void DeleteEdge(DomTreeT &DT, const BatchUpdatePtr BUI,
	const NodePtr From, const NodePtr To) {
	assert(From && To && "Cannot disconnect nullptrs");
	DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> "
	<< BlockNamePrinter(To) << "\n");

	#ifndef NDEBUG
	// Ensure that the edge was in fact deleted from the CFG before informing
	// the DomTree about it.
	// The check is O(N), so run it only in debug configuration.
	auto IsSuccessor = [BUI](const NodePtr SuccCandidate, const NodePtr Of) {
	auto Successors = ChildrenGetter<IsPostDom>::Get(Of, BUI);
	return llvm::find(Successors, SuccCandidate) != Successors.end();
	};
	(void)IsSuccessor;
	assert(!IsSuccessor(To, From) && "Deleted edge still exists in the CFG!");
	#endif

	const TreeNodePtr FromTN = DT.getNode(From);
	// Deletion in an unreachable subtree -- nothing to do.
	if (!FromTN) return;

	const TreeNodePtr ToTN = DT.getNode(To);
	if (!ToTN) {
	DEBUG(dbgs() << "\tTo (" << BlockNamePrinter(To)
	<< ") already unreachable -- there is no edge to delete\n");
	return;
	}

	const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To);
	const TreeNodePtr NCD = DT.getNode(NCDBlock);

	- // To dominates From -- nothing to do.
	- if (ToTN == NCD) return;
	+ // If To dominates From -- nothing to do.
	+ if (ToTN != NCD) {
	+ DT.DFSInfoValid = false;

	- DT.DFSInfoValid = false;
	+ const TreeNodePtr ToIDom = ToTN->getIDom();
	+ DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
	+ << BlockNamePrinter(ToIDom) << "\n");

	- const TreeNodePtr ToIDom = ToTN->getIDom();
	- DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
	- << BlockNamePrinter(ToIDom) << "\n");
	-
	- // To remains reachable after deletion.
	- // (Based on the caption under Figure 4. from the second paper.)
	- if (FromTN != ToIDom \|\| HasProperSupport(DT, BUI, ToTN))
	- DeleteReachable(DT, BUI, FromTN, ToTN);
	- else
	- DeleteUnreachable(DT, BUI, ToTN);
	+ // To remains reachable after deletion.
	+ // (Based on the caption under Figure 4. from the second paper.)
	+ if (FromTN != ToIDom \|\| HasProperSupport(DT, BUI, ToTN))
	+ DeleteReachable(DT, BUI, FromTN, ToTN);
	+ else
	+ DeleteUnreachable(DT, BUI, ToTN);
	+ }

	if (IsPostDom) UpdateRootsAfterUpdate(DT, BUI);
	}

	// Handles deletions that leave destination nodes reachable.
	// The subtree below the nearest common dominator of FromTN and ToTN is
	// renumbered with a DFS, recomputed with Semi-NCA and re-linked to the
	// existing tree nodes. When that common dominator has no immediate
	// dominator, the whole tree is recalculated instead.
	static void DeleteReachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr FromTN,
	const TreeNodePtr ToTN) {
	DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN) << " -> "
	<< BlockNamePrinter(ToTN) << "\n");
	DEBUG(dbgs() << "\tRebuilding subtree\n");

	// Find the top of the subtree that needs to be rebuilt.
	// (Based on the lemma 2.6 from the second paper.)
	const NodePtr ToIDom =
	DT.findNearestCommonDominator(FromTN->getBlock(), ToTN->getBlock());
	assert(ToIDom \|\| DT.isPostDominator());
	const TreeNodePtr ToIDomTN = DT.getNode(ToIDom);
	assert(ToIDomTN);
	const TreeNodePtr PrevIDomSubTree = ToIDomTN->getIDom();
	// Top of the subtree to rebuild is the root node. Rebuild the tree from
	// scratch.
	if (!PrevIDomSubTree) {
	DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
	CalculateFromScratch(DT, BUI);
	return;
	}

	// Only visit nodes in the subtree starting at To.
	// (The DFS below starts at ToIDom and descends only into nodes whose level
	// is greater than ToIDom's level.)
	const unsigned Level = ToIDomTN->getLevel();
	auto DescendBelow = [Level, &DT](NodePtr, NodePtr To) {
	return DT.getNode(To)->getLevel() > Level;
	};

	DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN) << "\n");

	SemiNCAInfo SNCA(BUI);
	SNCA.runDFS(ToIDom, 0, DescendBelow, 0);
	DEBUG(dbgs() << "\tRunning Semi-NCA\n");
	SNCA.runSemiNCA(DT, Level);
	SNCA.reattachExistingSubtree(DT, PrevIDomSubTree);
	}

	// Checks if a node has proper support, as defined on the page 3 and later
	// explained on the page 7 of the second paper.
	// Returns true as soon as one predecessor of TN's block that has a tree node
	// yields a nearest common dominator with TN's block that differs from TN's
	// block itself; returns false when no predecessor does.
	static bool HasProperSupport(DomTreeT &DT, const BatchUpdatePtr BUI,
	                             const TreeNodePtr TN) {
	  DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n");

	  const NodePtr Block = TN->getBlock();
	  for (const NodePtr Pred : ChildrenGetter<!IsPostDom>::Get(Block, BUI)) {
	    DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n");

	    // Predecessors without a tree node cannot provide support.
	    if (!DT.getNode(Pred))
	      continue;

	    const NodePtr Support = DT.findNearestCommonDominator(Block, Pred);
	    DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n");
	    if (Support == Block)
	      continue;

	    DEBUG(dbgs() << "\t" << BlockNamePrinter(TN)
	                 << " is reachable from support "
	                 << BlockNamePrinter(Support) << "\n");
	    return true;
	  }

	  return false;
	}

	// Handle deletions that make destination node unreachable.
	// (Based on the lemma 2.7 from the second paper.)
	// Postdominators: ToTN's block is recorded as a new root and an edge from
	// the virtual root to ToTN is inserted instead.
	// Dominators: the subtree found by a DFS from ToTN is erased from DT; if
	// nodes outside of it were affected, the subtree under the lowest-level
	// common dominator found (MinNode) is rebuilt with Semi-NCA, or the whole
	// tree is recalculated when MinNode has no immediate dominator.
	static void DeleteUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr ToTN) {
	DEBUG(dbgs() << "Deleting unreachable subtree " << BlockNamePrinter(ToTN)
	<< "\n");
	assert(ToTN);
	assert(ToTN->getBlock());

	if (IsPostDom) {
	// Deletion makes a region reverse-unreachable and creates a new root.
	// Simulate that by inserting an edge from the virtual root to ToTN and
	// adding it as a new root.
	DEBUG(dbgs() << "\tDeletion made a region reverse-unreachable\n");
	DEBUG(dbgs() << "\tAdding new root " << BlockNamePrinter(ToTN) << "\n");
	DT.Roots.push_back(ToTN->getBlock());
	InsertReachable(DT, BUI, DT.getNode(nullptr), ToTN);
	return;
	}

	SmallVector<NodePtr, 16> AffectedQueue;
	const unsigned Level = ToTN->getLevel();

	// Traverse destination node's descendants with greater level in the tree
	// and collect visited nodes.
	// Nodes at ToTN's level or above are not descended into; each of them is
	// recorded once in AffectedQueue.
	auto DescendAndCollect = [Level, &AffectedQueue, &DT](NodePtr, NodePtr To) {
	const TreeNodePtr TN = DT.getNode(To);
	assert(TN);
	if (TN->getLevel() > Level) return true;
	if (llvm::find(AffectedQueue, To) == AffectedQueue.end())
	AffectedQueue.push_back(To);

	return false;
	};

	SemiNCAInfo SNCA(BUI);
	unsigned LastDFSNum =
	SNCA.runDFS(ToTN->getBlock(), 0, DescendAndCollect, 0);

	TreeNodePtr MinNode = ToTN;

	// Identify the top of the subtree to rebuild by finding the NCD of all
	// the affected nodes.
	for (const NodePtr N : AffectedQueue) {
	const TreeNodePtr TN = DT.getNode(N);
	const NodePtr NCDBlock =
	DT.findNearestCommonDominator(TN->getBlock(), ToTN->getBlock());
	assert(NCDBlock \|\| DT.isPostDominator());
	const TreeNodePtr NCD = DT.getNode(NCDBlock);
	assert(NCD);

	DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN)
	<< " with NCD = " << BlockNamePrinter(NCD)
	<< ", MinNode =" << BlockNamePrinter(MinNode) << "\n");
	if (NCD != TN && NCD->getLevel() < MinNode->getLevel()) MinNode = NCD;
	}

	// Root reached, rebuild the whole tree from scratch.
	if (!MinNode->getIDom()) {
	DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
	CalculateFromScratch(DT, BUI);
	return;
	}

	// Erase the unreachable subtree in reverse preorder to process all children
	// before deleting their parent.
	for (unsigned i = LastDFSNum; i > 0; --i) {
	const NodePtr N = SNCA.NumToNode[i];
	const TreeNodePtr TN = DT.getNode(N);
	DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");

	EraseNode(DT, TN);
	}

	// The affected subtree start at the To node -- there's no extra work to do.
	if (MinNode == ToTN) return;

	DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = "
	<< BlockNamePrinter(MinNode) << "\n");
	const unsigned MinLevel = MinNode->getLevel();
	const TreeNodePtr PrevIDom = MinNode->getIDom();
	assert(PrevIDom);
	// The DFS state from the erased subtree is no longer needed.
	SNCA.clear();

	// Identify nodes that remain in the affected subtree.
	// (Nodes erased above no longer have a tree node, hence the null check.)
	auto DescendBelow = [MinLevel, &DT](NodePtr, NodePtr To) {
	const TreeNodePtr ToTN = DT.getNode(To);
	return ToTN && ToTN->getLevel() > MinLevel;
	};
	SNCA.runDFS(MinNode->getBlock(), 0, DescendBelow, 0);

	DEBUG(dbgs() << "Previous IDom(MinNode) = " << BlockNamePrinter(PrevIDom)
	<< "\nRunning Semi-NCA\n");

	// Rebuild the remaining part of affected subtree.
	SNCA.runSemiNCA(DT, MinLevel);
	SNCA.reattachExistingSubtree(DT, PrevIDom);
	}

	// Removes a leaf tree node from the dominator tree: TN is unlinked from its
	// immediate dominator's child list (by swapping it with the last child and
	// popping) and its entry is erased from DT.DomTreeNodes.
	// TN must be non-null, have no children and have an immediate dominator.
	static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) {
	  assert(TN);
	  assert(TN->getNumChildren() == 0 && "Not a tree leaf");

	  const TreeNodePtr IDom = TN->getIDom();
	  assert(IDom);

	  auto &Siblings = IDom->Children;
	  auto Pos = llvm::find(Siblings, TN);
	  assert(Pos != Siblings.end());
	  std::swap(*Pos, Siblings.back());
	  Siblings.pop_back();

	  DT.DomTreeNodes.erase(TN->getBlock());
	}

	//~~
	//===--------------------- DomTree Batch Updater --------------------------===
	//~~

	// Applies a sequence of CFG updates to DT.
	// An empty sequence is a no-op and a single update goes straight to
	// DT.insertEdge / DT.deleteEdge. Otherwise the updates are legalized, the
	// future successor/predecessor sets are seeded from them, and the updates
	// are applied one by one until done or until the tree gets recalculated.
	static void ApplyUpdates(DomTreeT &DT, ArrayRef<UpdateT> Updates) {
	  const size_t Count = Updates.size();
	  if (Count == 0)
	    return;

	  // Take the fast path for a single update and avoid running the batch
	  // update machinery.
	  if (Count == 1) {
	    const auto &Only = Updates.front();
	    const bool IsInsertion = Only.getKind() == UpdateKind::Insert;
	    if (IsInsertion)
	      DT.insertEdge(Only.getFrom(), Only.getTo());
	    else
	      DT.deleteEdge(Only.getFrom(), Only.getTo());
	    return;
	  }

	  BatchUpdateInfo BUI;
	  LegalizeUpdates(Updates, BUI.Updates);

	  const size_t NumLegalized = BUI.Updates.size();
	  BUI.FutureSuccessors.reserve(NumLegalized);
	  BUI.FuturePredecessors.reserve(NumLegalized);

	  // Use the legalized future updates to initialize future successors and
	  // predecessors. Note that these sets will only decrease size over time, as
	  // the next CFG snapshots slowly approach the actual (current) CFG.
	  for (UpdateT &Pending : BUI.Updates) {
	    BUI.FutureSuccessors[Pending.getFrom()].insert(
	        {Pending.getTo(), Pending.getKind()});
	    BUI.FuturePredecessors[Pending.getTo()].insert(
	        {Pending.getFrom(), Pending.getKind()});
	  }

	  DEBUG(dbgs() << "About to apply " << NumLegalized << " updates\n");
	  DEBUG(if (NumLegalized < 32) for (const auto &U
	                                    : reverse(BUI.Updates)) dbgs()
	        << '\t' << U << "\n");
	  DEBUG(dbgs() << "\n");

	  // If the DominatorTree was recalculated at some point, stop the batch
	  // updates. Full recalculations ignore batch updates and look at the actual
	  // CFG.
	  size_t Applied = 0;
	  while (Applied < NumLegalized && !BUI.IsRecalculated) {
	    ApplyNextUpdate(DT, BUI);
	    ++Applied;
	  }
	}

	// This function serves double purpose:
	// a) It removes redundant updates, which makes it easier to reverse-apply
	// them when traversing CFG.
	// b) It optimizes away updates that cancel each other out, as the end result
	// is the same.
	//
	// It relies on the property of the incremental updates that says that the
	// order of updates doesn't matter. This allows us to reorder them and end up
	// with the exact same DomTree every time.
	//
	// Following the same logic, the function doesn't care about the order of
	// input updates, so it's OK to pass it an unordered sequence of updates that
	// doesn't make sense when applied sequentially, e.g. performing double
	// insertions or deletions and then doing an opposite update.
	//
	// In the future, it should be possible to schedule updates in a way that
	// minimizes the amount of work that needs to be done during incremental
	// updates.
	//
	// Result is cleared first and then filled with the legalized updates. For
	// postdominators the edges stored in Result have From and To swapped.
	static void LegalizeUpdates(ArrayRef<UpdateT> AllUpdates,
	SmallVectorImpl<UpdateT> &Result) {
	DEBUG(dbgs() << "Legalizing " << AllUpdates.size() << " updates\n");
	// Count the total number of insertions of each edge.
	// Each insertion adds 1 and deletion subtracts 1. The end number should be
	// one of {-1 (deletion), 0 (NOP), +1 (insertion)}. Otherwise, the sequence
	// of updates contains multiple updates of the same kind and we assert for
	// that case.
	SmallDenseMap<std::pair<NodePtr, NodePtr>, int, 4> Operations;
	Operations.reserve(AllUpdates.size());

	for (const auto &U : AllUpdates) {
	NodePtr From = U.getFrom();
	NodePtr To = U.getTo();
	if (IsPostDom) std::swap(From, To); // Reverse edge for postdominators.

	Operations[{From, To}] += (U.getKind() == UpdateKind::Insert ? 1 : -1);
	}

	Result.clear();
	Result.reserve(Operations.size());
	for (auto &Op : Operations) {
	const int NumInsertions = Op.second;
	assert(std::abs(NumInsertions) <= 1 && "Unbalanced operations!");
	// Insertions and deletions of this edge cancelled out.
	if (NumInsertions == 0) continue;
	const UpdateKind UK =
	NumInsertions > 0 ? UpdateKind::Insert : UpdateKind::Delete;
	Result.push_back({UK, Op.first.first, Op.first.second});
	}

	// Make the order consistent by not relying on pointer values within the
	// set. Reuse the old Operations map.
	// In the future, we should sort by something else to minimize the amount
	// of work needed to perform the series of updates.
	// From here on Operations maps each edge to the index of its last
	// occurrence in AllUpdates (later occurrences overwrite earlier ones).
	for (size_t i = 0, e = AllUpdates.size(); i != e; ++i) {
	const auto &U = AllUpdates[i];
	if (!IsPostDom)
	Operations[{U.getFrom(), U.getTo()}] = int(i);
	else
	Operations[{U.getTo(), U.getFrom()}] = int(i);
	}

	// Sort by decreasing index, so the update that occurs earliest in AllUpdates
	// ends up at the back of Result.
	std::sort(Result.begin(), Result.end(),
	[&Operations](const UpdateT &A, const UpdateT &B) {
	return Operations[{A.getFrom(), A.getTo()}] >
	Operations[{B.getFrom(), B.getTo()}];
	});
	}

	// Pops the last pending update from BUI.Updates, removes it from the future
	// successor and predecessor sets (dropping map entries that become empty),
	// and then applies it to DT as an edge insertion or deletion.
	// BUI.Updates must not be empty.
	static void ApplyNextUpdate(DomTreeT &DT, BatchUpdateInfo &BUI) {
	  assert(!BUI.Updates.empty() && "No updates to apply!");
	  UpdateT CurrentUpdate = BUI.Updates.pop_back_val();
	  DEBUG(dbgs() << "Applying update: " << CurrentUpdate << "\n");

	  const NodePtr From = CurrentUpdate.getFrom();
	  const NodePtr To = CurrentUpdate.getTo();
	  const UpdateKind Kind = CurrentUpdate.getKind();

	  // Move to the next snapshot of the CFG by removing the reverse-applied
	  // current update.
	  auto &SuccsOfFrom = BUI.FutureSuccessors[From];
	  SuccsOfFrom.erase({To, Kind});
	  if (SuccsOfFrom.empty())
	    BUI.FutureSuccessors.erase(From);

	  auto &PredsOfTo = BUI.FuturePredecessors[To];
	  PredsOfTo.erase({From, Kind});
	  if (PredsOfTo.empty())
	    BUI.FuturePredecessors.erase(To);

	  if (Kind == UpdateKind::Insert)
	    InsertEdge(DT, &BUI, From, To);
	  else
	    DeleteEdge(DT, &BUI, From, To);
	}

	//~~
	//===--------------- DomTree correctness verification ---------------------===
	//~~

	// Check if the tree has correct roots. A DominatorTree always has a single
	// root which is the function's entry node. A PostDominatorTree can have
	// multiple roots - one for each node with no successors and for infinite
	// loops.
	bool verifyRoots(const DomTreeT &DT) {
	  // Returns true when DT's recorded roots are consistent; on the first
	  // inconsistency a diagnostic is written to errs() and false is returned.

	  // A tree that has no parent attached must not record any roots.
	  if (!DT.Parent && !DT.Roots.empty()) {
	    errs() << "Tree has no parent but has roots!\n";
	    errs().flush();
	    return false;
	  }

	  // Forward dominator trees: a root must exist and must be the entry node.
	  if (!IsPostDom) {
	    if (DT.Roots.empty()) {
	      errs() << "Tree doesn't have a root!\n";
	      errs().flush();
	      return false;
	    }

	    if (DT.getRoot() != GetEntryNode(DT)) {
	      errs() << "Tree's root is not its parent's entry node!\n";
	      errs().flush();
	      return false;
	    }
	  }

	  // Recompute the roots and compare them with the stored ones. The order is
	  // irrelevant, hence the size check followed by a permutation check.
	  RootsT ComputedRoots = FindRoots(DT, nullptr);
	  if (DT.Roots.size() != ComputedRoots.size() ||
	      !std::is_permutation(DT.Roots.begin(), DT.Roots.end(),
	                           ComputedRoots.begin())) {
	    errs() << "Tree has different roots than freshly computed ones!\n";
	    errs() << "\tPDT roots: ";
	    for (const NodePtr N : DT.Roots) errs() << BlockNamePrinter(N) << ", ";
	    errs() << "\n\tComputed roots: ";
	    for (const NodePtr N : ComputedRoots)
	      errs() << BlockNamePrinter(N) << ", ";
	    errs() << "\n";
	    errs().flush();
	    return false;
	  }

	  return true;
	}

	// Checks if the tree contains all reachable nodes in the input graph.
	bool verifyReachability(const DomTreeT &DT) {
	clear();
	doFullDFSWalk(DT, AlwaysDescend);

	for (auto &NodeToTN : DT.DomTreeNodes) {
	const TreeNodePtr TN = NodeToTN.second.get();
	const NodePtr BB = TN->getBlock();

	// Virtual root has a corresponding virtual CFG node.
	if (DT.isVirtualRoot(TN)) continue;

	if (NodeToInfo.count(BB) == 0) {
	errs() << "DomTree node " << BlockNamePrinter(BB)
	<< " not found by DFS walk!\n";
	errs().flush();

	return false;
	}
	}

	for (const NodePtr N : NumToNode) {
	if (N && !DT.getNode(N)) {
	errs() << "CFG node " << BlockNamePrinter(N)
	<< " not found in the DomTree!\n";
	errs().flush();

	return false;
	}
	}

	return true;
	}

	// Check if for every parent with a level L in the tree all of its children
	// have level L + 1.
	static bool VerifyLevels(const DomTreeT &DT) {
	for (auto &NodeToTN : DT.DomTreeNodes) {
	const TreeNodePtr TN = NodeToTN.second.get();
	const NodePtr BB = TN->getBlock();
	if (!BB) continue;

	const TreeNodePtr IDom = TN->getIDom();
	if (!IDom && TN->getLevel() != 0) {
	errs() << "Node without an IDom " << BlockNamePrinter(BB)
	<< " has a nonzero level " << TN->getLevel() << "!\n";
	errs().flush();

	return false;
	}

	if (IDom && TN->getLevel() != IDom->getLevel() + 1) {
	errs() << "Node " << BlockNamePrinter(BB) << " has level "
	<< TN->getLevel() << " while its IDom "
	<< BlockNamePrinter(IDom->getBlock()) << " has level "
	<< IDom->getLevel() << "!\n";
	errs().flush();

	return false;
	}
	}

	return true;
	}

	// Check if the computed DFS numbers are correct. Note that DFS info may not
	// be valid, and when that is the case, we don't verify the numbers.
	static bool VerifyDFSNumbers(const DomTreeT &DT) {
	  // Nothing to verify when the DFS info is marked invalid or the tree has no
	  // parent attached.
	  if (!DT.DFSInfoValid || !DT.Parent)
	    return true;

	  // For post-dominator trees the root tree node is looked up under the null
	  // block; otherwise it is the node of the first recorded root.
	  const NodePtr RootBB = IsPostDom ? nullptr : DT.getRoots()[0];
	  const TreeNodePtr Root = DT.getNode(RootBB);

	  // Prints "<node> {DFSIn, DFSOut}" to errs().
	  auto PrintNodeAndDFSNums = [](const TreeNodePtr TN) {
	    errs() << BlockNamePrinter(TN) << " {" << TN->getDFSNumIn() << ", "
	           << TN->getDFSNumOut() << '}';
	  };

	  // Verify the root's DFS In number. Although DFS numbering would also work
	  // if we started from some other value, we assume 0-based numbering.
	  if (Root->getDFSNumIn() != 0) {
	    errs() << "DFSIn number for the tree root is not:\n\t";
	    PrintNodeAndDFSNums(Root);
	    errs() << '\n';
	    errs().flush();
	    return false;
	  }

	  // For each tree node verify if children's DFS numbers cover their parent's
	  // DFS numbers with no gaps.
	  for (const auto &NodeToTN : DT.DomTreeNodes) {
	    const TreeNodePtr Node = NodeToTN.second.get();

	    // Handle tree leaves.
	    if (Node->getChildren().empty()) {
	      if (Node->getDFSNumIn() + 1 != Node->getDFSNumOut()) {
	        errs() << "Tree leaf should have DFSOut = DFSIn + 1:\n\t";
	        PrintNodeAndDFSNums(Node);
	        errs() << '\n';
	        errs().flush();
	        return false;
	      }

	      continue;
	    }

	    // Make a copy and sort it such that it is possible to check if there are
	    // no gaps between DFS numbers of adjacent children.
	    SmallVector<TreeNodePtr, 8> Children(Node->begin(), Node->end());
	    std::sort(Children.begin(), Children.end(),
	              [](const TreeNodePtr Ch1, const TreeNodePtr Ch2) {
	                return Ch1->getDFSNumIn() < Ch2->getDFSNumIn();
	              });

	    // Reports the parent, the offending child (and optionally the sibling it
	    // clashes with), followed by the full sorted child list.
	    auto PrintChildrenError = [Node, &Children, PrintNodeAndDFSNums](
	        const TreeNodePtr FirstCh, const TreeNodePtr SecondCh) {
	      assert(FirstCh);

	      errs() << "Incorrect DFS numbers for:\n\tParent ";
	      PrintNodeAndDFSNums(Node);

	      errs() << "\n\tChild ";
	      PrintNodeAndDFSNums(FirstCh);

	      if (SecondCh) {
	        errs() << "\n\tSecond child ";
	        PrintNodeAndDFSNums(SecondCh);
	      }

	      errs() << "\nAll children: ";
	      for (const TreeNodePtr Ch : Children) {
	        PrintNodeAndDFSNums(Ch);
	        errs() << ", ";
	      }

	      errs() << '\n';
	      errs().flush();
	    };

	    // The first child's interval must start right after the parent's DFSIn.
	    if (Children.front()->getDFSNumIn() != Node->getDFSNumIn() + 1) {
	      PrintChildrenError(Children.front(), nullptr);
	      return false;
	    }

	    // The last child's interval must end right before the parent's DFSOut.
	    if (Children.back()->getDFSNumOut() + 1 != Node->getDFSNumOut()) {
	      PrintChildrenError(Children.back(), nullptr);
	      return false;
	    }

	    // Adjacent children must abut. Children is non-empty here, so
	    // size() - 1 cannot wrap around.
	    for (size_t i = 0, e = Children.size() - 1; i != e; ++i) {
	      if (Children[i]->getDFSNumOut() + 1 != Children[i + 1]->getDFSNumIn()) {
	        PrintChildrenError(Children[i], Children[i + 1]);
	        return false;
	      }
	    }
	  }

	  return true;
	}

	// The below routines verify the correctness of the dominator tree relative to
	// the CFG it's coming from. A tree is a dominator tree iff it has two
	// properties, called the parent property and the sibling property. Tarjan
	// and Lengauer prove (but don't explicitly name) the properties as part of
	// the proofs in their 1972 paper, but the proofs are mostly part of proving
	// things about semidominators and idoms, and some of them are simply asserted
	// based on even earlier papers (see, e.g., lemma 2). Some papers refer to
	// these properties as "valid" and "co-valid". See, e.g., "Dominators,
	// directed bipolar orders, and independent spanning trees" by Loukas
	// Georgiadis and Robert E. Tarjan, as well as "Dominator Tree Verification
	// and Vertex-Disjoint Paths" by the same authors.

	// A very simple and direct explanation of these properties can be found in
	// "An Experimental Study of Dynamic Dominators", found at
	// https://arxiv.org/abs/1604.02711

	// The easiest way to think of the parent property is that it's a requirement
	// of being a dominator. Let's just take immediate dominators. For PARENT to
	// be an immediate dominator of CHILD, all paths in the CFG must go through
	// PARENT before they hit CHILD. This implies that if you were to cut PARENT
	// out of the CFG, there should be no paths to CHILD that are reachable. If
	// there are, then you now have a path from the root to CHILD that goes around
	// PARENT and still reaches CHILD, which by definition, means PARENT can't be
	// a dominator of CHILD (let alone an immediate one).

	// The sibling property is similar. It says that for each pair of sibling
	// nodes in the dominator tree (LEFT and RIGHT), they must not dominate each
	// other. If sibling LEFT dominated sibling RIGHT, it means there are no
	// paths in the CFG from sibling LEFT to sibling RIGHT that do not go through
	// LEFT, and thus, LEFT is really an ancestor (in the dominator tree) of
	// RIGHT, not a sibling.

	// It is possible to verify the parent and sibling properties in
	// linear time, but the algorithms are complex. Instead, we do it in a
	// straightforward N^2 and N^3 way below, using direct path reachability.


	// Checks if the tree has the parent property: if for all edges from V to W in
	// the input graph, such that V is reachable, the parent of W in the tree is
	// an ancestor of V in the tree.
	//
	// This means that if a node gets disconnected from the graph, then all of
	// the nodes it dominated previously will now become unreachable.
	bool verifyParentProperty(const DomTreeT &DT) {
	  for (auto &NodeToTN : DT.DomTreeNodes) {
	    const TreeNodePtr TN = NodeToTN.second.get();
	    const NodePtr BB = TN->getBlock();
	    // Nodes without a block, or without children, have nothing to verify.
	    if (!BB || TN->getChildren().empty()) continue;

	    DEBUG(dbgs() << "Verifying parent property of node "
	                 << BlockNamePrinter(TN) << "\n");
	    // Redo the DFS walk while refusing to follow any edge that starts or
	    // ends at BB, i.e. as if BB had been cut out of the graph.
	    clear();
	    doFullDFSWalk(DT, [BB](NodePtr From, NodePtr To) {
	      return From != BB && To != BB;
	    });

	    // None of BB's tree children may have been visited by that walk.
	    for (TreeNodePtr Child : TN->getChildren())
	      if (NodeToInfo.count(Child->getBlock()) != 0) {
	        errs() << "Child " << BlockNamePrinter(Child)
	               << " reachable after its parent " << BlockNamePrinter(BB)
	               << " is removed!\n";
	        errs().flush();

	        return false;
	      }
	  }

	  return true;
	}

	// Check if the tree has sibling property: if a node V does not dominate a
	// node W for all siblings V and W in the tree.
	//
	// This means that if a node gets disconnected from the graph, then all of its
	// siblings will now still be reachable.
	bool verifySiblingProperty(const DomTreeT &DT) {
	  for (auto &NodeToTN : DT.DomTreeNodes) {
	    const TreeNodePtr TN = NodeToTN.second.get();
	    const NodePtr BB = TN->getBlock();
	    // Nodes without a block, or without children, have no siblings to check.
	    if (!BB || TN->getChildren().empty()) continue;

	    const auto &Siblings = TN->getChildren();
	    for (const TreeNodePtr N : Siblings) {
	      // Redo the DFS walk while refusing to follow any edge that starts or
	      // ends at N's block, i.e. as if that block had been removed.
	      clear();
	      NodePtr BBN = N->getBlock();
	      doFullDFSWalk(DT, [BBN](NodePtr From, NodePtr To) {
	        return From != BBN && To != BBN;
	      });

	      // Every other sibling must still have been visited by that walk.
	      for (const TreeNodePtr S : Siblings) {
	        if (S == N) continue;

	        if (NodeToInfo.count(S->getBlock()) == 0) {
	          errs() << "Node " << BlockNamePrinter(S)
	                 << " not reachable when its sibling " << BlockNamePrinter(N)
	                 << " is removed!\n";
	          errs().flush();

	          return false;
	        }
	      }
	    }
	  }

	  return true;
	}
	};

	template <class DomTreeT>
	void Calculate(DomTreeT &DT) {
	SemiNCAInfo<DomTreeT>::CalculateFromScratch(DT, nullptr);
	}

	template <class DomTreeT>
	void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
	typename DomTreeT::NodePtr To) {
	if (DT.isPostDominator()) std::swap(From, To);
	SemiNCAInfo<DomTreeT>::InsertEdge(DT, nullptr, From, To);
	}

	template <class DomTreeT>
	void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
	typename DomTreeT::NodePtr To) {
	if (DT.isPostDominator()) std::swap(From, To);
	SemiNCAInfo<DomTreeT>::DeleteEdge(DT, nullptr, From, To);
	}

	template <class DomTreeT>
	void ApplyUpdates(DomTreeT &DT,
	ArrayRef<typename DomTreeT::UpdateType> Updates) {
	SemiNCAInfo<DomTreeT>::ApplyUpdates(DT, Updates);
	}

	template <class DomTreeT>
	bool Verify(const DomTreeT &DT) {
	SemiNCAInfo<DomTreeT> SNCA(nullptr);
	return SNCA.verifyRoots(DT) && SNCA.verifyReachability(DT) &&
	SNCA.VerifyLevels(DT) && SNCA.verifyParentProperty(DT) &&
	SNCA.verifySiblingProperty(DT) && SNCA.VerifyDFSNumbers(DT);
	}

	} // namespace DomTreeBuilder
	} // namespace llvm

	#undef DEBUG_TYPE

	#endif
	Index: vendor/llvm/dist-release_60/include/llvm/Transforms/Vectorize/SLPVectorizer.h
	===================================================================
	--- vendor/llvm/dist-release_60/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 328361)
	+++ vendor/llvm/dist-release_60/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 328362)
	@@ -1,157 +1,152 @@
	//===- SLPVectorizer.h ------------------------------------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
	// stores that can be put together into vector-stores. Next, it attempts to
	// construct vectorizable tree using the use-def chains. If a profitable tree
	// was found, the SLP vectorizer performs vectorization on the tree.
	//
	// The pass is inspired by the work described in the paper:
	// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
	#define LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H

	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/IR/PassManager.h"
	#include "llvm/IR/ValueHandle.h"

	namespace llvm {

	class AssumptionCache;
	class BasicBlock;
	class CmpInst;
	class DataLayout;
	class DemandedBits;
	class DominatorTree;
	class Function;
	class InsertElementInst;
	class InsertValueInst;
	class Instruction;
	class LoopInfo;
	class OptimizationRemarkEmitter;
	class PHINode;
	class ScalarEvolution;
	class StoreInst;
	class TargetLibraryInfo;
	class TargetTransformInfo;
	class Value;

	/// A private "module" namespace for types and utilities used by this pass.
	/// These are implementation details and should not be used by clients.
	namespace slpvectorizer {

	class BoUpSLP;

	} // end namespace slpvectorizer

	/// Pass object for the SLP vectorizer. Holds pointers to the analyses it
	/// works with plus the per-basic-block seed collections (Stores, GEPs).
	struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
	using StoreList = SmallVector<StoreInst *, 8>;
	using StoreListMap = MapVector<Value *, StoreList>;
	using WeakTrackingVHList = SmallVector<WeakTrackingVH, 8>;
	using WeakTrackingVHListMap = MapVector<Value *, WeakTrackingVHList>;

	// Analysis handles; every one of them starts out null.
	ScalarEvolution *SE = nullptr;
	TargetTransformInfo *TTI = nullptr;
	TargetLibraryInfo *TLI = nullptr;
	AliasAnalysis *AA = nullptr;
	LoopInfo *LI = nullptr;
	DominatorTree *DT = nullptr;
	AssumptionCache *AC = nullptr;
	DemandedBits *DB = nullptr;
	const DataLayout *DL = nullptr;

	public:
	PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);

	// Glue for old PM.
	// All analyses are passed as pointers, matching the pointer members above
	// (the classes are only forward-declared in this header).
	bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
	TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
	DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
	OptimizationRemarkEmitter *ORE_);

	private:
	/// \brief Collect store and getelementptr instructions and organize them
	/// according to the underlying object of their pointer operands. We sort the
	/// instructions by their underlying objects to reduce the cost of
	/// consecutive access queries.
	///
	/// TODO: We can further reduce this cost if we flush the chain creation
	/// every time we run into a memory barrier.
	void collectSeedInstructions(BasicBlock *BB);

	/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
	bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);

	/// \brief Try to vectorize a list of operands.
	- /// \@param BuildVector A list of users to ignore for the purpose of
	- /// scheduling and cost estimation when NeedExtraction
	- /// is false.
	/// \returns true if a value was vectorized.
	bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
	- ArrayRef<Value *> BuildVector = None,
	- bool AllowReorder = false,
	- bool NeedExtraction = false);
	+ bool AllowReorder = false);

	/// \brief Try to vectorize a chain that may start at the operands of \p I.
	bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);

	/// \brief Vectorize the store instructions collected in Stores.
	bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);

	/// \brief Vectorize the index computations of the getelementptr instructions
	/// collected in GEPs.
	bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

	/// Try to find horizontal reduction or otherwise vectorize a chain of binary
	/// operators.
	bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
	slpvectorizer::BoUpSLP &R,
	TargetTransformInfo *TTI);

	/// Try to vectorize trees that start at insertvalue instructions.
	bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,
	slpvectorizer::BoUpSLP &R);

	/// Try to vectorize trees that start at insertelement instructions.
	bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB,
	slpvectorizer::BoUpSLP &R);

	/// Try to vectorize trees that start at compare instructions.
	bool vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, slpvectorizer::BoUpSLP &R);

	/// Tries to vectorize constructs started from CmpInst, InsertValueInst or
	/// InsertElementInst instructions.
	bool vectorizeSimpleInstructions(SmallVectorImpl<WeakVH> &Instructions,
	BasicBlock *BB, slpvectorizer::BoUpSLP &R);

	/// \brief Scan the basic block and look for patterns that are likely to start
	/// a vectorization chain.
	bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

	/// Try to vectorize one chain of stores; VecRegSize is supplied by the
	/// caller (presumably the vector register width to target; confirm against
	/// the implementation).
	bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
	unsigned VecRegSize);

	/// Try to vectorize the given list of store instructions.
	bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);

	/// The store instructions in a basic block organized by base pointer.
	StoreListMap Stores;

	/// The getelementptr instructions in a basic block organized by base pointer.
	WeakTrackingVHListMap GEPs;
	};

	} // end namespace llvm

	#endif // LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
	Index: vendor/llvm/dist-release_60/lib/CodeGen/CodeGenPrepare.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/CodeGenPrepare.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/CodeGenPrepare.cpp (revision 328362)
	@@ -1,6583 +1,6588 @@
	//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass munges the code in the input function to better prepare it for
	// SelectionDAG-based code generation. This works around limitations in its
	// basic-block-at-a-time approach. It should eventually be removed.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/BlockFrequencyInfo.h"
	#include "llvm/Analysis/BranchProbabilityInfo.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/ProfileSummaryInfo.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/MDBuilder.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Statepoint.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/IR/ValueMap.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/BlockFrequency.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
	#include "llvm/Transforms/Utils/BypassSlowDivision.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <limits>
	#include <memory>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "codegenprepare"

	// Pass statistics. Each STATISTIC declares a named counter together with
	// its human-readable description.
	STATISTIC(NumBlocksElim, "Number of blocks eliminated");
	STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
	STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
	STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
	                      "sunken Cmps");
	STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
	                       "of sunken Casts");
	STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
	                          "computations were sunk");
	STATISTIC(NumMemoryInstsPhiCreated,
	          "Number of phis created when address "
	          "computations were sunk to memory instructions");
	STATISTIC(NumMemoryInstsSelectCreated,
	          "Number of select created when address "
	          "computations were sunk to memory instructions");
	// "[s|z]ext" covers both sext and zext; the literal uses a plain '|'
	// because "\|" is not a valid C++ escape sequence.
	STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
	STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
	STATISTIC(NumAndsAdded,
	          "Number of and mask instructions added to form ext loads");
	STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
	STATISTIC(NumRetsDup, "Number of return instructions duplicated");
	STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
	STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
	STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");

	// Hidden command-line switches controlling individual CodeGenPrepare
	// transformations. Each option's cl::desc text is the authoritative
	// description; cl::init gives its default value.

	// --- Switches that turn whole groups of optimizations on or off. ---
	static cl::opt<bool> DisableBranchOpts(
	    "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
	    cl::desc("Disable branch optimizations in CodeGenPrepare"));

	static cl::opt<bool>
	    DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
	                  cl::desc("Disable GC optimizations in CodeGenPrepare"));

	static cl::opt<bool> DisableSelectToBranch(
	    "disable-cgp-select2branch", cl::Hidden, cl::init(false),
	    cl::desc("Disable select to branch conversion."));

	static cl::opt<bool> AddrSinkUsingGEPs(
	    "addr-sink-using-gep", cl::Hidden, cl::init(true),
	    cl::desc("Address sinking in CGP using GEPs."));

	static cl::opt<bool> EnableAndCmpSinking(
	    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
	    cl::desc("Enable sinking and/cmp into branches."));

	// --- store(extractelement) and ext(load) promotion controls. ---
	static cl::opt<bool> DisableStoreExtract(
	    "disable-cgp-store-extract", cl::Hidden, cl::init(false),
	    cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));

	static cl::opt<bool> StressStoreExtract(
	    "stress-cgp-store-extract", cl::Hidden, cl::init(false),
	    cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));

	static cl::opt<bool> DisableExtLdPromotion(
	    "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
	    cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
	             "CodeGenPrepare"));

	static cl::opt<bool> StressExtLdPromotion(
	    "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
	    cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
	             "optimization in CodeGenPrepare"));

	// --- Block merging, section prefixes and store splitting. ---
	static cl::opt<bool> DisablePreheaderProtect(
	    "disable-preheader-prot", cl::Hidden, cl::init(false),
	    cl::desc("Disable protection against removing loop preheaders"));

	static cl::opt<bool> ProfileGuidedSectionPrefix(
	    "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
	    cl::desc("Use profile info to add section prefix for hot/cold functions"));

	static cl::opt<unsigned> FreqRatioToSkipMerge(
	    "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
	    cl::desc("Skip merging empty blocks if (frequency of empty block) / "
	             "(frequency of destination block) is greater than this ratio"));

	static cl::opt<bool> ForceSplitStore(
	    "force-split-store", cl::Hidden, cl::init(false),
	    cl::desc("Force store splitting no matter what the target query says."));

	static cl::opt<bool>
	    EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
	        cl::desc("Enable merging of redundant sexts when one is dominating"
	                 " the other."), cl::init(true));

	// --- Address-sinking / addressing-mode combination controls. ---
	static cl::opt<bool> DisableComplexAddrModes(
	    "disable-complex-addr-modes", cl::Hidden, cl::init(false),
	    cl::desc("Disables combining addressing modes with different parts "
	             "in optimizeMemoryInst."));

	static cl::opt<bool>
	    AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
	                    cl::desc("Allow creation of Phis in Address sinking."));

	static cl::opt<bool>
	    AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(false),
	                       cl::desc("Allow creation of selects in Address sinking."));

	static cl::opt<bool> AddrSinkCombineBaseReg(
	    "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
	    cl::desc("Allow combining of BaseReg field in Address sinking."));

	static cl::opt<bool> AddrSinkCombineBaseGV(
	    "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
	    cl::desc("Allow combining of BaseGV field in Address sinking."));

	static cl::opt<bool> AddrSinkCombineBaseOffs(
	    "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
	    cl::desc("Allow combining of BaseOffs field in Address sinking."));

	static cl::opt<bool> AddrSinkCombineScaledReg(
	    "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
	    cl::desc("Allow combining of ScaledReg field in Address sinking."));

	namespace {

	using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
	using TypeIsSExt = PointerIntPair<Type *, 1, bool>;
	using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
	using SExts = SmallVector<Instruction *, 16>;
	using ValueToSExts = DenseMap<Value *, SExts>;

	class TypePromotionTransaction;

	/// Legacy-pass-manager FunctionPass that prepares a function for code
	/// generation. The per-function state below is (re)initialised at the top
	/// of runOnFunction.
	class CodeGenPrepare : public FunctionPass {
	  // Target and analysis handles. All start out null; SubtargetInfo and TRI
	  // are only assigned when a TargetPassConfig is available, so they must
	  // not be left indeterminate.
	  const TargetMachine *TM = nullptr;
	  const TargetSubtargetInfo *SubtargetInfo = nullptr;
	  const TargetLowering *TLI = nullptr;
	  const TargetRegisterInfo *TRI = nullptr;
	  const TargetTransformInfo *TTI = nullptr;
	  const TargetLibraryInfo *TLInfo = nullptr;
	  const LoopInfo *LI = nullptr;
	  std::unique_ptr<BlockFrequencyInfo> BFI;
	  std::unique_ptr<BranchProbabilityInfo> BPI;

	  /// As we scan instructions optimizing them, this is the next instruction
	  /// to optimize. Transforms that can invalidate this should update it.
	  BasicBlock::iterator CurInstIterator;

	  /// Keeps track of non-local addresses that have been sunk into a block.
	  /// This allows us to avoid inserting duplicate code for blocks with
	  /// multiple load/stores of the same address. The usage of WeakTrackingVH
	  /// enables SunkAddrs to be treated as a cache whose entries can be
	  /// invalidated if a sunken address computation has been erased.
	  ValueMap<Value*, WeakTrackingVH> SunkAddrs;

	  /// Keeps track of all instructions inserted for the current function.
	  SetOfInstrs InsertedInsts;

	  /// Keeps track of the type of the related instruction before their
	  /// promotion for the current function.
	  InstrToOrigTy PromotedInsts;

	  /// Keep track of instructions removed during promotion.
	  SetOfInstrs RemovedInsts;

	  /// Keep track of sext chains based on their initial value.
	  DenseMap<Value *, Instruction *> SeenChainsForSExt;

	  /// Keep track of SExt promoted.
	  ValueToSExts ValToSExtendedUses;

	  /// True if CFG is modified in any way.
	  bool ModifiedDT;

	  /// True if optimizing for size.
	  bool OptSize;

	  /// DataLayout for the Function being processed.
	  const DataLayout *DL = nullptr;

	public:
	  static char ID; // Pass identification, replacement for typeid

	  CodeGenPrepare() : FunctionPass(ID) {
	    initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
	  }

	  bool runOnFunction(Function &F) override;

	  StringRef getPassName() const override { return "CodeGen Prepare"; }

	  /// Declares the analyses this pass requires.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    // FIXME: When we can selectively preserve passes, preserve the domtree.
	    AU.addRequired<ProfileSummaryInfoWrapperPass>();
	    AU.addRequired<TargetLibraryInfoWrapperPass>();
	    AU.addRequired<TargetTransformInfoWrapperPass>();
	    AU.addRequired<LoopInfoWrapperPass>();
	  }

	private:
	  // Block-level cleanups. Blocks are handled through pointers throughout.
	  bool eliminateFallThrough(Function &F);
	  bool eliminateMostlyEmptyBlocks(Function &F);
	  BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
	  bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
	  void eliminateMostlyEmptyBlock(BasicBlock *BB);
	  bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
	                                     bool isPreheader);

	  // Per-block / per-instruction optimizations.
	  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
	  bool optimizeInst(Instruction *I, bool &ModifiedDT);
	  bool optimizeMemoryInst(Instruction *I, Value *Addr,
	                          Type *AccessTy, unsigned AS);
	  bool optimizeInlineAsmInst(CallInst *CS);
	  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
	  bool optimizeExt(Instruction *&I);
	  bool optimizeExtUses(Instruction *I);
	  bool optimizeLoadExt(LoadInst *I);
	  bool optimizeSelectInst(SelectInst *SI);
	  bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
	  bool optimizeSwitchInst(SwitchInst *CI);
	  bool optimizeExtractElementInst(Instruction *Inst);
	  bool dupRetToEnableTailCallOpts(BasicBlock *BB);
	  bool placeDbgValues(Function &F);

	  // Extension promotion helpers.
	  bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
	                    LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
	  bool tryToPromoteExts(TypePromotionTransaction &TPT,
	                        const SmallVectorImpl<Instruction *> &Exts,
	                        SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
	                        unsigned CreatedInstsCost = 0);
	  bool mergeSExts(Function &F);
	  bool performAddressTypePromotion(
	      Instruction *&Inst,
	      bool AllowPromotionWithoutCommonHeader,
	      bool HasPromoted, TypePromotionTransaction &TPT,
	      SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
	  bool splitBranchCondition(Function &F);
	  bool simplifyOffsetableRelocate(Instruction &I);
	};

	} // end anonymous namespace

	// Out-of-class definition of the static pass identifier declared in
	// CodeGenPrepare.
	char CodeGenPrepare::ID = 0;

	// Pass registration under the DEBUG_TYPE name, listing
	// ProfileSummaryInfoWrapperPass as a dependency.
	INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
	"Optimize for code generation", false, false)
	INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
	INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
	"Optimize for code generation", false, false)

	// Factory: returns a newly heap-allocated CodeGenPrepare pass object.
	FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }

	/// Entry point of the pass: prepare \p F for instruction selection.
	///
	/// Refreshes the per-function state (data layout, target hooks when a
	/// TargetPassConfig is available, loop info, branch probabilities and block
	/// frequencies) and then runs, in order: profile-guided section prefixing,
	/// slow-division bypassing, mostly-empty block elimination, debug-value
	/// placement, branch-condition splitting, indirectbr critical-edge splitting,
	/// the per-block optimization fixed point, dead-block cleanup after constant
	/// folding terminators, and gc.relocate simplification.
	///
	/// \returns true if any step modified the function.
	bool CodeGenPrepare::runOnFunction(Function &F) {
	  if (skipFunction(F))
	    return false;

	  DL = &F.getParent()->getDataLayout();

	  bool EverMadeChange = false;
	  // Clear per function information.
	  InsertedInsts.clear();
	  PromotedInsts.clear();

	  ModifiedDT = false;
	  // Target hooks are only refreshed when a TargetPassConfig is present.
	  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
	    TM = &TPC->getTM<TargetMachine>();
	    SubtargetInfo = TM->getSubtargetImpl(F);
	    TLI = SubtargetInfo->getTargetLowering();
	    TRI = SubtargetInfo->getRegisterInfo();
	  }
	  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
	  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
	  BPI.reset(new BranchProbabilityInfo(F, *LI));
	  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
	  OptSize = F.optForSize();

	  ProfileSummaryInfo *PSI =
	      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
	  if (ProfileGuidedSectionPrefix) {
	    if (PSI->isFunctionHotInCallGraph(&F, *BFI))
	      F.setSectionPrefix(".hot");
	    else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
	      F.setSectionPrefix(".unlikely");
	  }

	  // This optimization identifies DIV instructions that can be
	  // profitably bypassed and carried out with a shorter, faster divide.
	  if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI &&
	      TLI->isSlowDivBypassed()) {
	    const DenseMap<unsigned int, unsigned int> &BypassWidths =
	        TLI->getBypassSlowDivWidths();
	    BasicBlock *BB = &*F.begin();
	    while (BB != nullptr) {
	      // bypassSlowDivision may create new BBs, but we don't want to reapply the
	      // optimization to those blocks, so fetch the successor first.
	      BasicBlock *Next = BB->getNextNode();
	      EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
	      BB = Next;
	    }
	  }

	  // Eliminate blocks that contain only PHI nodes and an
	  // unconditional branch.
	  EverMadeChange |= eliminateMostlyEmptyBlocks(F);

	  // If llvm.dbg.value is far away from the value, iSel may not be able to
	  // handle it properly. iSel will drop llvm.dbg.value if it cannot
	  // find a node corresponding to the value.
	  EverMadeChange |= placeDbgValues(F);

	  if (!DisableBranchOpts)
	    EverMadeChange |= splitBranchCondition(F);

	  // Split some critical edges where one of the sources is an indirect branch,
	  // to help generate sane code for PHIs involving such edges.
	  EverMadeChange |= SplitIndirectBrCriticalEdges(F);

	  // Iterate the per-block optimizations until nothing changes any more.
	  bool MadeChange = true;
	  while (MadeChange) {
	    MadeChange = false;
	    SeenChainsForSExt.clear();
	    ValToSExtendedUses.clear();
	    RemovedInsts.clear();
	    for (Function::iterator I = F.begin(); I != F.end(); ) {
	      // Advance before optimizing so the iterator does not point at BB.
	      BasicBlock *BB = &*I++;
	      bool ModifiedDTOnIteration = false;
	      MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);

	      // Restart BB iteration if the dominator tree of the Function was changed
	      if (ModifiedDTOnIteration)
	        break;
	    }
	    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
	      MadeChange |= mergeSExts(F);

	    // Really free removed instructions during promotion.
	    for (Instruction *I : RemovedInsts)
	      I->deleteValue();

	    EverMadeChange |= MadeChange;
	  }

	  SunkAddrs.clear();

	  if (!DisableBranchOpts) {
	    MadeChange = false;
	    SmallPtrSet<BasicBlock *, 8> WorkList;
	    for (BasicBlock &BB : F) {
	      // Snapshot the successors before the terminator is possibly folded.
	      SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
	      MadeChange |= ConstantFoldTerminator(&BB, true);
	      if (!MadeChange) continue;

	      // Successors left without predecessors are dead.
	      for (BasicBlock *Succ : Successors)
	        if (pred_begin(Succ) == pred_end(Succ))
	          WorkList.insert(Succ);
	    }

	    // Delete the dead blocks and any of their dead successors.
	    MadeChange |= !WorkList.empty();
	    while (!WorkList.empty()) {
	      BasicBlock *BB = *WorkList.begin();
	      WorkList.erase(BB);
	      SmallVector<BasicBlock *, 2> Successors(succ_begin(BB), succ_end(BB));

	      DeleteDeadBlock(BB);

	      for (BasicBlock *Succ : Successors)
	        if (pred_begin(Succ) == pred_end(Succ))
	          WorkList.insert(Succ);
	    }

	    // Merge pairs of basic blocks with unconditional branches, connected by
	    // a single edge.
	    if (EverMadeChange || MadeChange)
	      MadeChange |= eliminateFallThrough(F);

	    EverMadeChange |= MadeChange;
	  }

	  if (!DisableGCOpts) {
	    // Collect first, then rewrite, so the rewrite does not run while the
	    // instruction lists are being iterated.
	    SmallVector<Instruction *, 2> Statepoints;
	    for (BasicBlock &BB : F)
	      for (Instruction &I : BB)
	        if (isStatepoint(I))
	          Statepoints.push_back(&I);
	    for (auto &I : Statepoints)
	      EverMadeChange |= simplifyOffsetableRelocate(*I);
	  }

	  return EverMadeChange;
	}

	/// Merge basic blocks which are connected by a single edge, where one of the
	/// basic blocks has a single successor pointing to the other basic block,
	/// which has a single predecessor.
	///
	/// \returns true if at least one pair of blocks was merged.
	bool CodeGenPrepare::eliminateFallThrough(Function &F) {
	  bool Changed = false;
	  // Scan all of the blocks in the function, except for the entry block.
	  for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
	    BasicBlock *BB = &*I++;
	    // If the destination block has a single pred, then this is a trivial
	    // edge, just collapse it.
	    BasicBlock *SinglePred = BB->getSinglePredecessor();

	    // Don't merge if BB's address is taken.
	    if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;

	    BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
	    if (Term && !Term->isConditional()) {
	      Changed = true;
	      DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
	      // Remember if SinglePred was the entry block of the function.
	      // If so, we will need to move BB back to the entry position.
	      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
	      MergeBasicBlockIntoOnlyPred(BB, nullptr);

	      if (isEntry && BB != &BB->getParent()->getEntryBlock())
	        BB->moveBefore(&BB->getParent()->getEntryBlock());

	      // We have erased a block. Update the iterator.
	      I = BB->getIterator();
	    }
	  }
	  return Changed;
	}

	/// Find a destination block from BB if BB is a mergeable empty block.
	///
	/// BB qualifies when it ends in an unconditional branch, everything before
	/// that branch is a PHI node or debug-info intrinsic, it does not branch to
	/// itself, and canMergeBlocks() accepts the pair.
	///
	/// \returns the branch target, or nullptr if BB does not qualify.
	BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
	  // If this block doesn't end with an uncond branch, ignore it.
	  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
	  if (!BI || !BI->isUnconditional())
	    return nullptr;

	  // If the instruction before the branch (skipping debug info) isn't a phi
	  // node, then other stuff is happening here.
	  BasicBlock::iterator BBI = BI->getIterator();
	  if (BBI != BB->begin()) {
	    --BBI;
	    while (isa<DbgInfoIntrinsic>(BBI)) {
	      if (BBI == BB->begin())
	        break;
	      --BBI;
	    }
	    if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
	      return nullptr;
	  }

	  // Do not break infinite loops.
	  BasicBlock *DestBB = BI->getSuccessor(0);
	  if (DestBB == BB)
	    return nullptr;

	  if (!canMergeBlocks(BB, DestBB))
	    DestBB = nullptr;

	  return DestBB;
	}

	/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
	/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
	/// edges in ways that are non-optimal for isel. Start by eliminating these
	/// blocks so we can split them the way we want them.
	///
	/// \returns true if at least one block was eliminated.
	bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
	  // Gather the preheaders of every loop, walking nested loops via a worklist.
	  SmallPtrSet<BasicBlock *, 16> Preheaders;
	  SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
	  while (!LoopList.empty()) {
	    Loop *L = LoopList.pop_back_val();
	    LoopList.insert(LoopList.end(), L->begin(), L->end());
	    if (BasicBlock *Preheader = L->getLoopPreheader())
	      Preheaders.insert(Preheader);
	  }

	  bool MadeChange = false;
	  // Note that this intentionally skips the entry block.
	  for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
	    // Advance first: BB may be erased below.
	    BasicBlock *BB = &*I++;
	    BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
	    if (!DestBB ||
	        !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
	      continue;

	    eliminateMostlyEmptyBlock(BB);
	    MadeChange = true;
	  }
	  return MadeChange;
	}

	/// Decide whether folding the empty block \p BB into \p DestBB is worthwhile.
	///
	/// \param BB          the mostly-empty block being considered.
	/// \param DestBB      the target of BB's unconditional branch.
	/// \param isPreheader true if BB is a loop preheader.
	/// \returns true if the merge should go ahead.
	bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
	                                                   BasicBlock *DestBB,
	                                                   bool isPreheader) {
	  // Do not delete loop preheaders if doing so would create a critical edge.
	  // Loop preheaders can be good locations to spill registers. If the
	  // preheader is deleted and we create a critical edge, registers may be
	  // spilled in the loop body instead.
	  if (!DisablePreheaderProtect && isPreheader &&
	      !(BB->getSinglePredecessor() &&
	        BB->getSinglePredecessor()->getSingleSuccessor()))
	    return false;

	  // Try to skip merging if the unique predecessor of BB is terminated by a
	  // switch or indirect branch instruction, and BB is used as an incoming block
	  // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
	  // add COPY instructions in the predecessor of BB instead of BB (if it is not
	  // merged). Note that the critical edge created by merging such blocks won't
	  // be split in MachineSink because the jump table is not analyzable. By
	  // keeping such empty block (BB), ISel will place COPY instructions in BB,
	  // not in the predecessor of BB.
	  BasicBlock *Pred = BB->getUniquePredecessor();
	  if (!Pred ||
	      !(isa<SwitchInst>(Pred->getTerminator()) ||
	        isa<IndirectBrInst>(Pred->getTerminator())))
	    return true;

	  if (BB->getTerminator() != BB->getFirstNonPHI())
	    return true;

	  // We use a simple cost heuristic which determines that skipping merging is
	  // profitable if the cost of skipping merging is less than the cost of
	  // merging : Cost(skipping merging) < Cost(merging BB), where the
	  // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
	  // the Cost(merging BB) is Freq(Pred) * Cost(Copy).
	  // Assuming Cost(Copy) == Cost(Branch), we could simplify it to :
	  //   Freq(Pred) / Freq(BB) > 2.
	  // Note that if there are multiple empty blocks sharing the same incoming
	  // value for the PHIs in the DestBB, we consider them together. In such
	  // case, Cost(merging BB) will be the sum of their frequencies.

	  if (!isa<PHINode>(DestBB->begin()))
	    return true;

	  SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;

	  // Find all other incoming blocks from which incoming values of all PHIs in
	  // DestBB are the same as the ones from BB.
	  for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
	       ++PI) {
	    BasicBlock *DestBBPred = *PI;
	    if (DestBBPred == BB)
	      continue;

	    if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
	          return DestPN.getIncomingValueForBlock(BB) ==
	                 DestPN.getIncomingValueForBlock(DestBBPred);
	        }))
	      SameIncomingValueBBs.insert(DestBBPred);
	  }

	  // See if all BB's incoming values are same as the value from Pred. In this
	  // case, no reason to skip merging because COPYs are expected to be placed in
	  // Pred already.
	  if (SameIncomingValueBBs.count(Pred))
	    return true;

	  BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
	  BlockFrequency BBFreq = BFI->getBlockFreq(BB);

	  // Sibling empty blocks that feed the same values are costed together.
	  for (auto SameValueBB : SameIncomingValueBBs)
	    if (SameValueBB->getUniquePredecessor() == Pred &&
	        DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
	      BBFreq += BFI->getBlockFreq(SameValueBB);

	  return PredFreq.getFrequency() <=
	         BBFreq.getFrequency() * FreqRatioToSkipMerge;
	}

	/// Return true if we can merge BB into DestBB if there is a single
	/// unconditional branch between them, and BB contains no other non-phi
	/// instructions.
	bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
	                                    const BasicBlock *DestBB) const {
	  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
	  // the successor. If there are more complex conditions (e.g. preheaders),
	  // don't mess around with them.
	  for (const PHINode &PN : BB->phis()) {
	    for (const User *U : PN.users()) {
	      const Instruction *UI = cast<Instruction>(U);
	      if (UI->getParent() != DestBB || !isa<PHINode>(UI))
	        return false;
	      // If User is inside DestBB block and it is a PHINode then check
	      // incoming value. If incoming value is not from BB then this is
	      // a complex condition (e.g. preheaders) we want to avoid here.
	      if (UI->getParent() == DestBB) {
	        if (const PHINode *UPN = dyn_cast<PHINode>(UI))
	          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
	            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
	            if (Insn && Insn->getParent() == BB &&
	                Insn->getParent() != UPN->getIncomingBlock(I))
	              return false;
	          }
	      }
	    }
	  }

	  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
	  // and DestBB may have conflicting incoming values for the block. If so, we
	  // can't merge the block.
	  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
	  if (!DestBBPN) return true;  // no conflict.

	  // Collect the preds of BB.
	  SmallPtrSet<const BasicBlock*, 16> BBPreds;
	  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
	    // It is faster to get preds from a PHI than with pred_iterator.
	    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
	      BBPreds.insert(BBPN->getIncomingBlock(i));
	  } else {
	    BBPreds.insert(pred_begin(BB), pred_end(BB));
	  }

	  // Walk the preds of DestBB.
	  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
	    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
	    if (BBPreds.count(Pred)) {   // Common predecessor?
	      for (const PHINode &PN : DestBB->phis()) {
	        const Value *V1 = PN.getIncomingValueForBlock(Pred);
	        const Value *V2 = PN.getIncomingValueForBlock(BB);

	        // If V2 is a phi node in BB, look up what the mapped value will be.
	        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
	          if (V2PN->getParent() == BB)
	            V2 = V2PN->getIncomingValueForBlock(Pred);

	        // If there is a conflict, bail out.
	        if (V1 != V2) return false;
	      }
	    }
	  }

	  return true;
	}

	/// Eliminate a basic block that has only phi's and an unconditional branch in
	/// it.
	///
	/// \param BB the block to remove; its terminator must be a BranchInst (the
	///           cast<> below asserts this).
	void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
	  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
	  BasicBlock *DestBB = BI->getSuccessor(0);

	  DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);

	  // If the destination block has a single pred, then this is a trivial edge,
	  // just collapse it.
	  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
	    if (SinglePred != DestBB) {
	      // Remember if SinglePred was the entry block of the function.  If so, we
	      // will need to move BB back to the entry position.
	      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
	      MergeBasicBlockIntoOnlyPred(DestBB, nullptr);

	      // NOTE(review): BB branches to DestBB, so BB is the SinglePred that was
	      // just merged. Confirm BB is still valid here before relying on this
	      // path; the caller in this file never passes the entry block, so
	      // isEntry is expected to be false.
	      if (isEntry && BB != &BB->getParent()->getEntryBlock())
	        BB->moveBefore(&BB->getParent()->getEntryBlock());

	      DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
	      return;
	    }
	  }

	  // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
	  // to handle the new incoming edges it is about to have.
	  for (PHINode &PN : DestBB->phis()) {
	    // Remove the incoming value for BB, and remember it.
	    Value *InVal = PN.removeIncomingValue(BB, false);

	    // Two options: either the InVal is a phi node defined in BB or it is some
	    // value that dominates BB.
	    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
	    if (InValPhi && InValPhi->getParent() == BB) {
	      // Add all of the input values of the input PHI as inputs of this phi.
	      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
	        PN.addIncoming(InValPhi->getIncomingValue(i),
	                       InValPhi->getIncomingBlock(i));
	    } else {
	      // Otherwise, add one instance of the dominating value for each edge that
	      // we will be adding.
	      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
	        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
	          PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
	      } else {
	        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
	          PN.addIncoming(InVal, *PI);
	      }
	    }
	  }

	  // The PHIs are now updated, change everything that refers to BB to use
	  // DestBB and remove BB.
	  BB->replaceAllUsesWith(DestBB);
	  BB->eraseFromParent();
	  ++NumBlocksElim;

	  DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
	}

	// Computes a map of base pointer relocation instructions to corresponding
	// derived pointer relocation instructions given a vector of all relocate calls.
	//
	// AllRelocateCalls: every gc.relocate hanging off one statepoint.
	// RelocateInstMap:  output; for each base relocate (base index == derived
	//                   index) the derived relocates that share its base index.
	//                   Derived relocates with no matching base relocate are
	//                   left out.
	static void computeBaseDerivedRelocateMap(
	    const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
	    DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
	        &RelocateInstMap) {
	  // Collect information in two maps: one primarily for locating the base object
	  // while filling the second map; the second map is the final structure holding
	  // a mapping between Base and corresponding Derived relocate calls
	  DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
	  for (auto *ThisRelocate : AllRelocateCalls) {
	    auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
	                            ThisRelocate->getDerivedPtrIndex());
	    RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
	  }
	  for (auto &Item : RelocateIdxMap) {
	    std::pair<unsigned, unsigned> Key = Item.first;
	    if (Key.first == Key.second)
	      // Base relocation: nothing to insert
	      continue;

	    GCRelocateInst *I = Item.second;
	    auto BaseKey = std::make_pair(Key.first, Key.first);

	    // We're iterating over RelocateIdxMap so we cannot modify it.
	    auto MaybeBase = RelocateIdxMap.find(BaseKey);
	    if (MaybeBase == RelocateIdxMap.end())
	      // TODO: We might want to insert a new base object relocate and gep off
	      // that, if there are enough derived object relocates.
	      continue;

	    RelocateInstMap[MaybeBase->second].push_back(I);
	  }
	}

	// Accepts a GEP and extracts the operands into a vector provided they're all
	// small integer constants.
	//
	// Returns false, leaving OffsetV untouched, if any index operand is not a
	// ConstantInt or has a zero-extended value greater than 20. Otherwise appends
	// every index operand (operand 1 onwards) to OffsetV and returns true.
	static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
	                                          SmallVectorImpl<Value *> &OffsetV) {
	  // Validate everything first so a rejected GEP leaves OffsetV unmodified.
	  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
	    // Only accept small constant integer operands
	    auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
	    if (!Op || Op->getZExtValue() > 20)
	      return false;
	  }

	  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
	    OffsetV.push_back(GEP->getOperand(i));
	  return true;
	}

	// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
	// replace, computes a replacement, and effects it.
	//
	// Each target that is a GEP off the same base with small constant indices is
	// replaced by an equivalent GEP off RelocatedBase and erased. Returns true if
	// at least one target was replaced.
	static bool
	simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
	                          const SmallVectorImpl<GCRelocateInst *> &Targets) {
	  bool MadeChange = false;
	  // We must ensure the relocation of derived pointer is defined after
	  // relocation of base pointer. If we find a relocation corresponding to base
	  // defined earlier than relocation of base then we move relocation of base
	  // right before found relocation. We consider only relocation in the same
	  // basic block as relocation of base. Relocations from other basic block will
	  // be skipped by optimization and we do not care about them.
	  for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
	       &*R != RelocatedBase; ++R)
	    if (auto RI = dyn_cast<GCRelocateInst>(R))
	      if (RI->getStatepoint() == RelocatedBase->getStatepoint())
	        if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
	          RelocatedBase->moveBefore(RI);
	          break;
	        }

	  for (GCRelocateInst *ToReplace : Targets) {
	    assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
	           "Not relocating a derived object of the original base object");
	    if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
	      // A duplicate relocate call. TODO: coalesce duplicates.
	      continue;
	    }

	    if (RelocatedBase->getParent() != ToReplace->getParent()) {
	      // Base and derived relocates are in different basic blocks.
	      // In this case transform is only valid when base dominates derived
	      // relocate. However it would be too expensive to check dominance
	      // for each such relocate, so we skip the whole transformation.
	      continue;
	    }

	    Value *Base = ToReplace->getBasePtr();
	    auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
	    if (!Derived || Derived->getPointerOperand() != Base)
	      continue;

	    SmallVector<Value *, 2> OffsetV;
	    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
	      continue;

	    // Create a Builder and replace the target callsite with a gep
	    assert(RelocatedBase->getNextNode() &&
	           "Should always have one since it's not a terminator");

	    // Insert after RelocatedBase
	    IRBuilder<> Builder(RelocatedBase->getNextNode());
	    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

	    // If gc_relocate does not match the actual type, cast it to the right type.
	    // In theory, there must be a bitcast after gc_relocate if the type does not
	    // match, and we should reuse it to get the derived pointer. But it could be
	    // cases like this:
	    // bb1:
	    //  ...
	    //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
	    //  br label %merge
	    //
	    // bb2:
	    //  ...
	    //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
	    //  br label %merge
	    //
	    // merge:
	    //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
	    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
	    //
	    // In this case, we cannot find the bitcast any more. So we insert a new
	    // bitcast whether or not one already exists. In this way, we can handle
	    // all cases, and the extra bitcast should be optimized away in later
	    // passes.
	    Value *ActualRelocatedBase = RelocatedBase;
	    if (RelocatedBase->getType() != Base->getType()) {
	      ActualRelocatedBase =
	          Builder.CreateBitCast(RelocatedBase, Base->getType());
	    }
	    Value *Replacement = Builder.CreateGEP(
	        Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
	    Replacement->takeName(ToReplace);
	    // If the newly generated derived pointer's type does not match the original
	    // derived pointer's type, cast the new derived pointer to match it. Same
	    // reasoning as above.
	    Value *ActualReplacement = Replacement;
	    if (Replacement->getType() != ToReplace->getType()) {
	      ActualReplacement =
	          Builder.CreateBitCast(Replacement, ToReplace->getType());
	    }
	    ToReplace->replaceAllUsesWith(ActualReplacement);
	    ToReplace->eraseFromParent();

	    MadeChange = true;
	  }
	  return MadeChange;
	}

	// Turns this:
	//
	// %base = ...
	// %ptr = gep %base + 15
	// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
	// %base' = relocate(%tok, i32 4, i32 4)
	// %ptr' = relocate(%tok, i32 4, i32 5)
	// %val = load %ptr'
	//
	// into this:
	//
	// %base = ...
	// %ptr = gep %base + 15
	// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
	// %base' = gc.relocate(%tok, i32 4, i32 4)
	// %ptr' = gep %base' + 15
	// %val = load %ptr'
	//
	// I is the statepoint instruction whose gc.relocate users are examined.
	// Returns true if any derived relocate was rewritten.
	bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
	  bool MadeChange = false;
	  SmallVector<GCRelocateInst *, 2> AllRelocateCalls;

	  for (auto *U : I.users())
	    if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
	      // Collect all the relocate calls associated with a statepoint
	      AllRelocateCalls.push_back(Relocate);

	  // We need at least one base pointer relocation + one derived pointer
	  // relocation to mangle
	  if (AllRelocateCalls.size() < 2)
	    return false;

	  // RelocateInstMap is a mapping from the base relocate instruction to the
	  // corresponding derived relocate instructions
	  DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
	  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
	  if (RelocateInstMap.empty())
	    return false;

	  for (auto &Item : RelocateInstMap)
	    // Item.first is the RelocatedBase to offset against
	    // Item.second is the vector of Targets to replace
	    // Accumulate with |=: a later base with nothing to rewrite must not
	    // hide a change made for an earlier base.
	    MadeChange |= simplifyRelocatesOffABase(Item.first, Item.second);
	  return MadeChange;
	}

	/// SinkCast - Sink the specified cast instruction into its user blocks.
	///
	/// For every user outside the cast's own block a copy of the cast is created
	/// at the first insertion point of the using block (one copy per block) and
	/// the use is redirected to it. If the original cast ends up unused it is
	/// erased.
	///
	/// \returns true if any use was rewritten or the cast was erased.
	static bool SinkCast(CastInst *CI) {
	  BasicBlock *DefBB = CI->getParent();

	  /// InsertedCasts - Only insert a cast in each block once.
	  DenseMap<BasicBlock*, CastInst*> InsertedCasts;

	  bool MadeChange = false;
	  for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
	       UI != E; ) {
	    Use &TheUse = UI.getUse();
	    Instruction *User = cast<Instruction>(*UI);

	    // Figure out which BB this cast is used in.  For PHI's this is the
	    // appropriate predecessor block.
	    BasicBlock *UserBB = User->getParent();
	    if (PHINode *PN = dyn_cast<PHINode>(User)) {
	      UserBB = PN->getIncomingBlock(TheUse);
	    }

	    // Preincrement use iterator so we don't invalidate it.
	    ++UI;

	    // The first insertion point of a block containing an EH pad is after the
	    // pad.  If the pad is the user, we cannot sink the cast past the pad.
	    if (User->isEHPad())
	      continue;

	    // If the block selected to receive the cast is an EH pad that does not
	    // allow non-PHI instructions before the terminator, we can't sink the
	    // cast.
	    if (UserBB->getTerminator()->isEHPad())
	      continue;

	    // If this user is in the same block as the cast, don't change the cast.
	    if (UserBB == DefBB) continue;

	    // If we have already inserted a cast into this block, use it.
	    CastInst *&InsertedCast = InsertedCasts[UserBB];

	    if (!InsertedCast) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
	                                      CI->getType(), "", &*InsertPt);
	    }

	    // Replace a use of the cast with a use of the new cast.
	    TheUse = InsertedCast;
	    MadeChange = true;
	    ++NumCastUses;
	  }

	  // If we removed all uses, nuke the cast.
	  if (CI->use_empty()) {
	    salvageDebugInfo(*CI);
	    CI->eraseFromParent();
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	/// Sink a cast into its user blocks when it is a no-op copy for the target
	/// (for example a pointer-to-pointer cast, or i32->i8 on PPC), which reduces
	/// the number of virtual registers that must be created and coalesced.
	///
	/// \returns true if any changes are made.
	static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
	                                       const DataLayout &DL) {
	  // Address-space casts qualify only when the target reports them as cheap.
	  // That is weaker than requiring a true no-op, but helps on some platforms.
	  auto *AddrCast = dyn_cast<AddrSpaceCastInst>(CI);
	  if (AddrCast && !TLI.isCheapAddrSpaceCast(AddrCast->getSrcAddressSpace(),
	                                            AddrCast->getDestAddressSpace()))
	    return false;

	  EVT FromVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
	  EVT ToVT = TLI.getValueType(DL, CI->getType());

	  // An fp<->int conversion is never a no-op, and neither is an extension
	  // (it would have to be a zero or sign extension).
	  if (FromVT.isInteger() != ToVT.isInteger() || FromVT.bitsLT(ToVT))
	    return false;

	  // Types that get integer-promoted are compared in their promoted form, so
	  // truncates on PPC count as no-op copies when they really are.
	  auto &Ctx = CI->getContext();
	  auto PromotedForm = [&](EVT VT) {
	    return TLI.getTypeAction(Ctx, VT) == TargetLowering::TypePromoteInteger
	               ? TLI.getTypeToTransformTo(Ctx, VT)
	               : VT;
	  };
	  FromVT = PromotedForm(FromVT);
	  ToVT = PromotedForm(ToVT);

	  // Same type after promotion means the cast is a no-op copy: sink it.
	  return FromVT == ToVT && SinkCast(CI);
	}

	/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if
	/// possible.
	///
	/// On success both CI and the matched add are replaced by extractvalue
	/// results of the intrinsic call and erased.
	///
	/// Return true if any changes were made.
	static bool CombineUAddWithOverflow(CmpInst *CI) {
	  Value *A, *B;
	  Instruction *AddI;
	  if (!match(CI,
	             m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI))))
	    return false;

	  Type *Ty = AddI->getType();
	  if (!isa<IntegerType>(Ty))
	    return false;

	  // We don't want to move around uses of condition values this late, so we
	  // check if it is legal to create the call to the intrinsic in the basic
	  // block containing the icmp:

	  if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse())
	    return false;

	#ifndef NDEBUG
	  // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption
	  // for now:
	  if (AddI->hasOneUse())
	    assert(*AddI->user_begin() == CI && "expected!");
	#endif

	  Module *M = CI->getModule();
	  Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);

	  // With a single use the add can be recreated at the compare; otherwise the
	  // intrinsic has to sit where the add was.
	  auto *InsertPt = AddI->hasOneUse() ? CI : AddI;

	  auto *UAddWithOverflow =
	      CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt);
	  auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt);
	  auto *Overflow =
	      ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt);

	  CI->replaceAllUsesWith(Overflow);
	  AddI->replaceAllUsesWith(UAdd);
	  CI->eraseFromParent();
	  AddI->eraseFromParent();
	  return true;
	}

	/// Sink the given CmpInst into user blocks to reduce the number of virtual
	/// registers that must be created and coalesced. This is a clear win except on
	/// targets with multiple condition code registers (PowerPC), where it might
	/// lose; some adjustment may be wanted there.
	///
	/// Return true if any changes are made.
	static bool SinkCmpExpression(CmpInst CI, const TargetLowering TLI) {
	BasicBlock *DefBB = CI->getParent();

	// Avoid sinking soft-FP comparisons, since this can move them into a loop.
	if (TLI && TLI->useSoftFloat() && isa<FCmpInst>(CI))
	return false;

	// Only insert a cmp in each block once.
	DenseMap<BasicBlock, CmpInst> InsertedCmps;

	bool MadeChange = false;
	for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
	UI != E; ) {
	Use &TheUse = UI.getUse();
	Instruction User = cast<Instruction>(UI);

	// Preincrement use iterator so we don't invalidate it.
	++UI;

	// Don't bother for PHI nodes.
	if (isa<PHINode>(User))
	continue;

	// Figure out which BB this cmp is used in.
	BasicBlock *UserBB = User->getParent();

	// If this user is in the same block as the cmp, don't change the cmp.
	if (UserBB == DefBB) continue;

	// If we have already inserted a cmp into this block, use it.
	CmpInst *&InsertedCmp = InsertedCmps[UserBB];

	if (!InsertedCmp) {
	BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	assert(InsertPt != UserBB->end());
	InsertedCmp =
	CmpInst::Create(CI->getOpcode(), CI->getPredicate(),
	CI->getOperand(0), CI->getOperand(1), "", &*InsertPt);
	// Propagate the debug info.
	InsertedCmp->setDebugLoc(CI->getDebugLoc());
	}

	// Replace a use of the cmp with a use of the new cmp.
	TheUse = InsertedCmp;
	MadeChange = true;
	++NumCmpUses;
	}

	// If we removed all uses, nuke the cmp.
	if (CI->use_empty()) {
	CI->eraseFromParent();
	MadeChange = true;
	}

	return MadeChange;
	}

	static bool OptimizeCmpExpression(CmpInst CI, const TargetLowering TLI) {
	if (SinkCmpExpression(CI, TLI))
	return true;

	if (CombineUAddWithOverflow(CI))
	return true;

	return false;
	}

	/// Duplicate and sink the given 'and' instruction into user blocks where it is
	/// used in a compare to allow isel to generate better code for targets where
	/// this operation can be combined.
	///
	/// \param AndI          the 'and' instruction to sink.
	/// \param TLI           queried for isMaskAndCmp0FoldingBeneficial().
	/// \param InsertedInsts instructions already created by this pass; only read
	///                      by the assertion below.
	///
	/// Return true if any changes are made.
	static bool sinkAndCmp0Expression(Instruction *AndI,
	                                  const TargetLowering &TLI,
	                                  SetOfInstrs &InsertedInsts) {
	  // Double-check that we're not trying to optimize an instruction that was
	  // already optimized by some other part of this pass.
	  assert(!InsertedInsts.count(AndI) &&
	         "Attempting to optimize already optimized and instruction");
	  (void) InsertedInsts;

	  // Nothing to do for single use in same basic block.
	  if (AndI->hasOneUse() &&
	      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
	    return false;

	  // Try to avoid cases where sinking/duplicating is likely to increase register
	  // pressure.
	  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
	      !isa<ConstantInt>(AndI->getOperand(1)) &&
	      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
	    return false;

	  for (auto *U : AndI->users()) {
	    Instruction *User = cast<Instruction>(U);

	    // Only sink for and mask feeding icmp with 0.
	    if (!isa<ICmpInst>(User))
	      return false;

	    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
	    if (!CmpC || !CmpC->isZero())
	      return false;
	  }

	  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
	    return false;

	  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
	  DEBUG(AndI->getParent()->dump());

	  // Push the 'and' into the same block as the icmp 0.  There should only be
	  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
	  // others, so we don't need to keep track of which BBs we insert into.
	  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
	       UI != E; ) {
	    Use &TheUse = UI.getUse();
	    Instruction *User = cast<Instruction>(*UI);

	    // Preincrement use iterator so we don't invalidate it.
	    ++UI;

	    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");

	    // Keep the 'and' in the same place if the use is already in the same block.
	    Instruction *InsertPt =
	        User->getParent() == AndI->getParent() ? AndI : User;
	    Instruction *InsertedAnd =
	        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
	                               AndI->getOperand(1), "", InsertPt);
	    // Propagate the debug info.
	    InsertedAnd->setDebugLoc(AndI->getDebugLoc());

	    // Replace a use of the 'and' with a use of the new 'and'.
	    TheUse = InsertedAnd;
	    ++NumAndUses;
	    DEBUG(User->getParent()->dump());
	  }

	  // We removed all uses, nuke the and.
	  AndI->eraseFromParent();
	  return true;
	}

	/// Check if the candidates could be combined with a shift instruction, which
	/// includes:
	/// 1. Truncate instruction
	/// 2. And instruction and the imm is a mask of the low bits:
	/// imm & (imm+1) == 0
	///
	/// \returns true if \p User is one of the two forms above.
	static bool isExtractBitsCandidateUse(Instruction *User) {
	  if (!isa<TruncInst>(User)) {
	    if (User->getOpcode() != Instruction::And ||
	        !isa<ConstantInt>(User->getOperand(1)))
	      return false;

	    const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();

	    // A low-bit mask plus one shares no set bits with the mask itself.
	    if ((Cimm & (Cimm + 1)).getBoolValue())
	      return false;
	  }
	  return true;
	}

	/// Sink both shift and truncate instruction to the use of truncate's BB.
	static bool
	SinkShiftAndTruncate(BinaryOperator ShiftI, Instruction User, ConstantInt *CI,
	DenseMap<BasicBlock , BinaryOperator > &InsertedShifts,
	const TargetLowering &TLI, const DataLayout &DL) {
	BasicBlock *UserBB = User->getParent();
	DenseMap<BasicBlock , CastInst > InsertedTruncs;
	TruncInst *TruncI = dyn_cast<TruncInst>(User);
	bool MadeChange = false;

	for (Value::user_iterator TruncUI = TruncI->user_begin(),
	TruncE = TruncI->user_end();
	TruncUI != TruncE;) {

	Use &TruncTheUse = TruncUI.getUse();
	Instruction TruncUser = cast<Instruction>(TruncUI);
	// Preincrement use iterator so we don't invalidate it.

	++TruncUI;

	int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
	if (!ISDOpcode)
	continue;

	// If the use is actually a legal node, there will not be an
	// implicit truncate.
	// FIXME: always querying the result type is just an
	// approximation; some nodes' legality is determined by the
	// operand or other means. There's no good way to find out though.
	if (TLI.isOperationLegalOrCustom(
	ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
	continue;

	// Don't bother for PHI nodes.
	if (isa<PHINode>(TruncUser))
	continue;

	BasicBlock *TruncUserBB = TruncUser->getParent();

	if (UserBB == TruncUserBB)
	continue;

	BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
	CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];

	if (!InsertedShift && !InsertedTrunc) {
	BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
	assert(InsertPt != TruncUserBB->end());
	// Sink the shift
	if (ShiftI->getOpcode() == Instruction::AShr)
	InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);
	else
	InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);

	// Sink the trunc
	BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
	TruncInsertPt++;
	assert(TruncInsertPt != TruncUserBB->end());

	InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
	TruncI->getType(), "", &*TruncInsertPt);

	MadeChange = true;

	TruncTheUse = InsertedTrunc;
	}
	}
	return MadeChange;
	}

	/// Sink the shift right instruction into user blocks if the uses could
	/// potentially be combined with this shift instruction and generate BitExtract
	/// instruction. It will only be applied if the architecture supports BitExtract
	/// instruction. Here is an example:
	/// BB1:
	/// %x.extract.shift = lshr i64 %arg1, 32
	/// BB2:
	/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
	/// ==>
	///
	/// BB2:
	/// %x.extract.shift.1 = lshr i64 %arg1, 32
	/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
	///
	/// CodeGen will recoginze the pattern in BB2 and generate BitExtract
	/// instruction.
	/// Return true if any changes are made.
	static bool OptimizeExtractBits(BinaryOperator ShiftI, ConstantInt CI,
	const TargetLowering &TLI,
	const DataLayout &DL) {
	BasicBlock *DefBB = ShiftI->getParent();

	/// Only insert instructions in each block once.
	DenseMap<BasicBlock , BinaryOperator > InsertedShifts;

	bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));

	bool MadeChange = false;
	for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
	UI != E;) {
	Use &TheUse = UI.getUse();
	Instruction User = cast<Instruction>(UI);
	// Preincrement use iterator so we don't invalidate it.
	++UI;

	// Don't bother for PHI nodes.
	if (isa<PHINode>(User))
	continue;

	if (!isExtractBitsCandidateUse(User))
	continue;

	BasicBlock *UserBB = User->getParent();

	if (UserBB == DefBB) {
	// If the shift and truncate instruction are in the same BB. The use of
	// the truncate(TruncUse) may still introduce another truncate if not
	// legal. In this case, we would like to sink both shift and truncate
	// instruction to the BB of TruncUse.
	// for example:
	// BB1:
	// i64 shift.result = lshr i64 opnd, imm
	// trunc.result = trunc shift.result to i16
	//
	// BB2:
	// ----> We will have an implicit truncate here if the architecture does
	// not have i16 compare.
	// cmp i16 trunc.result, opnd2
	//
	if (isa<TruncInst>(User) && shiftIsLegal
	// If the type of the truncate is legal, no trucate will be
	// introduced in other basic blocks.
	&&
	(!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
	MadeChange =
	SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);

	continue;
	}
	// If we have already inserted a shift into this block, use it.
	BinaryOperator *&InsertedShift = InsertedShifts[UserBB];

	if (!InsertedShift) {
	BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	assert(InsertPt != UserBB->end());

	if (ShiftI->getOpcode() == Instruction::AShr)
	InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);
	else
	InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);

	MadeChange = true;
	}

	// Replace a use of the shift with a use of the new shift.
	TheUse = InsertedShift;
	}

	// If we removed all uses, nuke the shift.
	if (ShiftI->use_empty())
	ShiftI->eraseFromParent();

	return MadeChange;
	}

	/// Guard a count-leading/trailing-zeros intrinsic whose zero-input result is
	/// defined with an explicit test for zero, so that the intrinsic itself is only
	/// reached for non-zero inputs. This is done when the target does not report
	/// the operation as cheap to speculate.
	///
	/// The rewrite turns:
	///   %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
	///
	/// into:
	///   entry:
	///     %cmpz = icmp eq i64 %A, 0
	///     br i1 %cmpz, label %cond.end, label %cond.false
	///   cond.false:
	///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
	///     br label %cond.end
	///   cond.end:
	///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
	///
	/// When the rewrite happens, true is returned and \p ModifiedDT is set to true.
	static bool despeculateCountZeros(IntrinsicInst *CountZeros,
	                                  const TargetLowering *TLI,
	                                  const DataLayout *DL,
	                                  bool &ModifiedDT) {
	  // Both the lowering information and the data layout are required.
	  if (!TLI)
	    return false;
	  if (!DL)
	    return false;

	  // A call that already declares a zero input undefined is left alone.
	  if (match(CountZeros->getOperand(1), m_One()))
	    return false;

	  // Nothing to gain when the target says speculation is cheap.
	  const auto ID = CountZeros->getIntrinsicID();
	  const bool CheapToSpeculate =
	      ID == Intrinsic::cttz
	          ? TLI->isCheapToSpeculateCttz()
	          : (ID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz());
	  if (CheapToSpeculate)
	    return false;

	  // Restrict the transform to scalars that are no wider than the largest
	  // legal integer type.
	  Type *ResultTy = CountZeros->getType();
	  if (ResultTy->isVectorTy())
	    return false;
	  const unsigned NumBits = ResultTy->getPrimitiveSizeInBits();
	  if (NumBits > DL->getLargestLegalIntTypeSizeInBits())
	    return false;

	  // First split: the intrinsic becomes the first instruction of "cond.false".
	  BasicBlock *EntryBB = CountZeros->getParent();
	  BasicBlock *CallBB = EntryBB->splitBasicBlock(CountZeros, "cond.false");

	  // Second split, right behind the intrinsic: everything after it moves to
	  // "cond.end", which will receive the merging PHI.
	  BasicBlock::iterator AfterCall(CountZeros);
	  ++AfterCall;
	  BasicBlock *JoinBB = CallBB->splitBasicBlock(AfterCall, "cond.end");

	  // The first split left an unconditional branch at the end of the entry
	  // block. Emit the zero test and the conditional branch in front of it, then
	  // drop the old branch.
	  Instruction *FallThroughBr = EntryBB->getTerminator();
	  IRBuilder<> Builder(CountZeros->getContext());
	  Builder.SetInsertPoint(FallThroughBr);
	  Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());

	  Value *Input = CountZeros->getOperand(0);
	  Value *IsZero =
	      Builder.CreateICmpEQ(Input, Constant::getNullValue(ResultTy), "cmpz");
	  Builder.CreateCondBr(IsZero, JoinBB, CallBB);
	  FallThroughBr->eraseFromParent();

	  // Merge the two paths: the bit width when the input was zero, the
	  // intrinsic's result otherwise. Users are redirected to the PHI before the
	  // intrinsic is added as an incoming value.
	  Builder.SetInsertPoint(&JoinBB->front());
	  PHINode *Merge = Builder.CreatePHI(ResultTy, 2, "ctz");
	  CountZeros->replaceAllUsesWith(Merge);
	  Merge->addIncoming(Builder.getInt(APInt(NumBits, NumBits)), EntryBB);
	  Merge->addIncoming(CountZeros, CallBB);

	  // The zero case is now handled explicitly, so flip the intrinsic's second
	  // argument to 'true'. That also makes the m_One() check above reject this
	  // call if it is visited again.
	  CountZeros->setArgOperand(1, Builder.getTrue());
	  ModifiedDT = true;
	  return true;
	}

	/// Apply the call-specific preparations to \p CI, in this order:
	///   1. expand or optimize inline assembly via the target,
	///   2. raise the alignment of pointer arguments (allocas, globals, memory
	///      intrinsics) when the target asks for it,
	///   3. for cold call sites, run the memory-instruction optimization on the
	///      first pointer argument and return its result,
	///   4. handle selected intrinsics (objectsize, aarch64 stlxr/stxr,
	///      invariant.group.barrier, cttz/ctlz, and target addressing-mode
	///      intrinsics),
	///   5. simplify fortified library calls (_chk functions).
	///
	/// Returns true if the IR was changed. \p ModifiedDT is forwarded to
	/// despeculateCountZeros for cttz/ctlz.
	bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
	  BasicBlock *BB = CI->getParent();

	  // Lower inline assembly if we can.
	  // If we found an inline asm expression, and if the target knows how to
	  // lower it to normal LLVM code, do so now.
	  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
	    if (TLI->ExpandInlineAsm(CI)) {
	      // Avoid invalidating the iterator.
	      CurInstIterator = BB->begin();
	      // Avoid processing instructions out of order, which could cause
	      // reuse before a value is defined.
	      SunkAddrs.clear();
	      return true;
	    }
	    // Sink address computing for memory operands into the block.
	    if (optimizeInlineAsmInst(CI))
	      return true;
	  }

	  // Align the pointer arguments to this call if the target thinks it's a good
	  // idea
	  unsigned MinSize, PrefAlign;
	  if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
	    for (auto &Arg : CI->arg_operands()) {
	      // We want to align both objects whose address is used directly and
	      // objects whose address is used in casts and GEPs, though it only makes
	      // sense for GEPs if the offset is a multiple of the desired alignment and
	      // if size - offset meets the size threshold.
	      if (!Arg->getType()->isPointerTy())
	        continue;
	      APInt Offset(DL->getPointerSizeInBits(
	                       cast<PointerType>(Arg->getType())->getAddressSpace()),
	                   0);
	      Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
	      uint64_t Offset2 = Offset.getLimitedValue();
	      if ((Offset2 & (PrefAlign-1)) != 0)
	        continue;
	      AllocaInst *AI;
	      if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
	          DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
	        AI->setAlignment(PrefAlign);
	      // Global variables can only be aligned if they are defined in this
	      // object (i.e. they are uniquely initialized in this object), and
	      // over-aligning global variables that have an explicit section is
	      // forbidden.
	      GlobalVariable *GV;
	      if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
	          GV->getPointerAlignment(*DL) < PrefAlign &&
	          DL->getTypeAllocSize(GV->getValueType()) >=
	              MinSize + Offset2)
	        GV->setAlignment(PrefAlign);
	    }
	    // If this is a memcpy (or similar) then we may be able to improve the
	    // alignment
	    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
	      unsigned Align = getKnownAlignment(MI->getDest(), *DL);
	      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
	        Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
	      if (Align > MI->getAlignment())
	        MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
	    }
	  }

	  // If we have a cold call site, try to sink addressing computation into the
	  // cold block.  This interacts with our handling for loads and stores to
	  // ensure that we can fold all uses of a potential addressing computation
	  // into their uses.  TODO: generalize this to work over profiling data
	  //
	  // Note that the loop returns on the first pointer argument it encounters,
	  // whatever optimizeMemoryInst reports for it.
	  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
	    for (auto &Arg : CI->arg_operands()) {
	      if (!Arg->getType()->isPointerTy())
	        continue;
	      unsigned AS = Arg->getType()->getPointerAddressSpace();
	      return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
	    }

	  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
	  if (II) {
	    switch (II->getIntrinsicID()) {
	    default: break;
	    case Intrinsic::objectsize: {
	      // Lower all uses of llvm.objectsize.*
	      ConstantInt *RetVal =
	          lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);
	      // Substituting this can cause recursive simplifications, which can
	      // invalidate our iterator.  Use a WeakTrackingVH to hold onto it in case
	      // this happens.
	      Value *CurValue = &*CurInstIterator;
	      WeakTrackingVH IterHandle(CurValue);

	      replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);

	      // If the iterator instruction was recursively deleted, start over at the
	      // start of the block.
	      if (IterHandle != CurValue) {
	        CurInstIterator = BB->begin();
	        SunkAddrs.clear();
	      }
	      return true;
	    }
	    case Intrinsic::aarch64_stlxr:
	    case Intrinsic::aarch64_stxr: {
	      ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
	      if (!ExtVal || !ExtVal->hasOneUse() ||
	          ExtVal->getParent() == CI->getParent())
	        return false;
	      // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
	      ExtVal->moveBefore(CI);
	      // Mark this instruction as "inserted by CGP", so that other
	      // optimizations don't touch it.
	      InsertedInsts.insert(ExtVal);
	      return true;
	    }
	    case Intrinsic::invariant_group_barrier:
	      II->replaceAllUsesWith(II->getArgOperand(0));
	      II->eraseFromParent();
	      return true;

	    case Intrinsic::cttz:
	    case Intrinsic::ctlz:
	      // If counting zeros is expensive, try to avoid it.
	      return despeculateCountZeros(II, TLI, DL, ModifiedDT);
	    }

	    if (TLI) {
	      SmallVector<Value*, 2> PtrOps;
	      Type *AccessTy;
	      if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
	        while (!PtrOps.empty()) {
	          Value *PtrVal = PtrOps.pop_back_val();
	          unsigned AS = PtrVal->getType()->getPointerAddressSpace();
	          if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
	            return true;
	        }
	    }
	  }

	  // From here on out we're working with named functions.
	  if (!CI->getCalledFunction()) return false;

	  // Lower all default uses of _chk calls.  This is very similar
	  // to what InstCombineCalls does, but here we are only lowering calls
	  // to fortified library functions (e.g. __memcpy_chk) that have the default
	  // "don't know" as the objectsize.  Anything else should be left alone.
	  FortifiedLibCallSimplifier Simplifier(TLInfo, true);
	  if (Value *V = Simplifier.optimizeCall(CI)) {
	    CI->replaceAllUsesWith(V);
	    CI->eraseFromParent();
	    return true;
	  }

	  return false;
	}

	/// Look for opportunities to duplicate return instructions to the predecessor
	/// to enable tail call optimizations. The case it is currently looking for is:
	/// @code
	/// bb0:
	///   %tmp0 = tail call i32 @f0()
	///   br label %return
	/// bb1:
	///   %tmp1 = tail call i32 @f1()
	///   br label %return
	/// bb2:
	///   %tmp2 = tail call i32 @f2()
	///   br label %return
	/// return:
	///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
	///   ret i32 %retval
	/// @endcode
	///
	/// =>
	///
	/// @code
	/// bb0:
	///   %tmp0 = tail call i32 @f0()
	///   ret i32 %tmp0
	/// bb1:
	///   %tmp1 = tail call i32 @f1()
	///   ret i32 %tmp1
	/// bb2:
	///   %tmp2 = tail call i32 @f2()
	///   ret i32 %tmp2
	/// @endcode
	///
	/// Returns true if at least one return was duplicated; in that case the
	/// ModifiedDT member is set as well, and \p BB is erased when it ends up with
	/// no predecessors and its address is not taken.
	bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
	  if (!TLI)
	    return false;

	  ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
	  if (!RetI)
	    return false;

	  // A returned value must be a PHI, optionally looked at through one bitcast.
	  PHINode *PN = nullptr;
	  BitCastInst *BCI = nullptr;
	  Value *V = RetI->getReturnValue();
	  if (V) {
	    BCI = dyn_cast<BitCastInst>(V);
	    if (BCI)
	      V = BCI->getOperand(0);

	    PN = dyn_cast<PHINode>(V);
	    if (!PN)
	      return false;
	  }

	  if (PN && PN->getParent() != BB)
	    return false;

	  // Make sure there are no instructions between the PHI and return, or that the
	  // return is the first instruction in the block.
	  if (PN) {
	    BasicBlock::iterator BI = BB->begin();
	    do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
	    if (&*BI == BCI)
	      // Also skip over the bitcast.
	      ++BI;
	    if (&*BI != RetI)
	      return false;
	  } else {
	    BasicBlock::iterator BI = BB->begin();
	    while (isa<DbgInfoIntrinsic>(BI)) ++BI;
	    if (&*BI != RetI)
	      return false;
	  }

	  /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
	  /// call.
	  const Function *F = BB->getParent();
	  SmallVector<CallInst*, 4> TailCalls;
	  if (PN) {
	    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
	      CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I));
	      // Make sure the phi value is indeed produced by the tail call.
	      if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
	          TLI->mayBeEmittedAsTailCall(CI) &&
	          attributesPermitTailCall(F, CI, RetI, *TLI))
	        TailCalls.push_back(CI);
	    }
	  } else {
	    // Void return: inspect the instruction in front of each distinct
	    // predecessor's terminator, skipping debug intrinsics.
	    SmallPtrSet<BasicBlock*, 4> VisitedBBs;
	    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
	      if (!VisitedBBs.insert(*PI).second)
	        continue;

	      BasicBlock::InstListType &InstList = (*PI)->getInstList();
	      BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
	      BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
	      do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
	      if (RI == RE)
	        continue;

	      CallInst *CI = dyn_cast<CallInst>(&*RI);
	      if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
	          attributesPermitTailCall(F, CI, RetI, *TLI))
	        TailCalls.push_back(CI);
	    }
	  }

	  bool Changed = false;
	  for (CallInst *CI : TailCalls) {
	    // A former check here compared the call's return attributes against
	    // themselves, so it could never reject a candidate. It has been dropped;
	    // every candidate was already filtered through attributesPermitTailCall
	    // when it was collected above.

	    // Make sure the call instruction is followed by an unconditional branch to
	    // the return block.
	    BasicBlock *CallBB = CI->getParent();
	    BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
	    if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
	      continue;

	    // Duplicate the return into CallBB.
	    (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB);
	    ModifiedDT = Changed = true;
	    ++NumRetsDup;
	  }

	  // If we eliminated all predecessors of the block, delete the block now.
	  if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
	    BB->eraseFromParent();

	  return Changed;
	}

	//===----------------------------------------------------------------------===//
	// Memory Optimization
	//===----------------------------------------------------------------------===//

	namespace {

	/// This is an extended version of TargetLowering::AddrMode
	/// which holds actual Value*'s for register values.
	struct ExtAddrMode : public TargetLowering::AddrMode {
	Value *BaseReg = nullptr;
	Value *ScaledReg = nullptr;
	Value *OriginalValue = nullptr;

	enum FieldName {
	NoField = 0x00,
	BaseRegField = 0x01,
	BaseGVField = 0x02,
	BaseOffsField = 0x04,
	ScaledRegField = 0x08,
	ScaleField = 0x10,
	MultipleFields = 0xff
	};

	ExtAddrMode() = default;

	void print(raw_ostream &OS) const;
	void dump() const;

	FieldName compare(const ExtAddrMode &other) {
	// First check that the types are the same on each field, as differing types
	// is something we can't cope with later on.
	if (BaseReg && other.BaseReg &&
	BaseReg->getType() != other.BaseReg->getType())
	return MultipleFields;
	if (BaseGV && other.BaseGV &&
	BaseGV->getType() != other.BaseGV->getType())
	return MultipleFields;
	if (ScaledReg && other.ScaledReg &&
	ScaledReg->getType() != other.ScaledReg->getType())
	return MultipleFields;

	// Check each field to see if it differs.
	unsigned Result = NoField;
	if (BaseReg != other.BaseReg)
	Result \|= BaseRegField;
	if (BaseGV != other.BaseGV)
	Result \|= BaseGVField;
	if (BaseOffs != other.BaseOffs)
	Result \|= BaseOffsField;
	if (ScaledReg != other.ScaledReg)
	Result \|= ScaledRegField;
	// Don't count 0 as being a different scale, because that actually means
	// unscaled (which will already be counted by having no ScaledReg).
	if (Scale && other.Scale && Scale != other.Scale)
	Result \|= ScaleField;

	if (countPopulation(Result) > 1)
	return MultipleFields;
	else
	return static_cast<FieldName>(Result);
	}

	// An AddrMode is trivial if it involves no calculation i.e. it is just a base
	// with no offset.
	bool isTrivial() {
	// An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
	// trivial if at most one of these terms is nonzero, except that BaseGV and
	// BaseReg both being zero actually means a null pointer value, which we
	// consider to be 'non-zero' here.
	return !BaseOffs && !Scale && !(BaseGV && BaseReg);
	}

	Value GetFieldAsValue(FieldName Field, Type IntPtrTy) {
	switch (Field) {
	default:
	return nullptr;
	case BaseRegField:
	return BaseReg;
	case BaseGVField:
	return BaseGV;
	case ScaledRegField:
	return ScaledReg;
	case BaseOffsField:
	return ConstantInt::get(IntPtrTy, BaseOffs);
	}
	}

	void SetCombinedField(FieldName Field, Value *V,
	const SmallVectorImpl<ExtAddrMode> &AddrModes) {
	switch (Field) {
	default:
	llvm_unreachable("Unhandled fields are expected to be rejected earlier");
	break;
	case ExtAddrMode::BaseRegField:
	BaseReg = V;
	break;
	case ExtAddrMode::BaseGVField:
	// A combined BaseGV is an Instruction, not a GlobalValue, so it goes
	// in the BaseReg field.
	assert(BaseReg == nullptr);
	BaseReg = V;
	BaseGV = nullptr;
	break;
	case ExtAddrMode::ScaledRegField:
	ScaledReg = V;
	// If we have a mix of scaled and unscaled addrmodes then we want scale
	// to be the scale and not zero.
	if (!Scale)
	for (const ExtAddrMode &AM : AddrModes)
	if (AM.Scale) {
	Scale = AM.Scale;
	break;
	}
	break;
	case ExtAddrMode::BaseOffsField:
	// The offset is no longer a constant, so it goes in ScaledReg with a
	// scale of 1.
	assert(ScaledReg == nullptr);
	ScaledReg = V;
	Scale = 1;
	BaseOffs = 0;
	break;
	}
	}
	};

	} // end anonymous namespace

	#ifndef NDEBUG
	/// Stream an ExtAddrMode by delegating to its print() method.
	static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
	  return AM.print(OS), OS;
	}
	#endif

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	/// Write the addressing mode to \p OS as a bracketed sum of its non-zero
	/// components, in the order global base, constant offset, base register,
	/// scaled register.
	void ExtAddrMode::print(raw_ostream &OS) const {
	  // Tracks whether a component has been printed, so later ones get " + ".
	  bool NeedPlus = false;
	  OS << "[";
	  if (BaseGV) {
	    OS << (NeedPlus ? " + " : "")
	       << "GV:";
	    BaseGV->printAsOperand(OS, /*PrintType=*/false);
	    NeedPlus = true;
	  }

	  if (BaseOffs) {
	    OS << (NeedPlus ? " + " : "")
	       << BaseOffs;
	    NeedPlus = true;
	  }

	  if (BaseReg) {
	    OS << (NeedPlus ? " + " : "")
	       << "Base:";
	    BaseReg->printAsOperand(OS, /*PrintType=*/false);
	    NeedPlus = true;
	  }
	  if (Scale) {
	    OS << (NeedPlus ? " + " : "")
	       << Scale << "*";
	    ScaledReg->printAsOperand(OS, /*PrintType=*/false);
	  }

	  OS << ']';
	}

	/// Print the addressing mode, followed by a newline, to the debug stream.
	LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
	  raw_ostream &DebugOS = dbgs();
	  print(DebugOS);
	  DebugOS << '\n';
	}
	#endif

	namespace {

	/// \brief This class provides transaction based operation on the IR.
	/// Every change made through this class is recorded in the internal state and
	/// can be undone (rollback) until commit is called.
	class TypePromotionTransaction {
	/// \brief This represents the common interface of the individual transaction.
	/// Each class implements the logic for doing one specific modification on
	/// the IR via the TypePromotionTransaction.
	///
	/// Subclasses perform their modification in their constructor and must
	/// implement undo() to revert it; commit() may be overridden when keeping
	/// the change requires extra work.
	class TypePromotionAction {
	protected:
	/// The Instruction modified.
	Instruction *Inst;

	public:
	/// \brief Constructor of the action.
	/// The constructor performs the related action on the IR.
	/// (This base constructor itself only records \p Inst.)
	TypePromotionAction(Instruction *Inst) : Inst(Inst) {}

	virtual ~TypePromotionAction() = default;

	/// \brief Undo the modification done by this action.
	/// When this method is called, the IR must be in the same state as it was
	/// before this action was applied.
	/// \pre Undoing the action works if and only if the IR is in the exact same
	/// state as it was directly after this action was applied.
	virtual void undo() = 0;

	/// \brief Commit every change made by this action.
	/// When the results on the IR of the action are to be kept, it is important
	/// to call this function, otherwise hidden information may be kept forever.
	virtual void commit() {
	// Nothing to be done by default: the base action holds no hidden state.
	}
	};

	/// \brief Utility to remember the position of an instruction.
	class InsertionHandler {
	/// Position of an instruction.
	/// Either an instruction:
	/// - Is the first in a basic block: BB is used.
	/// - Has a previous instructon: PrevInst is used.
	union {
	Instruction *PrevInst;
	BasicBlock *BB;
	} Point;

	/// Remember whether or not the instruction had a previous instruction.
	bool HasPrevInstruction;

	public:
	/// \brief Record the position of \p Inst.
	InsertionHandler(Instruction *Inst) {
	BasicBlock::iterator It = Inst->getIterator();
	HasPrevInstruction = (It != (Inst->getParent()->begin()));
	if (HasPrevInstruction)
	Point.PrevInst = &*--It;
	else
	Point.BB = Inst->getParent();
	}

	/// \brief Insert \p Inst at the recorded position.
	void insert(Instruction *Inst) {
	if (HasPrevInstruction) {
	if (Inst->getParent())
	Inst->removeFromParent();
	Inst->insertAfter(Point.PrevInst);
	} else {
	Instruction Position = &Point.BB->getFirstInsertionPt();
	if (Inst->getParent())
	Inst->moveBefore(Position);
	else
	Inst->insertBefore(Position);
	}
	}
	};

	/// \brief Move an instruction before another.
	class InstructionMoveBefore : public TypePromotionAction {
	/// Original position of the instruction.
	InsertionHandler Position;

	public:
	/// \brief Move \p Inst before \p Before.
	InstructionMoveBefore(Instruction Inst, Instruction Before)
	: TypePromotionAction(Inst), Position(Inst) {
	DEBUG(dbgs() << "Do: move: " << Inst << "\nbefore: " << Before << "\n");
	Inst->moveBefore(Before);
	}

	/// \brief Move the instruction back to its original position.
	void undo() override {
	DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
	Position.insert(Inst);
	}
	};

	/// \brief Set the operand of an instruction with a new value.
	class OperandSetter : public TypePromotionAction {
	/// Original operand of the instruction.
	Value *Origin;

	/// Index of the modified instruction.
	unsigned Idx;

	public:
	/// \brief Set \p Idx operand of \p Inst with \p NewVal.
	OperandSetter(Instruction Inst, unsigned Idx, Value NewVal)
	: TypePromotionAction(Inst), Idx(Idx) {
	DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
	<< "for:" << *Inst << "\n"
	<< "with:" << *NewVal << "\n");
	Origin = Inst->getOperand(Idx);
	Inst->setOperand(Idx, NewVal);
	}

	/// \brief Restore the original value of the instruction.
	void undo() override {
	DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
	<< "for: " << *Inst << "\n"
	<< "with: " << *Origin << "\n");
	Inst->setOperand(Idx, Origin);
	}
	};

	/// \brief Detach an instruction from all of its operands.
	/// After construction the instruction no longer appears among the users of
	/// its former operands; each operand slot holds an undef of the same type.
	class OperandsHider : public TypePromotionAction {
	  /// The operands the instruction had before they were hidden, in operand
	  /// order.
	  SmallVector<Value *, 4> OriginalValues;

	public:
	  /// \brief Remove \p Inst from the uses of the operands of \p Inst.
	  OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
	    DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
	    const unsigned OperandCount = Inst->getNumOperands();
	    OriginalValues.reserve(OperandCount);
	    for (unsigned OpIdx = 0; OpIdx != OperandCount; ++OpIdx) {
	      // Remember the real operand ...
	      Value *Saved = Inst->getOperand(OpIdx);
	      OriginalValues.push_back(Saved);
	      // ... and park a placeholder in its slot. OperandSetter would work
	      // too, but its bookkeeping is more overhead than we want here.
	      Value *Placeholder = UndefValue::get(Saved->getType());
	      Inst->setOperand(OpIdx, Placeholder);
	    }
	  }

	  /// \brief Put every saved operand back into its slot.
	  void undo() override {
	    DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
	    const unsigned SavedCount = OriginalValues.size();
	    for (unsigned OpIdx = 0; OpIdx != SavedCount; ++OpIdx)
	      Inst->setOperand(OpIdx, OriginalValues[OpIdx]);
	  }
	};

	/// \brief Build a truncate instruction.
	class TruncBuilder : public TypePromotionAction {
	Value *Val;

	public:
	/// \brief Build a truncate instruction of \p Opnd producing a \p Ty
	/// result.
	/// trunc Opnd to Ty.
	TruncBuilder(Instruction Opnd, Type Ty) : TypePromotionAction(Opnd) {
	IRBuilder<> Builder(Opnd);
	Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
	DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
	}

	/// \brief Get the built value.
	Value *getBuiltValue() { return Val; }

	/// \brief Remove the built instruction.
	void undo() override {
	DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
	if (Instruction *IVal = dyn_cast<Instruction>(Val))
	IVal->eraseFromParent();
	}
	};

	/// \brief Build a sign extension instruction.
	class SExtBuilder : public TypePromotionAction {
	Value *Val;

	public:
	/// \brief Build a sign extension instruction of \p Opnd producing a \p Ty
	/// result.
	/// sext Opnd to Ty.
	SExtBuilder(Instruction InsertPt, Value Opnd, Type *Ty)
	: TypePromotionAction(InsertPt) {
	IRBuilder<> Builder(InsertPt);
	Val = Builder.CreateSExt(Opnd, Ty, "promoted");
	DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
	}

	/// \brief Get the built value.
	Value *getBuiltValue() { return Val; }

	/// \brief Remove the built instruction.
	void undo() override {
	DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
	if (Instruction *IVal = dyn_cast<Instruction>(Val))
	IVal->eraseFromParent();
	}
	};

	/// \brief Build a zero extension instruction.
	class ZExtBuilder : public TypePromotionAction {
	Value *Val;

	public:
	/// \brief Build a zero extension instruction of \p Opnd producing a \p Ty
	/// result.
	/// zext Opnd to Ty.
	ZExtBuilder(Instruction InsertPt, Value Opnd, Type *Ty)
	: TypePromotionAction(InsertPt) {
	IRBuilder<> Builder(InsertPt);
	Val = Builder.CreateZExt(Opnd, Ty, "promoted");
	DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
	}

	/// \brief Get the built value.
	Value *getBuiltValue() { return Val; }

	/// \brief Remove the built instruction.
	void undo() override {
	DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
	if (Instruction *IVal = dyn_cast<Instruction>(Val))
	IVal->eraseFromParent();
	}
	};

	/// \brief Mutate an instruction to another type.
	class TypeMutator : public TypePromotionAction {
	/// Record the original type.
	Type *OrigTy;

	public:
	/// \brief Mutate the type of \p Inst into \p NewTy.
	TypeMutator(Instruction Inst, Type NewTy)
	: TypePromotionAction(Inst), OrigTy(Inst->getType()) {
	DEBUG(dbgs() << "Do: MutateType: " << Inst << " with " << NewTy
	<< "\n");
	Inst->mutateType(NewTy);
	}

	/// \brief Mutate the instruction back to its original type.
	void undo() override {
	DEBUG(dbgs() << "Undo: MutateType: " << Inst << " with " << OrigTy
	<< "\n");
	Inst->mutateType(OrigTy);
	}
	};

	/// \brief Replace the uses of an instruction by another instruction.
	class UsesReplacer : public TypePromotionAction {
	/// Helper structure to keep track of the replaced uses.
	struct InstructionAndIdx {
	/// The instruction using the instruction.
	Instruction *Inst;

	/// The index where this instruction is used for Inst.
	unsigned Idx;

	InstructionAndIdx(Instruction *Inst, unsigned Idx)
	: Inst(Inst), Idx(Idx) {}
	};

	/// Keep track of the original uses (pair Instruction, Index).
	SmallVector<InstructionAndIdx, 4> OriginalUses;

	using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;

	public:
	/// \brief Replace all the use of \p Inst by \p New.
	UsesReplacer(Instruction Inst, Value New) : TypePromotionAction(Inst) {
	DEBUG(dbgs() << "Do: UsersReplacer: " << Inst << " with " << New
	<< "\n");
	// Record the original uses.
	for (Use &U : Inst->uses()) {
	Instruction *UserI = cast<Instruction>(U.getUser());
	OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
	}
	// Now, we can replace the uses.
	Inst->replaceAllUsesWith(New);
	}

	/// \brief Reassign the original uses of Inst to Inst.
	void undo() override {
	DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
	for (use_iterator UseIt = OriginalUses.begin(),
	EndIt = OriginalUses.end();
	UseIt != EndIt; ++UseIt) {
	UseIt->Inst->setOperand(UseIt->Idx, Inst);
	}
	}
	};

	/// \brief Take an instruction out of the IR in a way that can be reverted.
	class InstructionRemover : public TypePromotionAction {
	  /// Where the instruction used to live.
	  InsertionHandler Inserter;

	  /// Detaches the instruction from its operands, so that it behaves as if
	  /// it had been deleted.
	  OperandsHider Hider;

	  /// Records the replaced uses; stays null when no replacement was given.
	  UsesReplacer *Replacer = nullptr;

	  /// The set of instructions removed so far, shared with the owner.
	  SetOfInstrs &RemovedInsts;

	public:
	  /// \brief Remove all references to \p Inst and optionally replace all its
	  /// uses with \p New.
	  /// \p RemovedInsts Keep track of the instructions removed by this Action.
	  /// \pre If !Inst->use_empty(), then New != nullptr
	  InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
	                     Value *New = nullptr)
	      : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
	        RemovedInsts(RemovedInsts) {
	    // Redirect the users first, when a replacement value was supplied.
	    if (New != nullptr)
	      Replacer = new UsesReplacer(Inst, New);
	    DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
	    RemovedInsts.insert(Inst);
	    // The instruction is only unlinked here, not deleted: removed
	    // instructions are freed after optimizeBlock() has completed for all
	    // blocks, because they must stay trackable during promotion.
	    Inst->removeFromParent();
	  }

	  ~InstructionRemover() override { delete Replacer; }

	  /// \brief Bring the instruction back: relink it at its old position, give
	  /// it back its users (if a replacement had been made) and its operands,
	  /// and drop it from the removed set.
	  void undo() override {
	    DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
	    Inserter.insert(Inst);
	    if (Replacer != nullptr)
	      Replacer->undo();
	    Hider.undo();
	    RemovedInsts.erase(Inst);
	  }
	};

	public:
	/// Restoration point.
	/// The restoration point is a pointer to an action instead of an iterator
	/// because the iterator may be invalidated but not the pointer.
	using ConstRestorationPt = const TypePromotionAction *;

	TypePromotionTransaction(SetOfInstrs &RemovedInsts)
	: RemovedInsts(RemovedInsts) {}

	/// Advocate every changes made in that transaction.
	void commit();

	/// Undo all the changes made after the given point.
	void rollback(ConstRestorationPt Point);

	/// Get the current restoration point.
	ConstRestorationPt getRestorationPoint() const;

	/// \name API for IR modification with state keeping to support rollback.
	/// @{
	/// Same as Instruction::setOperand.
	void setOperand(Instruction Inst, unsigned Idx, Value NewVal);

	/// Same as Instruction::eraseFromParent.
	void eraseInstruction(Instruction Inst, Value NewVal = nullptr);

	/// Same as Value::replaceAllUsesWith.
	void replaceAllUsesWith(Instruction Inst, Value New);

	/// Same as Value::mutateType.
	void mutateType(Instruction Inst, Type NewTy);

	/// Same as IRBuilder::createTrunc.
	Value createTrunc(Instruction Opnd, Type *Ty);

	/// Same as IRBuilder::createSExt.
	Value createSExt(Instruction Inst, Value Opnd, Type Ty);

	/// Same as IRBuilder::createZExt.
	Value createZExt(Instruction Inst, Value Opnd, Type Ty);

	/// Same as Instruction::moveBefore.
	void moveBefore(Instruction Inst, Instruction Before);
	/// @}

	private:
	/// The ordered list of actions made so far.
	SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;

	using CommitPt = SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;

	SetOfInstrs &RemovedInsts;
	};

	} // end anonymous namespace

	/// Create an OperandSetter action for operand \p Idx of \p Inst with
	/// \p NewVal and append it to the list of actions.
	void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
	Value *NewVal) {
	auto Setter = llvm::make_unique<TypePromotionTransaction::OperandSetter>(
	Inst, Idx, NewVal);
	Actions.push_back(std::move(Setter));
	}

	/// Create an InstructionRemover action for \p Inst (passing along the
	/// transaction's RemovedInsts set and the optional replacement \p NewVal)
	/// and append it to the list of actions.
	void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
	Value *NewVal) {
	auto Remover =
	llvm::make_unique<TypePromotionTransaction::InstructionRemover>(
	Inst, RemovedInsts, NewVal);
	Actions.push_back(std::move(Remover));
	}

	/// Create a UsesReplacer action that redirects the uses of \p Inst to
	/// \p New and append it to the list of actions.
	void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
	Value *New) {
	auto Replacer =
	llvm::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New);
	Actions.push_back(std::move(Replacer));
	}

	void TypePromotionTransaction::mutateType(Instruction Inst, Type NewTy) {
	Actions.push_back(
	llvm::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
	}

	/// Create a TruncBuilder action for \p Opnd and \p Ty, append it to the list
	/// of actions and return the value it built.
	Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
	Type *Ty) {
	std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
	// Fetch the built value before ownership of the action moves into Actions.
	Value *Val = Ptr->getBuiltValue();
	Actions.push_back(std::move(Ptr));
	return Val;
	}

	/// Create an SExtBuilder action from \p Inst, \p Opnd and \p Ty, append it
	/// to the list of actions and return the value it built.
	Value *TypePromotionTransaction::createSExt(Instruction *Inst,
	Value *Opnd, Type *Ty) {
	std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
	// Fetch the built value before ownership of the action moves into Actions.
	Value *Val = Ptr->getBuiltValue();
	Actions.push_back(std::move(Ptr));
	return Val;
	}

	/// Create a ZExtBuilder action from \p Inst, \p Opnd and \p Ty, append it
	/// to the list of actions and return the value it built.
	Value *TypePromotionTransaction::createZExt(Instruction *Inst,
	Value *Opnd, Type *Ty) {
	std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
	// Fetch the built value before ownership of the action moves into Actions.
	Value *Val = Ptr->getBuiltValue();
	Actions.push_back(std::move(Ptr));
	return Val;
	}

	/// Create an InstructionMoveBefore action for \p Inst and \p Before and
	/// append it to the list of actions.
	void TypePromotionTransaction::moveBefore(Instruction *Inst,
	Instruction *Before) {
	auto Mover =
	llvm::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
	Inst, Before);
	Actions.push_back(std::move(Mover));
	}

	/// Return a pointer to the most recently recorded action, or nullptr when
	/// no action has been recorded.
	TypePromotionTransaction::ConstRestorationPt
	TypePromotionTransaction::getRestorationPoint() const {
	if (Actions.empty())
	return nullptr;
	return Actions.back().get();
	}

	void TypePromotionTransaction::commit() {
	for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
	++It)
	(*It)->commit();
	Actions.clear();
	}

	/// Pop and undo actions, newest first, until the newest remaining action is
	/// \p Point or no action is left. A null \p Point undoes everything.
	void TypePromotionTransaction::rollback(
	TypePromotionTransaction::ConstRestorationPt Point) {
	while (!Actions.empty()) {
	// Stop as soon as the restoration point is back on top.
	if (Actions.back().get() == Point)
	break;
	std::unique_ptr<TypePromotionAction> Last = Actions.pop_back_val();
	Last->undo();
	}
	}

	namespace {

	/// \brief A helper class for matching addressing modes.
	///
	/// This encapsulates the logic for matching the target-legal addressing modes.
	/// The constructor is private; clients go through the static Match() entry
	/// point, which builds a temporary matcher and runs matchAddr() on it.
	class AddressingModeMatcher {
	SmallVectorImpl<Instruction*> &AddrModeInsts;
	const TargetLowering &TLI;
	const TargetRegisterInfo &TRI;
	const DataLayout &DL;

	/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
	/// the memory instruction that we're computing this address for.
	Type *AccessTy;
	unsigned AddrSpace;
	Instruction *MemoryInst;

	/// This is the addressing mode that we're building up. This is
	/// part of the return value of this addressing mode matching stuff.
	ExtAddrMode &AddrMode;

	/// The instructions inserted by other CodeGenPrepare optimizations.
	const SetOfInstrs &InsertedInsts;

	/// A map from the instructions to their type before promotion.
	InstrToOrigTy &PromotedInsts;

	/// The ongoing transaction where every action should be registered.
	TypePromotionTransaction &TPT;

	/// This is set to true when we should not do profitability checks.
	/// When true, IsProfitableToFoldIntoAddressingMode always returns true.
	bool IgnoreProfitability;

	/// The data layout is taken from the module that contains \p MI, so \p MI
	/// must be non-null and inserted in a module.
	AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
	const TargetLowering &TLI,
	const TargetRegisterInfo &TRI,
	Type *AT, unsigned AS,
	Instruction *MI, ExtAddrMode &AM,
	const SetOfInstrs &InsertedInsts,
	InstrToOrigTy &PromotedInsts,
	TypePromotionTransaction &TPT)
	: AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
	DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
	MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
	PromotedInsts(PromotedInsts), TPT(TPT) {
	IgnoreProfitability = false;
	}

	public:
	/// Find the maximal addressing mode that a load/store of V can fold,
	/// give an access type of AccessTy. This returns a list of involved
	/// instructions in AddrModeInsts.
	/// \p InsertedInsts The instructions inserted by other CodeGenPrepare
	/// optimizations.
	/// \p PromotedInsts maps the instructions to their type before promotion.
	/// \p TPT The ongoing transaction where every action should be registered.
	/// \return The addressing mode filled in by matchAddr(V, 0).
	static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
	Instruction *MemoryInst,
	SmallVectorImpl<Instruction*> &AddrModeInsts,
	const TargetLowering &TLI,
	const TargetRegisterInfo &TRI,
	const SetOfInstrs &InsertedInsts,
	InstrToOrigTy &PromotedInsts,
	TypePromotionTransaction &TPT) {
	ExtAddrMode Result;

	bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
	AccessTy, AS,
	MemoryInst, Result, InsertedInsts,
	PromotedInsts, TPT).matchAddr(V, 0);
	// The (void) cast keeps Success "used" when assert() compiles away.
	(void)Success; assert(Success && "Couldn't select anything?");
	return Result;
	}

	private:
	bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
	bool matchAddr(Value *V, unsigned Depth);
	bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
	bool *MovedAway = nullptr);
	bool isProfitableToFoldIntoAddressingMode(Instruction *I,
	ExtAddrMode &AMBefore,
	ExtAddrMode &AMAfter);
	bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
	bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
	Value *PromotedOperand) const;
	};

	/// \brief Keep track of simplification of Phi nodes.
	/// Accept the set of all phi nodes and erase phi node from this set
	/// if it is simplified.
	///
	/// Storage maps a replaced value to its replacement; Get() follows that
	/// chain until it reaches a value that has no recorded replacement.
	class SimplificationTracker {
	DenseMap<Value *, Value *> Storage;
	const SimplifyQuery &SQ;
	SmallPtrSetImpl<PHINode *> &AllPhiNodes;
	SmallPtrSetImpl<SelectInst *> &AllSelectNodes;

	public:
	SimplificationTracker(const SimplifyQuery &sq,
	SmallPtrSetImpl<PHINode *> &APN,
	SmallPtrSetImpl<SelectInst *> &ASN)
	: SQ(sq), AllPhiNodes(APN), AllSelectNodes(ASN) {}

	/// Return the current replacement of \p V, following recorded
	/// replacements transitively; returns \p V itself if none is recorded.
	Value *Get(Value *V) {
	do {
	auto SV = Storage.find(V);
	if (SV == Storage.end())
	return V;
	V = SV->second;
	} while (true);
	}

	/// Try to simplify \p Val and, transitively, the users of every
	/// instruction that got simplified. Each simplified instruction is
	/// recorded in Storage, has its uses replaced, is dropped from the
	/// phi/select sets and is erased from its parent.
	/// \return The current replacement of \p Val (see Get()).
	Value *Simplify(Value *Val) {
	SmallVector<Value *, 32> WorkList;
	SmallPtrSet<Value *, 32> Visited;
	WorkList.push_back(Val);
	while (!WorkList.empty()) {
	auto P = WorkList.pop_back_val();
	if (!Visited.insert(P).second)
	continue;
	if (auto *PI = dyn_cast<Instruction>(P))
	if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) {
	// Queue the users before the uses are rewritten below; afterwards
	// PI has no users left to enumerate.
	for (auto *U : PI->users())
	WorkList.push_back(cast<Value>(U));
	Put(PI, V);
	PI->replaceAllUsesWith(V);
	if (auto *PHI = dyn_cast<PHINode>(PI))
	AllPhiNodes.erase(PHI);
	if (auto *Select = dyn_cast<SelectInst>(PI))
	AllSelectNodes.erase(Select);
	PI->eraseFromParent();
	}
	}
	return Get(Val);
	}

	/// Record that \p From has been replaced by \p To. An existing entry for
	/// \p From is left unchanged (DenseMap::insert does not overwrite).
	void Put(Value *From, Value *To) {
	Storage.insert({ From, To });
	}
	};

	/// \brief A helper class for combining addressing modes.
	class AddressingModeCombiner {
	/// A value together with the basic block in which it is observed.
	typedef std::pair<Value *, BasicBlock *> ValueInBB;
	/// Maps an (address value, block) anchor to the value of the differing field.
	typedef DenseMap<ValueInBB, Value *> FoldAddrToValueMapping;
	/// A pair of Phi nodes that are being matched against each other.
	typedef std::pair<PHINode *, PHINode *> PHIPair;

	private:
	/// The addressing modes we've collected.
	SmallVector<ExtAddrMode, 16> AddrModes;

	/// The field in which the AddrModes differ, when we have more than one.
	ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;

	/// Are the AddrModes that we have all just equal to their original values?
	bool AllAddrModesTrivial = true;

	/// Common Type for all different fields in addressing modes.
	Type *CommonType;

	/// SimplifyQuery for simplifyInstruction utility.
	const SimplifyQuery &SQ;

	/// Original Address.
	ValueInBB Original;

	public:
	/// \param _SQ Simplification query, stored by reference.
	/// \param OriginalValue The original address and the block it is seen in.
	AddressingModeCombiner(const SimplifyQuery &_SQ, ValueInBB OriginalValue)
	: CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}

	/// \brief Get the combined AddrMode, i.e. the first collected addressing
	/// mode. At least one addressing mode must have been collected.
	const ExtAddrMode &getAddrMode() const {
	const ExtAddrMode &Combined = AddrModes.front();
	return Combined;
	}

	/// \brief Add a new AddrMode if it's compatible with the AddrModes we already
	/// have.
	/// On failure all collected AddrModes are discarded.
	/// \return True iff we succeeded in doing so.
	bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
	// Take note of if we have any non-trivial AddrModes, as we need to detect
	// when all AddrModes are trivial as then we would introduce a phi or select
	// which just duplicates what's already there.
	AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();

	// If this is the first addrmode then everything is fine.
	if (AddrModes.empty()) {
	AddrModes.emplace_back(NewAddrMode);
	return true;
	}

	// Figure out how different this is from the other address modes, which we
	// can do just by comparing against the first one given that we only care
	// about the cumulative difference.
	ExtAddrMode::FieldName ThisDifferentField =
	AddrModes[0].compare(NewAddrMode);
	if (DifferentField == ExtAddrMode::NoField)
	DifferentField = ThisDifferentField;
	else if (DifferentField != ThisDifferentField)
	DifferentField = ExtAddrMode::MultipleFields;

	// If NewAddrMode differs in only one dimension, and that dimension isn't
	// the amount that ScaledReg is scaled by, then we can handle it by
	// inserting a phi/select later on. Even if NewAddMode is the same
	// we still need to collect it due to original value is different.
	// And later we will need all original values as anchors during
	// finding the common Phi node.
	+ // We also must reject the case when base offset is different and
	+ // scale reg is not null, we cannot handle this case due to merge of
	+ // different offsets will be used as ScaleReg.
	if (DifferentField != ExtAddrMode::MultipleFields &&
	- DifferentField != ExtAddrMode::ScaleField) {
	+ DifferentField != ExtAddrMode::ScaleField &&
	+ (DifferentField != ExtAddrMode::BaseOffsField ||
	+ !NewAddrMode.ScaledReg)) {
	AddrModes.emplace_back(NewAddrMode);
	return true;
	}

	// We couldn't combine NewAddrMode with the rest, so return failure.
	AddrModes.clear();
	return false;
	}

	/// \brief Combine the addressing modes we've collected into a single
	/// addressing mode.
	/// On success with more than one differing mode, the combined field is
	/// written into AddrModes[0].
	/// \return True iff we successfully combined them or we only had one so
	/// didn't need to combine them anyway.
	bool combineAddrModes() {
	// If we have no AddrModes then they can't be combined.
	if (AddrModes.empty())
	return false;

	// A single AddrMode can trivially be combined.
	if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
	return true;

	// If the AddrModes we collected are all just equal to the value they are
	// derived from then combining them wouldn't do anything useful.
	if (AllAddrModesTrivial)
	return false;

	if (!addrModeCombiningAllowed())
	return false;

	// Build a map between <original value, basic block where we saw it> to
	// value of base register.
	// Bail out if there is no common type.
	FoldAddrToValueMapping Map;
	if (!initializeMap(Map))
	return false;

	Value *CommonValue = findCommon(Map);
	if (CommonValue)
	AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
	return CommonValue != nullptr;
	}

	private:
	/// \brief Initialize Map with anchor values. For address seen in some BB
	/// we set the value of different field saw in this address.
	/// If address is not an instruction then basic block is set to null.
	/// At the same time we find a common type for different field we will
	/// use to create new Phi/Select nodes. Keep it in CommonType field.
	/// Return false if there is no common type found.
	bool initializeMap(FoldAddrToValueMapping &Map) {
	// Keep track of keys where the value is null. We will need to replace it
	// with constant null when we know the common type.
	SmallVector<ValueInBB, 2> NullValue;
	// Pointer-sized integer type derived from the first original value; it is
	// passed to GetFieldAsValue for every addressing mode.
	Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
	for (auto &AM : AddrModes) {
	BasicBlock *BB = nullptr;
	if (Instruction *I = dyn_cast<Instruction>(AM.OriginalValue))
	BB = I->getParent();

	Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
	if (DV) {
	auto *Type = DV->getType();
	// Every non-null field value must have the same type.
	if (CommonType && CommonType != Type)
	return false;
	CommonType = Type;
	Map[{ AM.OriginalValue, BB }] = DV;
	} else {
	NullValue.push_back({ AM.OriginalValue, BB });
	}
	}
	assert(CommonType && "At least one non-null value must be!");
	// Now that the common type is known, give the null entries a typed zero.
	for (auto VIBB : NullValue)
	Map[VIBB] = Constant::getNullValue(CommonType);
	return true;
	}

	/// \brief We have mapping between value A and basic block where value A
	/// seen to other value B where B was a field in addressing mode represented
	/// by A. Also we have an original value C representing an address in some
	/// basic block. Traversing from C through phi and selects we ended up with
	/// A's in a map. This utility function tries to find a value V which is a
	/// field in addressing mode C and traversing through phi nodes and selects
	/// we will end up in corresponded values B in a map.
	/// The utility will create a new Phi/Selects if needed.
	/// Returns nullptr, after destroying the nodes it created, when new selects
	/// are not allowed but were needed or when Phi matching fails.
	// The simple example looks as follows:
	// BB1:
	// p1 = b1 + 40
	// br cond BB2, BB3
	// BB2:
	// p2 = b2 + 40
	// br BB3
	// BB3:
	// p = phi [p1, BB1], [p2, BB2]
	// v = load p
	// Map is
	// <p1, BB1> -> b1
	// <p2, BB2> -> b2
	// Request is
	// <p, BB3> -> ?
	// The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3
	Value *findCommon(FoldAddrToValueMapping &Map) {
	// Tracks the newly created Phi nodes.
	SmallPtrSet<PHINode *, 32> NewPhiNodes;
	// Tracks the newly created Select nodes.
	SmallPtrSet<SelectInst *, 32> NewSelectNodes;
	// Tracks the simplification of new created phi nodes. The reason we use
	// this mapping is because we will add new created Phi nodes in AddrToBase.
	// Simplification of Phi nodes is recursive, so some Phi node may
	// be simplified after we added it to AddrToBase.
	// Using this mapping we can find the current value in AddrToBase.
	SimplificationTracker ST(SQ, NewPhiNodes, NewSelectNodes);

	// First step, DFS to create PHI nodes for all intermediate blocks.
	// Also fill traverse order for the second step.
	SmallVector<ValueInBB, 32> TraverseOrder;
	InsertPlaceholders(Map, TraverseOrder, NewPhiNodes, NewSelectNodes);

	// Second Step, fill new nodes by merged values and simplify if possible.
	FillPlaceholders(Map, TraverseOrder, ST);

	// Selects that survived simplification are only acceptable when
	// AddrSinkNewSelects is set; otherwise undo everything we created.
	if (!AddrSinkNewSelects && NewSelectNodes.size() > 0) {
	DestroyNodes(NewPhiNodes);
	DestroyNodes(NewSelectNodes);
	return nullptr;
	}

	// Now we'd like to match New Phi nodes to existed ones.
	unsigned PhiNotMatchedCount = 0;
	if (!MatchPhiSet(NewPhiNodes, ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
	DestroyNodes(NewPhiNodes);
	DestroyNodes(NewSelectNodes);
	return nullptr;
	}

	// Look up the entry for the original address and resolve it through the
	// tracker, since the mapped node may have been simplified or matched away.
	auto *Result = ST.Get(Map.find(Original)->second);
	if (Result) {
	NumMemoryInstsPhiCreated += NewPhiNodes.size() + PhiNotMatchedCount;
	NumMemoryInstsSelectCreated += NewSelectNodes.size();
	}
	return Result;
	}

	/// \brief Destroy nodes from a set.
	/// Each node has its uses redirected to an undef value of CommonType and is
	/// then erased from its parent.
	template <typename T> void DestroyNodes(SmallPtrSetImpl<T *> &Instructions) {
	// Redirect the uses to a placeholder first so that erasing is safe.
	auto Placeholder = UndefValue::get(CommonType);
	for (auto Node : Instructions) {
	Node->replaceAllUsesWith(Placeholder);
	Node->eraseFromParent();
	}
	}

	/// \brief Try to match PHI node to Candidate.
	/// Matcher tracks the matched Phi nodes.
	bool MatchPhiNode(PHINode PHI, PHINode Candidate,
	DenseSet<PHIPair> &Matcher,
	SmallPtrSetImpl<PHINode *> &PhiNodesToMatch) {
	SmallVector<PHIPair, 8> WorkList;
	Matcher.insert({ PHI, Candidate });
	WorkList.push_back({ PHI, Candidate });
	SmallSet<PHIPair, 8> Visited;
	while (!WorkList.empty()) {
	auto Item = WorkList.pop_back_val();
	if (!Visited.insert(Item).second)
	continue;
	// We iterate over all incoming values to Phi to compare them.
	// If values are different and both of them Phi and the first one is a
	// Phi we added (subject to match) and both of them is in the same basic
	// block then we can match our pair if values match. So we state that
	// these values match and add it to work list to verify that.
	for (auto B : Item.first->blocks()) {
	Value *FirstValue = Item.first->getIncomingValueForBlock(B);
	Value *SecondValue = Item.second->getIncomingValueForBlock(B);
	if (FirstValue == SecondValue)
	continue;

	PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
	PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);

	// One of them is not Phi or
	// The first one is not Phi node from the set we'd like to match or
	// Phi nodes from different basic blocks then
	// we will not be able to match.
	if (!FirstPhi \|\| !SecondPhi \|\| !PhiNodesToMatch.count(FirstPhi) \|\|
	FirstPhi->getParent() != SecondPhi->getParent())
	return false;

	// If we already matched them then continue.
	if (Matcher.count({ FirstPhi, SecondPhi }))
	continue;
	// So the values are different and does not match. So we need them to
	// match.
	Matcher.insert({ FirstPhi, SecondPhi });
	// But me must check it.
	WorkList.push_back({ FirstPhi, SecondPhi });
	}
	}
	return true;
	}

	/// \brief For the given set of PHI nodes try to find their equivalents.
	/// Returns false if this matching fails and creation of new Phi is disabled.
	/// \param PhiNodesToMatch Phi nodes still to be matched; emptied as nodes
	/// are either matched (and erased) or given up on.
	/// \param ST Tracker that is told about every matched replacement.
	/// \param AllowNewPhiNodes Whether an unmatched Phi node may be kept.
	/// \param PhiNotMatchedCount Incremented by the number of nodes given up on.
	bool MatchPhiSet(SmallPtrSetImpl<PHINode *> &PhiNodesToMatch,
	SimplificationTracker &ST, bool AllowNewPhiNodes,
	unsigned &PhiNotMatchedCount) {
	DenseSet<PHIPair> Matched;
	SmallPtrSet<PHINode *, 8> WillNotMatch;
	while (PhiNodesToMatch.size()) {
	// Take any Phi node that is still pending.
	PHINode *PHI = *PhiNodesToMatch.begin();

	// Add us, if no Phi nodes in the basic block we do not match.
	WillNotMatch.clear();
	WillNotMatch.insert(PHI);

	// Traverse all Phis until we found equivalent or fail to do that.
	bool IsMatched = false;
	for (auto &P : PHI->getParent()->phis()) {
	if (&P == PHI)
	continue;
	if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
	break;
	// If it does not match, collect all Phi nodes from matcher.
	// if we end up with no match, then all these Phi nodes will not match
	// later.
	for (auto M : Matched)
	WillNotMatch.insert(M.first);
	Matched.clear();
	}
	if (IsMatched) {
	// Replace all matched values and erase them.
	for (auto MV : Matched) {
	MV.first->replaceAllUsesWith(MV.second);
	PhiNodesToMatch.erase(MV.first);
	ST.Put(MV.first, MV.second);
	MV.first->eraseFromParent();
	}
	Matched.clear();
	continue;
	}
	// If we are not allowed to create new nodes then bail out.
	if (!AllowNewPhiNodes)
	return false;
	// Just remove all seen values in matcher. They will not match anything.
	PhiNotMatchedCount += WillNotMatch.size();
	for (auto *P : WillNotMatch)
	PhiNodesToMatch.erase(P);
	}
	return true;
	}
	/// \brief Fill the placeholder with values from predecessors and simplify it.
	/// Entries of \p TraverseOrder are consumed from the back; for each one the
	/// placeholder stored in \p Map is filled and the map entry is replaced by
	/// the result of ST.Simplify().
	void FillPlaceholders(FoldAddrToValueMapping &Map,
	SmallVectorImpl<ValueInBB> &TraverseOrder,
	SimplificationTracker &ST) {
	while (!TraverseOrder.empty()) {
	auto Current = TraverseOrder.pop_back_val();
	assert(Map.find(Current) != Map.end() && "No node to fill!!!");
	Value *CurrentValue = Current.first;
	BasicBlock *CurrentBlock = Current.second;
	Value *V = Map[Current];

	if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
	// CurrentValue also must be Select.
	auto *CurrentSelect = cast<SelectInst>(CurrentValue);
	auto *TrueValue = CurrentSelect->getTrueValue();
	// Non-instruction values are keyed with a null block in the map.
	ValueInBB TrueItem = { TrueValue, isa<Instruction>(TrueValue)
	? CurrentBlock
	: nullptr };
	assert(Map.find(TrueItem) != Map.end() && "No True Value!");
	Select->setTrueValue(ST.Get(Map[TrueItem]));
	auto *FalseValue = CurrentSelect->getFalseValue();
	ValueInBB FalseItem = { FalseValue, isa<Instruction>(FalseValue)
	? CurrentBlock
	: nullptr };
	assert(Map.find(FalseItem) != Map.end() && "No False Value!");
	Select->setFalseValue(ST.Get(Map[FalseItem]));
	} else {
	// Must be a Phi node then.
	PHINode *PHI = cast<PHINode>(V);
	// Fill the Phi node with values from predecessors.
	bool IsDefinedInThisBB =
	cast<Instruction>(CurrentValue)->getParent() == CurrentBlock;
	// CurrentPhi is only dereferenced when the value is defined in this
	// block. NOTE(review): presumably it is always a Phi in that case,
	// since selects take the branch above — confirm.
	auto *CurrentPhi = dyn_cast<PHINode>(CurrentValue);
	for (auto B : predecessors(CurrentBlock)) {
	Value *PV = IsDefinedInThisBB
	? CurrentPhi->getIncomingValueForBlock(B)
	: CurrentValue;
	ValueInBB item = { PV, isa<Instruction>(PV) ? B : nullptr };
	assert(Map.find(item) != Map.end() && "No predecessor Value!");
	PHI->addIncoming(ST.Get(Map[item]), B);
	}
	}
	// Simplify if possible.
	Map[Current] = ST.Simplify(V);
	}
	}

	/// Starting from value recursively iterates over predecessors up to known
	/// ending values represented in a map. For each traversed block inserts
	/// a placeholder Phi or Select.
	/// Reports all new created Phi/Select nodes by adding them to set.
	/// Also reports the order in which basic blocks have been traversed.
	void InsertPlaceholders(FoldAddrToValueMapping &Map,
	SmallVectorImpl<ValueInBB> &TraverseOrder,
	SmallPtrSetImpl<PHINode *> &NewPhiNodes,
	SmallPtrSetImpl<SelectInst *> &NewSelectNodes) {
	SmallVector<ValueInBB, 32> Worklist;
	assert((isa<PHINode>(Original.first) || isa<SelectInst>(Original.first)) &&
	"Address must be a Phi or Select node");
	auto *Dummy = UndefValue::get(CommonType);
	Worklist.push_back(Original);
	while (!Worklist.empty()) {
	auto Current = Worklist.pop_back_val();
	// If value is not an instruction it is something global, constant,
	// parameter and we can say that this value is observable in any block.
	// Set block to null to denote it.
	// Also please take into account that it is how we build anchors.
	if (!isa<Instruction>(Current.first))
	Current.second = nullptr;
	// if it is already visited or it is an ending value then skip it.
	if (Map.find(Current) != Map.end())
	continue;
	TraverseOrder.push_back(Current);

	Value *CurrentValue = Current.first;
	BasicBlock *CurrentBlock = Current.second;
	// CurrentValue must be a Phi node or select. All others must be covered
	// by anchors.
	Instruction *CurrentI = cast<Instruction>(CurrentValue);
	bool IsDefinedInThisBB = CurrentI->getParent() == CurrentBlock;

	unsigned PredCount =
	std::distance(pred_begin(CurrentBlock), pred_end(CurrentBlock));
	// if Current Value is not defined in this basic block we are interested
	// in values in predecessors.
	if (!IsDefinedInThisBB) {
	assert(PredCount && "Unreachable block?!");
	PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
	&CurrentBlock->front());
	Map[Current] = PHI;
	NewPhiNodes.insert(PHI);
	// Add all predecessors in work list.
	for (auto B : predecessors(CurrentBlock))
	Worklist.push_back({ CurrentValue, B });
	continue;
	}
	// Value is defined in this basic block.
	if (SelectInst *OrigSelect = dyn_cast<SelectInst>(CurrentI)) {
	// Is it OK to get metadata from OrigSelect?!
	// Create a Select placeholder with dummy value.
	SelectInst *Select =
	SelectInst::Create(OrigSelect->getCondition(), Dummy, Dummy,
	OrigSelect->getName(), OrigSelect, OrigSelect);
	Map[Current] = Select;
	NewSelectNodes.insert(Select);
	// We are interested in True and False value in this basic block.
	Worklist.push_back({ OrigSelect->getTrueValue(), CurrentBlock });
	Worklist.push_back({ OrigSelect->getFalseValue(), CurrentBlock });
	} else {
	// It must be a Phi node then.
	auto *CurrentPhi = cast<PHINode>(CurrentI);
	// Create new Phi node for merge of bases.
	assert(PredCount && "Unreachable block?!");
	PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
	&CurrentBlock->front());
	Map[Current] = PHI;
	NewPhiNodes.insert(PHI);

	// Add all predecessors in work list.
	for (auto B : predecessors(CurrentBlock))
	Worklist.push_back({ CurrentPhi->getIncomingValueForBlock(B), B });
	}
	}
	}

	/// Tell whether combining is enabled for the field in which the collected
	/// addressing modes differ. Always false when DisableComplexAddrModes is
	/// set or when the differing field has no dedicated option.
	bool addrModeCombiningAllowed() {
	if (DisableComplexAddrModes)
	return false;
	switch (DifferentField) {
	case ExtAddrMode::BaseRegField:
	return AddrSinkCombineBaseReg;
	case ExtAddrMode::BaseGVField:
	return AddrSinkCombineBaseGV;
	case ExtAddrMode::BaseOffsField:
	return AddrSinkCombineBaseOffs;
	case ExtAddrMode::ScaledRegField:
	return AddrSinkCombineScaledReg;
	default:
	// Any other field cannot be combined.
	return false;
	}
	}
	};
	} // end anonymous namespace

	/// Try adding ScaleReg*Scale to the current addressing mode.
	/// Return true and update AddrMode if this addr mode is legal for the target,
	/// false if not.
	bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
	unsigned Depth) {
	// If Scale is 1, then this is the same as adding ScaleReg to the addressing
	// mode. Just process that directly.
	if (Scale == 1)
	return matchAddr(ScaleReg, Depth);

	// If the scale is 0, it takes nothing to add this.
	if (Scale == 0)
	return true;

	// If we already have a scale of this value, we can add to it, otherwise, we
	// need an available scale field.
	if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
	return false;

	ExtAddrMode TestAddrMode = AddrMode;

	// Add scale to turn X*4+X*3 -> X*7. This could also do things like
	// [A+B + A*7] -> [B+A*8].
	TestAddrMode.Scale += Scale;
	TestAddrMode.ScaledReg = ScaleReg;

	// If the new address isn't legal, bail out.
	if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
	return false;

	// It was legal, so commit it.
	AddrMode = TestAddrMode;

	// Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
	// to see if ScaleReg is actually X+C. If so, we can turn this into adding
	// X*Scale + C*Scale to addr mode.
	ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
	if (isa<Instruction>(ScaleReg) && // not a constant expr.
	match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
	TestAddrMode.ScaledReg = AddLHS;
	TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;

	// If this addressing mode is legal, commit it and remember that we folded
	// this instruction.
	if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
	AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
	AddrMode = TestAddrMode;
	return true;
	}
	}

	// Otherwise, not (x+c)*scale, just return what we have.
	return true;
	}

	/// This is a little filter, which returns true if an addressing computation
	/// involving I might be folded into a load/store accessing it.
	/// This doesn't need to be perfect, but needs to accept at least
	/// the set of instructions that MatchOperationAddr can.
	static bool MightBeFoldableInst(Instruction *I) {
	switch (I->getOpcode()) {
	case Instruction::BitCast:
	case Instruction::AddrSpaceCast:
	// Don't touch identity bitcasts.
	if (I->getType() == I->getOperand(0)->getType())
	return false;
	return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
	case Instruction::PtrToInt:
	// PtrToInt is always a noop, as we know that the int type is pointer sized.
	return true;
	case Instruction::IntToPtr:
	// We know the input is intptr_t, so this is foldable.
	return true;
	case Instruction::Add:
	return true;
	case Instruction::Mul:
	case Instruction::Shl:
	// Can only handle X*C and X << C.
	return isa<ConstantInt>(I->getOperand(1));
	case Instruction::GetElementPtr:
	return true;
	default:
	return false;
	}
	}

	/// \brief Check whether or not \p Val is a legal instruction for \p TLI.
	/// \note \p Val is assumed to be the product of some type promotion.
	/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
	/// to be legal, as the non-promoted value would have had the same state.
	/// \return False when \p Val is not an instruction.
	static bool isPromotedInstructionLegal(const TargetLowering &TLI,
	const DataLayout &DL, Value *Val) {
	auto *Promoted = dyn_cast<Instruction>(Val);
	if (!Promoted)
	return false;
	// A zero ISD opcode means "undefined"; it was undefined before the
	// promotion as well, so treat the instruction as legal.
	if (int Opcode = TLI.InstructionOpcodeToISD(Promoted->getOpcode()))
	return TLI.isOperationLegalOrCustom(
	Opcode, TLI.getValueType(DL, Promoted->getType()));
	return true;
	}

	namespace {

	/// \brief Hepler class to perform type promotion.
	class TypePromotionHelper {
	/// \brief Utility function to check whether or not a sign or zero extension
	/// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
	/// either using the operands of \p Inst or promoting \p Inst.
	/// The type of the extension is defined by \p IsSExt.
	/// In other words, check if:
	/// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
	/// #1 Promotion applies:
	/// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
	/// #2 Operand reuses:
	/// ext opnd1 to ConsideredExtType.
	/// \p PromotedInsts maps the instructions to their type before promotion.
	static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
	const InstrToOrigTy &PromotedInsts, bool IsSExt);

	/// \brief Utility function to determine if \p OpIdx should be promoted when
	/// promoting \p Inst. Every operand qualifies except operand 0 of a
	/// select instruction.
	static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
	if (OpIdx != 0)
	return true;
	return !isa<SelectInst>(Inst);
	}

	/// \brief Utility function to promote the operand of \p Ext when this
	/// operand is a promotable trunc or sext or zext.
	/// \p PromotedInsts maps the instructions to their type before promotion.
	/// \p CreatedInstsCost[out] contains the cost of all instructions
	/// created to promote the operand of Ext.
	/// Newly added extensions are inserted in \p Exts.
	/// Newly added truncates are inserted in \p Truncs.
	/// Should never be called directly.
	/// \return The promoted value which is used instead of Ext.
	static Value *promoteOperandForTruncAndAnyExt(
	Instruction *Ext, TypePromotionTransaction &TPT,
	InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	SmallVectorImpl<Instruction *> *Exts,
	SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);

	/// \brief Utility function to promote the operand of \p Ext when this
	/// operand is promotable and is not a supported trunc or sext.
	/// \p PromotedInsts maps the instructions to their type before promotion.
	/// \p CreatedInstsCost[out] contains the cost of all the instructions
	/// created to promote the operand of Ext.
	/// Newly added extensions are inserted in \p Exts.
	/// Newly added truncates are inserted in \p Truncs.
	/// Should never be called directly.
	/// \return The promoted value which is used instead of Ext.
	static Value promoteOperandForOther(Instruction Ext,
	TypePromotionTransaction &TPT,
	InstrToOrigTy &PromotedInsts,
	unsigned &CreatedInstsCost,
	SmallVectorImpl<Instruction > Exts,
	SmallVectorImpl<Instruction > Truncs,
	const TargetLowering &TLI, bool IsSExt);

	/// \see promoteOperandForOther.
	static Value *signExtendOperandForOther(
	Instruction *Ext, TypePromotionTransaction &TPT,
	InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	SmallVectorImpl<Instruction > Exts,
	SmallVectorImpl<Instruction > Truncs, const TargetLowering &TLI) {
	return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
	Exts, Truncs, TLI, true);
	}

	/// \see promoteOperandForOther.
	static Value *zeroExtendOperandForOther(
	Instruction *Ext, TypePromotionTransaction &TPT,
	InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	SmallVectorImpl<Instruction > Exts,
	SmallVectorImpl<Instruction > Truncs, const TargetLowering &TLI) {
	return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
	Exts, Truncs, TLI, false);
	}

	public:
	/// Type for the utility function that promotes the operand of Ext.
	using Action = Value ()(Instruction *Ext, TypePromotionTransaction &TPT,
	InstrToOrigTy &PromotedInsts,
	unsigned &CreatedInstsCost,
	SmallVectorImpl<Instruction > Exts,
	SmallVectorImpl<Instruction > Truncs,
	const TargetLowering &TLI);

	/// \brief Given a sign/zero extend instruction \p Ext, return the approriate
	/// action to promote the operand of \p Ext instead of using Ext.
	/// \return NULL if no promotable action is possible with the current
	/// sign extension.
	/// \p InsertedInsts keeps track of all the instructions inserted by the
	/// other CodeGenPrepare optimizations. This information is important
	/// because we do not want to promote these instructions as CodeGenPrepare
	/// will reinsert them later. Thus creating an infinite loop: create/remove.
	/// \p PromotedInsts maps the instructions to their type before promotion.
	static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
	const TargetLowering &TLI,
	const InstrToOrigTy &PromotedInsts);
	};

	} // end anonymous namespace

	/// Return true when an extension to \p ConsideredExtType (signed if \p IsSExt,
	/// unsigned otherwise) can be moved through \p Inst.
	/// \p PromotedInsts maps already promoted instructions to their original type
	/// and the kind of extension that was used to promote them.
	bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
	                                        Type *ConsideredExtType,
	                                        const InstrToOrigTy &PromotedInsts,
	                                        bool IsSExt) {
	  // The promotion helper does not know how to deal with vector types yet.
	  // To be able to fix that, we would need to fix the places where we
	  // statically extend, e.g., constants and such.
	  if (Inst->getType()->isVectorTy())
	    return false;

	  // We can always get through zext.
	  if (isa<ZExtInst>(Inst))
	    return true;

	  // sext(sext) is ok too.
	  if (IsSExt && isa<SExtInst>(Inst))
	    return true;

	  // We can get through binary operator, if it is legal. In other words, the
	  // binary operator must have a nuw or nsw flag.
	  const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
	  if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
	      ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
	       (IsSExt && BinOp->hasNoSignedWrap())))
	    return true;

	  // Check if we can do the following simplification.
	  // ext(trunc(opnd)) --> ext(opnd)
	  if (!isa<TruncInst>(Inst))
	    return false;

	  Value *OpndVal = Inst->getOperand(0);
	  // Check if we can use this operand in the extension.
	  // If the type is larger than the result type of the extension, we cannot.
	  if (!OpndVal->getType()->isIntegerTy() ||
	      OpndVal->getType()->getIntegerBitWidth() >
	          ConsideredExtType->getIntegerBitWidth())
	    return false;

	  // If the operand of the truncate is not an instruction, we will not have
	  // any information on the dropped bits.
	  // (Actually we could for constant but it is not worth the extra logic).
	  Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
	  if (!Opnd)
	    return false;

	  // Check if the source of the type is narrow enough.
	  // I.e., check that trunc just drops extended bits of the same kind of
	  // the extension.
	  // #1 get the type of the operand and check the kind of the extended bits.
	  const Type *OpndType;
	  InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
	  if (It != PromotedInsts.end() && It->second.getInt() == IsSExt)
	    // Opnd was promoted with the same kind of extension: use the type it had
	    // before the promotion.
	    OpndType = It->second.getPointer();
	  else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
	    // Opnd is itself an extension of the same kind: use its source type.
	    OpndType = Opnd->getOperand(0)->getType();
	  else
	    return false;

	  // #2 check that the truncate just drops extended bits.
	  return Inst->getType()->getIntegerBitWidth() >=
	         OpndType->getIntegerBitWidth();
	}

	/// Select the promotion routine that applies to the operand of \p Ext, or
	/// return nullptr when the extension cannot (or should not) be moved through
	/// its operand.
	TypePromotionHelper::Action TypePromotionHelper::getAction(
	    Instruction *Ext, const SetOfInstrs &InsertedInsts,
	    const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
	  assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
	         "Unexpected instruction type");
	  Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
	  Type *ExtTy = Ext->getType();
	  bool IsSExt = isa<SExtInst>(Ext);
	  // If the operand of the extension is not an instruction, we cannot
	  // get through.
	  // If it is, check we can get through.
	  if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
	    return nullptr;

	  // Do not promote if the operand has been added by codegenprepare.
	  // Otherwise, it means we are undoing an optimization that is likely to be
	  // redone, thus causing potential infinite loop.
	  if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
	    return nullptr;

	  // SExt, ZExt or Trunc instructions.
	  // Return the related handler.
	  if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
	      isa<ZExtInst>(ExtOpnd))
	    return promoteOperandForTruncAndAnyExt;

	  // Regular instruction.
	  // Abort early if we will have to insert non-free instructions.
	  if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
	    return nullptr;
	  return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
	}

	/// Promote the operand of \p SExt when that operand is a trunc, sext or zext
	/// by merging the two casts. Despite its name, \p SExt may be either a sign
	/// or a zero extension.
	/// \p CreatedInstsCost is reset and then set to 1 only when a non-free
	/// extension remains that did not absorb a non-free zext.
	/// A surviving extension is appended to \p Exts when \p Exts is non-null.
	/// \p PromotedInsts and \p Truncs are not used by this routine.
	/// \return The value that now stands for the extended operand.
	Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
	    Instruction *SExt, TypePromotionTransaction &TPT,
	    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	    SmallVectorImpl<Instruction *> *Exts,
	    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
	  // By construction, the operand of SExt is an instruction. Otherwise we cannot
	  // get through it and this method should not be called.
	  Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
	  Value *ExtVal = SExt;
	  bool HasMergedNonFreeExt = false;
	  if (isa<ZExtInst>(SExtOpnd)) {
	    // Replace s|zext(zext(opnd))
	    // => zext(opnd).
	    HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
	    Value *ZExt =
	        TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
	    TPT.replaceAllUsesWith(SExt, ZExt);
	    TPT.eraseInstruction(SExt);
	    ExtVal = ZExt;
	  } else {
	    // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
	    // => z|sext(opnd).
	    TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
	  }
	  CreatedInstsCost = 0;

	  // Remove dead code.
	  if (SExtOpnd->use_empty())
	    TPT.eraseInstruction(SExtOpnd);

	  // Check if the extension is still needed.
	  Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
	  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
	    if (ExtInst) {
	      if (Exts)
	        Exts->push_back(ExtInst);
	      CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
	    }
	    return ExtVal;
	  }

	  // At this point we have: ext ty opnd to ty.
	  // Reassign the uses of ExtInst to the opnd and remove ExtInst.
	  Value *NextVal = ExtInst->getOperand(0);
	  TPT.eraseInstruction(ExtInst, NextVal);
	  return NextVal;
	}

	/// Promote the instruction feeding \p Ext to the type of \p Ext and push the
	/// extension down onto its operands.
	/// \p IsSExt selects sign extension (true) or zero extension (false).
	/// \p CreatedInstsCost is reset, then incremented for every non-free
	/// extension placed on an operand.
	/// Extensions placed on operands are appended to \p Exts and the truncate
	/// created for the other users of the operand is appended to \p Truncs, when
	/// those pointers are non-null.
	/// \return The promoted instruction, which replaces all uses of \p Ext.
	Value *TypePromotionHelper::promoteOperandForOther(
	    Instruction *Ext, TypePromotionTransaction &TPT,
	    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	    SmallVectorImpl<Instruction *> *Exts,
	    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
	    bool IsSExt) {
	  // By construction, the operand of Ext is an instruction. Otherwise we cannot
	  // get through it and this method should not be called.
	  Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
	  CreatedInstsCost = 0;
	  if (!ExtOpnd->hasOneUse()) {
	    // ExtOpnd will be promoted.
	    // All its uses, but Ext, will need to use a truncated value of the
	    // promoted version.
	    // Create the truncate now.
	    Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
	    if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
	      // Insert it just after the definition.
	      ITrunc->moveAfter(ExtOpnd);
	      if (Truncs)
	        Truncs->push_back(ITrunc);
	    }

	    TPT.replaceAllUsesWith(ExtOpnd, Trunc);
	    // Restore the operand of Ext (which has been replaced by the previous call
	    // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
	    TPT.setOperand(Ext, 0, ExtOpnd);
	  }

	  // Get through the Instruction:
	  // 1. Update its type.
	  // 2. Replace the uses of Ext by Inst.
	  // 3. Extend each operand that needs to be extended.

	  // Remember the original type of the instruction before promotion.
	  // This is useful to know that the high bits are sign extended bits.
	  PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>(
	      ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt)));
	  // Step #1.
	  TPT.mutateType(ExtOpnd, Ext->getType());
	  // Step #2.
	  TPT.replaceAllUsesWith(Ext, ExtOpnd);
	  // Step #3.
	  // Ext itself is recycled for the first operand that needs an explicit
	  // extension; ExtForOpnd is null once it has been consumed.
	  Instruction *ExtForOpnd = Ext;

	  DEBUG(dbgs() << "Propagate Ext to operands\n");
	  for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
	       ++OpIdx) {
	    DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
	    if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
	        !shouldExtOperand(ExtOpnd, OpIdx)) {
	      DEBUG(dbgs() << "No need to propagate\n");
	      continue;
	    }
	    // Check if we can statically extend the operand.
	    Value *Opnd = ExtOpnd->getOperand(OpIdx);
	    if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
	      DEBUG(dbgs() << "Statically extend\n");
	      unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
	      APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
	                            : Cst->getValue().zext(BitWidth);
	      TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
	      continue;
	    }
	    // UndefValue are typed, so we have to statically sign extend them.
	    if (isa<UndefValue>(Opnd)) {
	      DEBUG(dbgs() << "Statically extend\n");
	      TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
	      continue;
	    }

	    // Otherwise we have to explicitly sign extend the operand.
	    // Check if Ext was reused to extend an operand.
	    if (!ExtForOpnd) {
	      // If yes, create a new one.
	      DEBUG(dbgs() << "More operands to ext\n");
	      Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
	                                    : TPT.createZExt(Ext, Opnd, Ext->getType());
	      if (!isa<Instruction>(ValForExtOpnd)) {
	        // The created extension is not an instruction, so there is nothing
	        // to move or account for: use it as the operand directly.
	        TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
	        continue;
	      }
	      ExtForOpnd = cast<Instruction>(ValForExtOpnd);
	    }
	    if (Exts)
	      Exts->push_back(ExtForOpnd);
	    TPT.setOperand(ExtForOpnd, 0, Opnd);

	    // Move the sign extension before the insertion point.
	    TPT.moveBefore(ExtForOpnd, ExtOpnd);
	    TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
	    CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
	    // If more sext are required, new instructions will have to be created.
	    ExtForOpnd = nullptr;
	  }
	  if (ExtForOpnd == Ext) {
	    // No operand needed an explicit extension, so Ext was never recycled.
	    DEBUG(dbgs() << "Extension is useless now\n");
	    TPT.eraseInstruction(Ext);
	  }
	  return ExtOpnd;
	}

	/// Decide whether promoting an instruction to a wider type pays off.
	/// \p NewCost is the cost of the extension instructions the promotion
	/// created.
	/// \p OldCost is the cost of the extension instructions that existed before
	/// the promotion plus the number of instructions that were matched in the
	/// addressing mode thanks to the promotion.
	/// \p PromotedOperand is the value that has been promoted.
	/// \return True if the promotion is profitable, false otherwise.
	bool AddressingModeMatcher::isPromotionProfitable(
	    unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
	  DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
	  // When the costs differ they decide on their own: a cheaper result is
	  // profitable, a more expensive one (new extensions cost more than the old
	  // extension plus what was folded) is not.
	  if (NewCost != OldCost)
	    return NewCost < OldCost;
	  // The promotion is neutral but it may help folding the sign extension in
	  // loads for instance.
	  // Accept it only if we did not create an illegal instruction.
	  return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
	}

	/// Given an instruction or constant expr, see if we can fold the operation
	/// into the addressing mode. If so, update the addressing mode and return
	/// true, otherwise return false without modifying AddrMode.
	/// If \p MovedAway is not NULL, it contains the information of whether or
	/// not AddrInst has to be folded into the addressing mode on success.
	/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
	/// because it has been moved away.
	/// Thus AddrInst must not be added in the matched instructions.
	/// This state can happen when AddrInst is a sext, since it may be moved away.
	/// Therefore, AddrInst may not be valid when MovedAway is true and it must
	/// not be referenced anymore.
	/// \p Opcode is the opcode of \p AddrInst and \p Depth is the current
	/// recursion depth; matching gives up once it reaches 5.
	bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
	                                               unsigned Depth,
	                                               bool *MovedAway) {
	  // Avoid exponential behavior on extremely deep expression trees.
	  if (Depth >= 5) return false;

	  // By default, all matched instructions stay in place.
	  if (MovedAway)
	    *MovedAway = false;

	  switch (Opcode) {
	  case Instruction::PtrToInt:
	    // PtrToInt is always a noop, as we know that the int type is pointer sized.
	    return matchAddr(AddrInst->getOperand(0), Depth);
	  case Instruction::IntToPtr: {
	    auto AS = AddrInst->getType()->getPointerAddressSpace();
	    auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
	    // This inttoptr is a no-op if the integer type is pointer sized.
	    if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
	      return matchAddr(AddrInst->getOperand(0), Depth);
	    return false;
	  }
	  case Instruction::BitCast:
	    // BitCast is always a noop, and we can handle it as long as it is
	    // int->int or pointer->pointer (we don't want int<->fp or something).
	    if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
	         AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
	        // Don't touch identity bitcasts. These were probably put here by LSR,
	        // and we don't want to mess around with them. Assume it knows what it
	        // is doing.
	        AddrInst->getOperand(0)->getType() != AddrInst->getType())
	      return matchAddr(AddrInst->getOperand(0), Depth);
	    return false;
	  case Instruction::AddrSpaceCast: {
	    unsigned SrcAS
	      = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
	    unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
	    if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
	      return matchAddr(AddrInst->getOperand(0), Depth);
	    return false;
	  }
	  case Instruction::Add: {
	    // Check to see if we can merge in the RHS then the LHS. If so, we win.
	    ExtAddrMode BackupAddrMode = AddrMode;
	    unsigned OldSize = AddrModeInsts.size();
	    // Start a transaction at this point.
	    // The LHS may match but not the RHS.
	    // Therefore, we need a higher level restoration point to undo partially
	    // matched operation.
	    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	        TPT.getRestorationPoint();

	    if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
	        matchAddr(AddrInst->getOperand(0), Depth+1))
	      return true;

	    // Restore the old addr mode info.
	    AddrMode = BackupAddrMode;
	    AddrModeInsts.resize(OldSize);
	    TPT.rollback(LastKnownGood);

	    // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
	    if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
	        matchAddr(AddrInst->getOperand(1), Depth+1))
	      return true;

	    // Otherwise we definitely can't merge the ADD in.
	    AddrMode = BackupAddrMode;
	    AddrModeInsts.resize(OldSize);
	    TPT.rollback(LastKnownGood);
	    break;
	  }
	  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
	  case Instruction::Mul:
	  case Instruction::Shl: {
	    // Can only handle X*C and X << C.
	    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
	    if (!RHS || RHS->getBitWidth() > 64)
	      return false;
	    int64_t Scale = RHS->getSExtValue();
	    if (Opcode == Instruction::Shl) {
	      // Turn the shift amount into a multiplier. Shifting a 64-bit value by
	      // 64 or more, or by an amount that is negative once sign extended, is
	      // undefined behavior in C++, so read the amount as unsigned and refuse
	      // to fold shifts that cannot be expressed as a 64-bit scale.
	      uint64_t ShiftAmt = RHS->getZExtValue();
	      if (ShiftAmt >= 64)
	        return false;
	      Scale = static_cast<int64_t>(uint64_t(1) << ShiftAmt);
	    }

	    return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
	  }
	  case Instruction::GetElementPtr: {
	    // Scan the GEP. We check it if it contains constant offsets and at most
	    // one variable offset.
	    int VariableOperand = -1;
	    unsigned VariableScale = 0;

	    int64_t ConstantOffset = 0;
	    gep_type_iterator GTI = gep_type_begin(AddrInst);
	    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
	      if (StructType *STy = GTI.getStructTypeOrNull()) {
	        // Struct field: the index is a constant and contributes the field
	        // offset taken from the struct layout.
	        const StructLayout *SL = DL.getStructLayout(STy);
	        unsigned Idx =
	            cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
	        ConstantOffset += SL->getElementOffset(Idx);
	      } else {
	        uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
	        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
	          ConstantOffset += CI->getSExtValue() * TypeSize;
	        } else if (TypeSize) { // Scales of zero don't do anything.
	          // We only allow one variable index at the moment.
	          if (VariableOperand != -1)
	            return false;

	          // Remember the variable index.
	          VariableOperand = i;
	          VariableScale = TypeSize;
	        }
	      }
	    }

	    // A common case is for the GEP to only do a constant offset. In this case,
	    // just add it to the disp field and check validity.
	    if (VariableOperand == -1) {
	      AddrMode.BaseOffs += ConstantOffset;
	      if (ConstantOffset == 0 ||
	          TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
	        // Check to see if we can fold the base pointer in too.
	        if (matchAddr(AddrInst->getOperand(0), Depth+1))
	          return true;
	      }
	      AddrMode.BaseOffs -= ConstantOffset;
	      return false;
	    }

	    // Save the valid addressing mode in case we can't match.
	    ExtAddrMode BackupAddrMode = AddrMode;
	    unsigned OldSize = AddrModeInsts.size();

	    // See if the scale and offset amount is valid for this target.
	    AddrMode.BaseOffs += ConstantOffset;

	    // Match the base operand of the GEP.
	    if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
	      // If it couldn't be matched, just stuff the value in a register.
	      if (AddrMode.HasBaseReg) {
	        AddrMode = BackupAddrMode;
	        AddrModeInsts.resize(OldSize);
	        return false;
	      }
	      AddrMode.HasBaseReg = true;
	      AddrMode.BaseReg = AddrInst->getOperand(0);
	    }

	    // Match the remaining variable portion of the GEP.
	    if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
	                          Depth)) {
	      // If it couldn't be matched, try stuffing the base into a register
	      // instead of matching it, and retrying the match of the scale.
	      AddrMode = BackupAddrMode;
	      AddrModeInsts.resize(OldSize);
	      if (AddrMode.HasBaseReg)
	        return false;
	      AddrMode.HasBaseReg = true;
	      AddrMode.BaseReg = AddrInst->getOperand(0);
	      AddrMode.BaseOffs += ConstantOffset;
	      if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
	                            VariableScale, Depth)) {
	        // If even that didn't work, bail.
	        AddrMode = BackupAddrMode;
	        AddrModeInsts.resize(OldSize);
	        return false;
	      }
	    }

	    return true;
	  }
	  case Instruction::SExt:
	  case Instruction::ZExt: {
	    // Constant expressions reach this function too; only real instructions
	    // can be promoted.
	    Instruction *Ext = dyn_cast<Instruction>(AddrInst);
	    if (!Ext)
	      return false;

	    // Try to move this ext out of the way of the addressing mode.
	    // Ask for a method for doing so.
	    TypePromotionHelper::Action TPH =
	        TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
	    if (!TPH)
	      return false;

	    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	        TPT.getRestorationPoint();
	    unsigned CreatedInstsCost = 0;
	    // Compute the cost of Ext now: the action below may erase it.
	    unsigned ExtCost = !TLI.isExtFree(Ext);
	    Value *PromotedOperand =
	        TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
	    // SExt has been moved away.
	    // Thus either it will be rematched later in the recursive calls or it is
	    // gone. Anyway, we must not fold it into the addressing mode at this point.
	    // E.g.,
	    // op = add opnd, 1
	    // idx = ext op
	    // addr = gep base, idx
	    // is now:
	    // promotedOpnd = ext opnd <- no match here
	    // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls)
	    // addr = gep base, op <- match
	    if (MovedAway)
	      *MovedAway = true;

	    assert(PromotedOperand &&
	           "TypePromotionHelper should have filtered out those cases");

	    ExtAddrMode BackupAddrMode = AddrMode;
	    unsigned OldSize = AddrModeInsts.size();

	    if (!matchAddr(PromotedOperand, Depth) ||
	        // The total of the new cost is equal to the cost of the created
	        // instructions.
	        // The total of the old cost is equal to the cost of the extension plus
	        // what we have saved in the addressing mode.
	        !isPromotionProfitable(CreatedInstsCost,
	                               ExtCost + (AddrModeInsts.size() - OldSize),
	                               PromotedOperand)) {
	      AddrMode = BackupAddrMode;
	      AddrModeInsts.resize(OldSize);
	      DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
	      TPT.rollback(LastKnownGood);
	      return false;
	    }
	    return true;
	  }
	  }
	  return false;
	}

	/// If we can, try to add the value of 'Addr' into the current addressing mode.
	/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
	/// unmodified. This assumes that Addr is either a pointer type or intptr_t
	/// for the target.
	/// \p Depth is the current recursion depth, forwarded to matchOperationAddr.
	bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
	  // Start a transaction at this point that we will rollback if the matching
	  // fails.
	  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	      TPT.getRestorationPoint();
	  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
	    // Fold in immediates if legal for the target.
	    AddrMode.BaseOffs += CI->getSExtValue();
	    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	      return true;
	    AddrMode.BaseOffs -= CI->getSExtValue();
	  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
	    // If this is a global variable, try to fold it into the addressing mode.
	    if (!AddrMode.BaseGV) {
	      AddrMode.BaseGV = GV;
	      if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	        return true;
	      AddrMode.BaseGV = nullptr;
	    }
	  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
	    ExtAddrMode BackupAddrMode = AddrMode;
	    unsigned OldSize = AddrModeInsts.size();

	    // Check to see if it is possible to fold this operation.
	    bool MovedAway = false;
	    if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
	      // This instruction may have been moved away. If so, there is nothing
	      // to check here.
	      if (MovedAway)
	        return true;
	      // Okay, it's possible to fold this. Check to see if it is actually
	      // profitable to do so. We use a simple cost model to avoid increasing
	      // register pressure too much.
	      if (I->hasOneUse() ||
	          isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
	        AddrModeInsts.push_back(I);
	        return true;
	      }

	      // It isn't profitable to do this, roll back.
	      AddrMode = BackupAddrMode;
	      AddrModeInsts.resize(OldSize);
	      TPT.rollback(LastKnownGood);
	    }
	  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
	    if (matchOperationAddr(CE, CE->getOpcode(), Depth))
	      return true;
	    TPT.rollback(LastKnownGood);
	  } else if (isa<ConstantPointerNull>(Addr)) {
	    // Null pointer gets folded without affecting the addressing mode.
	    return true;
	  }

	  // Worse case, the target should support [reg] addressing modes. :)
	  if (!AddrMode.HasBaseReg) {
	    AddrMode.HasBaseReg = true;
	    AddrMode.BaseReg = Addr;
	    // Still check for legality in case the target supports [imm] but not [i+r].
	    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	      return true;
	    AddrMode.HasBaseReg = false;
	    AddrMode.BaseReg = nullptr;
	  }

	  // If the base register is already taken, see if we can do [r+r].
	  if (AddrMode.Scale == 0) {
	    AddrMode.Scale = 1;
	    AddrMode.ScaledReg = Addr;
	    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	      return true;
	    AddrMode.Scale = 0;
	    AddrMode.ScaledReg = nullptr;
	  }
	  // Couldn't match.
	  TPT.rollback(LastKnownGood);
	  return false;
	}

	/// Check to see if all uses of OpVal by the specified inline asm call are due
	/// to memory operands. If so, return true, otherwise return false.
	static bool IsOperandAMemoryOperand(CallInst CI, InlineAsm IA, Value *OpVal,
	const TargetLowering &TLI,
	const TargetRegisterInfo &TRI) {
	const Function *F = CI->getFunction();
	TargetLowering::AsmOperandInfoVector TargetConstraints =
	TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
	ImmutableCallSite(CI));

	for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
	TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];

	// Compute the constraint code and ConstraintType to use.
	TLI.ComputeConstraintToUse(OpInfo, SDValue());

	// If this asm operand is our Value*, and if it isn't an indirect memory
	// operand, we can't fold it!
	if (OpInfo.CallOperandVal == OpVal &&
	(OpInfo.ConstraintType != TargetLowering::C_Memory \|\|
	!OpInfo.isIndirect))
	return false;
	}

	return true;
	}

	// Max number of memory uses to look at before aborting the search to conserve
	// compile time. FindAllMemoryUses compares its running count of visited uses
	// against this bound and conservatively reports a non-foldable use once the
	// bound is reached.
	static constexpr int MaxMemoryUsesToScan = 20;

	/// Worker for FindAllMemoryUses.
	/// \p SeenInsts is taken by reference so that a single budget of
	/// MaxMemoryUsesToScan uses is shared by the whole recursive walk; uses
	/// visited in nested calls count against the remaining siblings as well.
	static bool FindAllMemoryUsesImpl(
	    Instruction *I,
	    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
	    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
	    const TargetRegisterInfo &TRI, int &SeenInsts) {
	  // If we already considered this instruction, we're done.
	  if (!ConsideredInsts.insert(I).second)
	    return false;

	  // If this is an obviously unfoldable instruction, bail out.
	  if (!MightBeFoldableInst(I))
	    return true;

	  const bool OptSize = I->getFunction()->optForSize();

	  // Loop over all the uses, recursively processing them.
	  for (Use &U : I->uses()) {
	    // Conservatively return true if we're seeing a large number or a deep chain
	    // of users. This avoids excessive compilation times in pathological cases.
	    if (SeenInsts++ >= MaxMemoryUsesToScan)
	      return true;

	    Instruction *UserI = cast<Instruction>(U.getUser());
	    if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
	      MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
	      continue;
	    }

	    if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
	      unsigned opNo = U.getOperandNo();
	      if (opNo != StoreInst::getPointerOperandIndex())
	        return true; // Storing addr, not into addr.
	      MemoryUses.push_back(std::make_pair(SI, opNo));
	      continue;
	    }

	    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
	      unsigned opNo = U.getOperandNo();
	      if (opNo != AtomicRMWInst::getPointerOperandIndex())
	        return true; // Used as a value operand, not as the address.
	      MemoryUses.push_back(std::make_pair(RMW, opNo));
	      continue;
	    }

	    if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
	      unsigned opNo = U.getOperandNo();
	      if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
	        return true; // Used as a value operand, not as the address.
	      MemoryUses.push_back(std::make_pair(CmpX, opNo));
	      continue;
	    }

	    if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
	      // If this is a cold call, we can sink the addressing calculation into
	      // the cold path. See optimizeCallInst
	      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
	        continue;

	      // Any other call is only acceptable when it is inline asm.
	      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
	      if (!IA) return true;

	      // If this is a memory operand, we're cool, otherwise bail out.
	      if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
	        return true;
	      continue;
	    }

	    // Not a memory instruction or call: look through it at its own users.
	    if (FindAllMemoryUsesImpl(UserI, MemoryUses, ConsideredInsts, TLI, TRI,
	                              SeenInsts))
	      return true;
	  }

	  return false;
	}

	/// Recursively walk all the uses of I until we find a memory use.
	/// If we find an obviously non-foldable instruction, return true.
	/// Add the ultimately found memory instructions to MemoryUses, each paired
	/// with the operand number through which it uses the walked value.
	/// \p ConsideredInsts records the instructions already visited.
	/// \p SeenInsts is the number of uses already scanned when the walk starts.
	/// The walk conservatively returns true once MaxMemoryUsesToScan uses have
	/// been scanned in total.
	static bool FindAllMemoryUses(
	    Instruction *I,
	    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
	    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
	    const TargetRegisterInfo &TRI, int SeenInsts = 0) {
	  // SeenInsts is a local copy here; the worker updates it by reference so the
	  // budget is enforced across the entire recursion rather than per path.
	  return FindAllMemoryUsesImpl(I, MemoryUses, ConsideredInsts, TLI, TRI,
	                               SeenInsts);
	}

	/// Return true if Val is already known to be live at the use site that we're
	/// folding it into. If so, there is no cost to include it in the addressing
	/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
	/// instruction already.
	bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value Val,Value KnownLive1,
	Value *KnownLive2) {
	// If Val is either of the known-live values, we know it is live!
	if (Val == nullptr \|\| Val == KnownLive1 \|\| Val == KnownLive2)
	return true;

	// All values other than instructions and arguments (e.g. constants) are live.
	if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;

	// If Val is a constant sized alloca in the entry block, it is live, this is
	// true because it is just a reference to the stack/frame pointer, which is
	// live for the whole function.
	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
	if (AI->isStaticAlloca())
	return true;

	// Check to see if this value is already used in the memory instruction's
	// block. If so, it's already live into the block at the very least, so we
	// can reasonably fold it.
	return Val->isUsedInBasicBlock(MemoryInst->getParent());
	}

	/// It is possible for the addressing mode of the machine to fold the specified
	/// instruction into a load or store that ultimately uses it.
	/// However, the specified instruction has multiple uses.
	/// Given this, it may actually increase register pressure to fold it
	/// into the load. For example, consider this code:
	///
	/// X = ...
	/// Y = X+1
	/// use(Y) -> nonload/store
	/// Z = Y+1
	/// load Z
	///
	/// In this case, Y has multiple uses, and can be folded into the load of Z
	/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
	/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
	/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
	/// number of computations either.
	///
	/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
	/// X was live across 'load Z' for other reasons, we actually would want to
	/// fold the addressing mode in the Z case. This would make Y die earlier.
	bool AddressingModeMatcher::
	isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
	                                     ExtAddrMode &AMAfter) {
	  // The nested matchers created further down set this flag on themselves so
	  // that they only answer "can this be matched", not "is it profitable".
	  if (IgnoreProfitability) return true;

	  // AMBefore is the addressing mode before this instruction was folded into it,
	  // and AMAfter is the addressing mode after the instruction was folded. Get
	  // the set of registers referenced by AMAfter and subtract out those
	  // referenced by AMBefore: this is the set of values which folding in this
	  // address extends the lifetime of.
	  //
	  // Note that there are only two potential values being referenced here,
	  // BaseReg and ScaleReg (global addresses are always available, as are any
	  // folded immediates).
	  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;

	  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
	  // lifetime wasn't extended by adding this instruction.
	  if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
	    BaseReg = nullptr;
	  if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
	    ScaledReg = nullptr;

	  // If folding this instruction (and its subexprs) didn't extend any live
	  // ranges, we're ok with it.
	  if (!BaseReg && !ScaledReg)
	    return true;

	  // If all uses of this instruction can have the address mode sunk into them,
	  // we can remove the addressing mode and effectively trade one live register
	  // for another (at worst.)  In this context, folding an addressing mode into
	  // the use is just a particularly nice way of sinking it.
	  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
	  SmallPtrSet<Instruction*, 16> ConsideredInsts;
	  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
	    return false;  // Has a non-memory, non-foldable use!

	  // Now that we know that all uses of this instruction are part of a chain of
	  // computation involving only operations that could theoretically be folded
	  // into a memory use, loop over each of these memory operation uses and see
	  // if they could *actually* fold the instruction.  The assumption is that
	  // addressing modes are cheap and that duplicating the computation involved
	  // many times is worthwhile, even on a fastpath. For sinking candidates
	  // (i.e. cold call sites), this serves as a way to prevent excessive code
	  // growth since most architectures have some reasonable small and fast way to
	  // compute an effective address.  (e.g. LEA on x86)
	  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
	  for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
	    Instruction *User = MemoryUses[i].first;
	    unsigned OpNo = MemoryUses[i].second;

	    // Get the access type of this use.  If the use isn't a pointer, we don't
	    // know what it accesses.
	    Value *Address = User->getOperand(OpNo);
	    PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
	    if (!AddrTy)
	      return false;
	    Type *AddressAccessTy = AddrTy->getElementType();
	    unsigned AS = AddrTy->getAddressSpace();

	    // Do a match against the root of this address, ignoring profitability. This
	    // will tell us if the addressing mode for the memory operation will
	    // *actually* cover the shared instruction.
	    ExtAddrMode Result;
	    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	        TPT.getRestorationPoint();
	    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
	                                  AddressAccessTy, AS,
	                                  MemoryInst, Result, InsertedInsts,
	                                  PromotedInsts, TPT);
	    Matcher.IgnoreProfitability = true;
	    bool Success = Matcher.matchAddr(Address, 0);
	    (void)Success; assert(Success && "Couldn't select *anything*?");

	    // The match was to check the profitability, the changes made are not
	    // part of the original matcher. Therefore, they should be dropped
	    // otherwise the original matcher will not present the right state.
	    TPT.rollback(LastKnownGood);

	    // If the match didn't cover I, then it won't be shared by it.
	    if (!is_contained(MatchedAddrModeInsts, I))
	      return false;

	    // Reset the scratch list before probing the next memory use.
	    MatchedAddrModeInsts.clear();
	  }

	  return true;
	}

	/// Return true if the specified values are defined in a
	/// different basic block than BB.
	static bool IsNonLocalValue(Value V, BasicBlock BB) {
	if (Instruction *I = dyn_cast<Instruction>(V))
	return I->getParent() != BB;
	return false;
	}

	/// Sink addressing mode computation immediate before MemoryInst if doing so
	/// can be done without increasing register pressure. The need for the
	/// register pressure constraint means this can end up being an all or nothing
	/// decision for all uses of the same addressing computation.
	///
	/// Load and Store Instructions often have addressing modes that can do
	/// significant amounts of computation. As such, instruction selection will try
	/// to get the load or store to do as much computation as possible for the
	/// program. The problem is that isel can only see within a single block. As
	/// such, we sink as much legal addressing mode work into the block as possible.
	///
	/// This method is used to optimize both load/store and inline asms with memory
	/// operands. It's also used to sink addressing computations feeding into cold
	/// call sites into their (cold) basic block.
	///
	/// The motivation for handling sinking into cold blocks is that doing so can
	/// both enable other address mode sinking (by satisfying the register pressure
	/// constraint above), and reduce register pressure globally (by removing the
	/// addressing mode computation from the fast path entirely.).
	bool CodeGenPrepare::optimizeMemoryInst(Instruction MemoryInst, Value Addr,
	Type *AccessTy, unsigned AddrSpace) {
	Value *Repl = Addr;

	// Try to collapse single-value PHI nodes. This is necessary to undo
	// unprofitable PRE transformations.
	SmallVector<Value*, 8> worklist;
	SmallPtrSet<Value*, 16> Visited;
	worklist.push_back(Addr);

	// Use a worklist to iteratively look through PHI and select nodes, and
	// ensure that the addressing mode obtained from the non-PHI/select roots of
	// the graph are compatible.
	bool PhiOrSelectSeen = false;
	SmallVector<Instruction*, 16> AddrModeInsts;
	const SimplifyQuery SQ(*DL, TLInfo);
	AddressingModeCombiner AddrModes(SQ, { Addr, MemoryInst->getParent() });
	TypePromotionTransaction TPT(RemovedInsts);
	TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	TPT.getRestorationPoint();
	while (!worklist.empty()) {
	Value *V = worklist.back();
	worklist.pop_back();

	// We allow traversing cyclic Phi nodes.
	// In case of success after this loop we ensure that traversing through
	// Phi nodes ends up with all cases to compute address of the form
	// BaseGV + Base + Scale * Index + Offset
	// where Scale and Offset are constans and BaseGV, Base and Index
	// are exactly the same Values in all cases.
	// It means that BaseGV, Scale and Offset dominate our memory instruction
	// and have the same value as they had in address computation represented
	// as Phi. So we can safely sink address computation to memory instruction.
	if (!Visited.insert(V).second)
	continue;

	// For a PHI node, push all of its incoming values.
	if (PHINode *P = dyn_cast<PHINode>(V)) {
	for (Value *IncValue : P->incoming_values())
	worklist.push_back(IncValue);
	PhiOrSelectSeen = true;
	continue;
	}
	// Similar for select.
	if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
	worklist.push_back(SI->getFalseValue());
	worklist.push_back(SI->getTrueValue());
	PhiOrSelectSeen = true;
	continue;
	}

	// For non-PHIs, determine the addressing mode being computed. Note that
	// the result may differ depending on what other uses our candidate
	// addressing instructions might have.
	AddrModeInsts.clear();
	ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
	V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, TLI, TRI,
	InsertedInsts, PromotedInsts, TPT);
	NewAddrMode.OriginalValue = V;

	if (!AddrModes.addNewAddrMode(NewAddrMode))
	break;
	}

	// Try to combine the AddrModes we've collected. If we couldn't collect any,
	// or we have multiple but either couldn't combine them or combining them
	// wouldn't do anything useful, bail out now.
	if (!AddrModes.combineAddrModes()) {
	TPT.rollback(LastKnownGood);
	return false;
	}
	TPT.commit();

	// Get the combined AddrMode (or the only AddrMode, if we only had one).
	ExtAddrMode AddrMode = AddrModes.getAddrMode();

	// If all the instructions matched are already in this BB, don't do anything.
	// If we saw a Phi node then it is not local definitely, and if we saw a select
	// then we want to push the address calculation past it even if it's already
	// in this BB.
	if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
	return IsNonLocalValue(V, MemoryInst->getParent());
	})) {
	DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
	return false;
	}

	// Insert this computation right after this user. Since our caller is
	// scanning from the top of the BB to the bottom, reuse of the expr are
	// guaranteed to happen later.
	IRBuilder<> Builder(MemoryInst);

	// Now that we determined the addressing expression we want to use and know
	// that we have to sink it into this block. Check to see if we have already
	// done this for some other load/store instr in this block. If so, reuse
	// the computation. Before attempting reuse, check if the address is valid
	// as it may have been erased.

	WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];

	Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
	if (SunkAddr) {
	DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
	<< *MemoryInst << "\n");
	if (SunkAddr->getType() != Addr->getType())
	SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
	} else if (AddrSinkUsingGEPs \|\|
	(!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
	SubtargetInfo->useAA())) {
	// By default, we use the GEP-based method when AA is used later. This
	// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
	DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
	<< *MemoryInst << "\n");
	Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
	Value ResultPtr = nullptr, ResultIndex = nullptr;

	// First, find the pointer.
	if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
	ResultPtr = AddrMode.BaseReg;
	AddrMode.BaseReg = nullptr;
	}

	if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
	// We can't add more than one pointer together, nor can we scale a
	// pointer (both of which seem meaningless).
	if (ResultPtr \|\| AddrMode.Scale != 1)
	return false;

	ResultPtr = AddrMode.ScaledReg;
	AddrMode.Scale = 0;
	}

	// It is only safe to sign extend the BaseReg if we know that the math
	// required to create it did not overflow before we extend it. Since
	// the original IR value was tossed in favor of a constant back when
	// the AddrMode was created we need to bail out gracefully if widths
	// do not match instead of extending it.
	//
	// (See below for code to add the scale.)
	if (AddrMode.Scale) {
	Type *ScaledRegTy = AddrMode.ScaledReg->getType();
	if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
	cast<IntegerType>(ScaledRegTy)->getBitWidth())
	return false;
	}

	if (AddrMode.BaseGV) {
	if (ResultPtr)
	return false;

	ResultPtr = AddrMode.BaseGV;
	}

	// If the real base value actually came from an inttoptr, then the matcher
	// will look through it and provide only the integer value. In that case,
	// use it here.
	if (!DL->isNonIntegralPointerType(Addr->getType())) {
	if (!ResultPtr && AddrMode.BaseReg) {
	ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
	"sunkaddr");
	AddrMode.BaseReg = nullptr;
	} else if (!ResultPtr && AddrMode.Scale == 1) {
	ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
	"sunkaddr");
	AddrMode.Scale = 0;
	}
	}

	if (!ResultPtr &&
	!AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
	SunkAddr = Constant::getNullValue(Addr->getType());
	} else if (!ResultPtr) {
	return false;
	} else {
	Type *I8PtrTy =
	Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
	Type *I8Ty = Builder.getInt8Ty();

	// Start with the base register. Do this first so that subsequent address
	// matching finds it last, which will prevent it from trying to match it
	// as the scaled value in case it happens to be a mul. That would be
	// problematic if we've sunk a different mul for the scale, because then
	// we'd end up sinking both muls.
	if (AddrMode.BaseReg) {
	Value *V = AddrMode.BaseReg;
	if (V->getType() != IntPtrTy)
	V = Builder.CreateIntCast(V, IntPtrTy, /isSigned=/true, "sunkaddr");

	ResultIndex = V;
	}

	// Add the scale value.
	if (AddrMode.Scale) {
	Value *V = AddrMode.ScaledReg;
	if (V->getType() == IntPtrTy) {
	// done.
	} else {
	assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
	cast<IntegerType>(V->getType())->getBitWidth() &&
	"We can't transform if ScaledReg is too narrow");
	V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
	}

	if (AddrMode.Scale != 1)
	V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
	"sunkaddr");
	if (ResultIndex)
	ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
	else
	ResultIndex = V;
	}

	// Add in the Base Offset if present.
	if (AddrMode.BaseOffs) {
	Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
	if (ResultIndex) {
	// We need to add this separately from the scale above to help with
	// SDAG consecutive load/store merging.
	if (ResultPtr->getType() != I8PtrTy)
	ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
	ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
	}

	ResultIndex = V;
	}

	if (!ResultIndex) {
	SunkAddr = ResultPtr;
	} else {
	if (ResultPtr->getType() != I8PtrTy)
	ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
	SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
	}

	if (SunkAddr->getType() != Addr->getType())
	SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
	}
	} else {
	// We'd require a ptrtoint/inttoptr down the line, which we can't do for
	// non-integral pointers, so in that case bail out now.
	Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
	Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
	PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
	PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
	if (DL->isNonIntegralPointerType(Addr->getType()) \|\|
	(BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) \|\|
	(ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) \|\|
	(AddrMode.BaseGV &&
	DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
	return false;

	DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
	<< *MemoryInst << "\n");
	Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
	Value *Result = nullptr;

	// Start with the base register. Do this first so that subsequent address
	// matching finds it last, which will prevent it from trying to match it
	// as the scaled value in case it happens to be a mul. That would be
	// problematic if we've sunk a different mul for the scale, because then
	// we'd end up sinking both muls.
	if (AddrMode.BaseReg) {
	Value *V = AddrMode.BaseReg;
	if (V->getType()->isPointerTy())
	V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
	if (V->getType() != IntPtrTy)
	V = Builder.CreateIntCast(V, IntPtrTy, /isSigned=/true, "sunkaddr");
	Result = V;
	}

	// Add the scale value.
	if (AddrMode.Scale) {
	Value *V = AddrMode.ScaledReg;
	if (V->getType() == IntPtrTy) {
	// done.
	} else if (V->getType()->isPointerTy()) {
	V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
	} else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
	cast<IntegerType>(V->getType())->getBitWidth()) {
	V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
	} else {
	// It is only safe to sign extend the BaseReg if we know that the math
	// required to create it did not overflow before we extend it. Since
	// the original IR value was tossed in favor of a constant back when
	// the AddrMode was created we need to bail out gracefully if widths
	// do not match instead of extending it.
	Instruction *I = dyn_cast_or_null<Instruction>(Result);
	if (I && (Result != AddrMode.BaseReg))
	I->eraseFromParent();
	return false;
	}
	if (AddrMode.Scale != 1)
	V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
	"sunkaddr");
	if (Result)
	Result = Builder.CreateAdd(Result, V, "sunkaddr");
	else
	Result = V;
	}

	// Add in the BaseGV if present.
	if (AddrMode.BaseGV) {
	Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
	if (Result)
	Result = Builder.CreateAdd(Result, V, "sunkaddr");
	else
	Result = V;
	}

	// Add in the Base Offset if present.
	if (AddrMode.BaseOffs) {
	Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
	if (Result)
	Result = Builder.CreateAdd(Result, V, "sunkaddr");
	else
	Result = V;
	}

	if (!Result)
	SunkAddr = Constant::getNullValue(Addr->getType());
	else
	SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
	}

	MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
	// Store the newly computed address into the cache. In the case we reused a
	// value, this should be idempotent.
	SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);

	// If we have no uses, recursively delete the value and all dead instructions
	// using it.
	if (Repl->use_empty()) {
	// This can cause recursive deletion, which can invalidate our iterator.
	// Use a WeakTrackingVH to hold onto it in case this happens.
	Value CurValue = &CurInstIterator;
	WeakTrackingVH IterHandle(CurValue);
	BasicBlock *BB = CurInstIterator->getParent();

	RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);

	if (IterHandle != CurValue) {
	// If the iterator instruction was recursively deleted, start over at the
	// start of the block.
	CurInstIterator = BB->begin();
	SunkAddrs.clear();
	}
	}
	++NumMemoryInsts;
	return true;
	}

	/// If there are any memory operands, use optimizeMemoryInst to sink their
	/// address computing into the block when possible / profitable.
	///
	/// \param CS the inline asm call whose constraints are inspected.
	/// \return true if optimizeMemoryInst changed at least one operand.
	bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
	  bool MadeChange = false;

	  const TargetRegisterInfo *TRI =
	      TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
	  TargetLowering::AsmOperandInfoVector TargetConstraints =
	      TLI->ParseConstraints(*DL, TRI, CS);
	  // Index of the call argument that corresponds to the current constraint;
	  // only indirect memory operands and inputs advance it.
	  unsigned ArgNo = 0;
	  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
	    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];

	    // Compute the constraint code and ConstraintType to use.
	    TLI->ComputeConstraintToUse(OpInfo, SDValue());

	    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
	        OpInfo.isIndirect) {
	      Value *OpVal = CS->getArgOperand(ArgNo++);
	      MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
	    } else if (OpInfo.Type == InlineAsm::isInput)
	      ArgNo++;
	  }

	  return MadeChange;
	}

	/// \brief Check if all the uses of \p Val are equivalent (or free) zero or
	/// sign extensions.
	///
	/// \p Val must have at least one use (asserted below). The first user decides
	/// whether the common extension kind is sext or zext; every other user is
	/// compared against it.
	static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
	  assert(!Val->use_empty() && "Input must have at least one use");
	  const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
	  bool IsSExt = isa<SExtInst>(FirstUser);
	  Type *ExtTy = FirstUser->getType();
	  for (const User *U : Val->users()) {
	    const Instruction *UI = cast<Instruction>(U);
	    if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
	      return false;
	    Type *CurTy = UI->getType();
	    // Same input and output types: Same instruction after CSE.
	    if (CurTy == ExtTy)
	      continue;

	    // If IsSExt is true, we are in this situation:
	    // a = Val
	    // b = sext ty1 a to ty2
	    // c = sext ty1 a to ty3
	    // Assuming ty2 is shorter than ty3, this could be turned into:
	    // a = Val
	    // b = sext ty1 a to ty2
	    // c = sext ty2 b to ty3
	    // However, the last sext is not free.
	    if (IsSExt)
	      return false;

	    // This is a ZExt, maybe this is free to extend from one type to another.
	    // In that case, we would not account for a different use.
	    Type *NarrowTy;
	    Type *LargeTy;
	    if (ExtTy->getScalarType()->getIntegerBitWidth() >
	        CurTy->getScalarType()->getIntegerBitWidth()) {
	      NarrowTy = CurTy;
	      LargeTy = ExtTy;
	    } else {
	      NarrowTy = ExtTy;
	      LargeTy = CurTy;
	    }

	    if (!TLI.isZExtFree(NarrowTy, LargeTy))
	      return false;
	  }
	  // All uses are the same or can be derived from one another for free.
	  return true;
	}

	/// \brief Try to speculatively promote extensions in \p Exts and continue
	/// promoting through newly promoted operands recursively as far as doing so is
	/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
	/// When some promotion happened, \p TPT contains the proper state to revert
	/// them.
	///
	/// \p CreatedInstsCost is the instruction cost accumulated by the enclosing
	/// promotions; it is added to the cost of each promotion attempted here.
	///
	/// \return true if some promotion happened, false otherwise.
	bool CodeGenPrepare::tryToPromoteExts(
	    TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
	    SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
	    unsigned CreatedInstsCost) {
	  bool Promoted = false;

	  // Iterate over all the extensions to try to promote them.
	  for (auto I : Exts) {
	    // Early check if we directly have ext(load).
	    if (isa<LoadInst>(I->getOperand(0))) {
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }

	    // Check whether or not we want to do any promotion.  The reason we have
	    // this check inside the for loop is to catch the case where an extension
	    // is directly fed by a load because in such case the extension can be moved
	    // up without any promotion on its operands.
	    if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
	      return false;

	    // Get the action to perform the promotion.
	    TypePromotionHelper::Action TPH =
	        TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
	    // Check if we can promote.
	    if (!TPH) {
	      // Save the current extension as we cannot move up through its operand.
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }

	    // Save the current state.
	    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	        TPT.getRestorationPoint();
	    SmallVector<Instruction *, 4> NewExts;
	    unsigned NewCreatedInstsCost = 0;
	    // 1 when the extension is not free for the target, 0 otherwise.
	    unsigned ExtCost = !TLI->isExtFree(I);
	    // Promote.
	    Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
	                             &NewExts, nullptr, *TLI);
	    assert(PromotedVal &&
	           "TypePromotionHelper should have filtered out those cases");

	    // We would be able to merge only one extension in a load.
	    // Therefore, if we have more than 1 new extension we heuristically
	    // cut this search path, because it means we degrade the code quality.
	    // With exactly 2, the transformation is neutral, because we will merge
	    // one extension but leave one. However, we optimistically keep going,
	    // because the new extension may be removed too.
	    long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
	    // FIXME: It would be possible to propagate a negative value instead of
	    // conservatively ceiling it to 0.
	    TotalCreatedInstsCost =
	        std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
	    if (!StressExtLdPromotion &&
	        (TotalCreatedInstsCost > 1 ||
	         !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
	      // This promotion is not profitable, rollback to the previous state, and
	      // save the current extension in ProfitablyMovedExts as the latest
	      // speculative promotion turned out to be unprofitable.
	      TPT.rollback(LastKnownGood);
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }
	    // Continue promoting NewExts as far as doing so is profitable.
	    SmallVector<Instruction *, 2> NewlyMovedExts;
	    (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
	    bool NewPromoted = false;
	    for (auto ExtInst : NewlyMovedExts) {
	      Instruction *MovedExt = cast<Instruction>(ExtInst);
	      Value *ExtOperand = MovedExt->getOperand(0);
	      // If we have reached to a load, we need this extra profitability check
	      // as it could potentially be merged into an ext(load).
	      if (isa<LoadInst>(ExtOperand) &&
	          !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
	            (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
	        continue;

	      ProfitablyMovedExts.push_back(MovedExt);
	      NewPromoted = true;
	    }

	    // If none of speculative promotions for NewExts is profitable, rollback
	    // and save the current extension (I) as the last profitable extension.
	    if (!NewPromoted) {
	      TPT.rollback(LastKnownGood);
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }
	    // The promotion is profitable.
	    Promoted = true;
	  }
	  return Promoted;
	}

	/// Merging redundant sexts when one is dominating the other.
	///
	/// Walks every entry of ValToSExtendedUses. For each recorded sext that is
	/// still a live SExtInst of the entry's value, it is either merged into an
	/// already kept sext that dominates it, replaces a kept sext that it
	/// dominates, or is kept as a new candidate. Replaced instructions are
	/// recorded in RemovedInsts and unlinked from their parent (not deleted here).
	///
	/// \return true if at least one sext was merged away.
	bool CodeGenPrepare::mergeSExts(Function &F) {
	  DominatorTree DT(F);
	  bool Changed = false;
	  for (auto &Entry : ValToSExtendedUses) {
	    SExts &Insts = Entry.second;
	    // Sexts kept so far for this value.
	    SExts CurPts;
	    for (Instruction *Inst : Insts) {
	      if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
	          Inst->getOperand(0) != Entry.first)
	        continue;
	      bool inserted = false;
	      for (auto &Pt : CurPts) {
	        if (DT.dominates(Inst, Pt)) {
	          // Inst dominates the kept sext: Inst takes its place.
	          Pt->replaceAllUsesWith(Inst);
	          RemovedInsts.insert(Pt);
	          Pt->removeFromParent();
	          Pt = Inst;
	          inserted = true;
	          Changed = true;
	          break;
	        }
	        if (!DT.dominates(Pt, Inst))
	          // Give up if we need to merge in a common dominator as the
	          // experiments show it is not profitable.
	          continue;
	        // The kept sext dominates Inst: fold Inst into it.
	        Inst->replaceAllUsesWith(Pt);
	        RemovedInsts.insert(Inst);
	        Inst->removeFromParent();
	        inserted = true;
	        Changed = true;
	        break;
	      }
	      if (!inserted)
	        CurPts.push_back(Inst);
	    }
	  }
	  return Changed;
	}

	/// Return true, if an ext(load) can be formed from an extension in
	/// \p MovedExts.
	///
	/// \param MovedExts   candidate extensions to scan.
	/// \param LI          [out] set to the load feeding the first extension in
	///                    \p MovedExts whose operand is a load; left untouched if
	///                    there is none.
	/// \param Inst        [out] set to that extension.
	/// \param HasPromoted whether a promotion happened beforehand; when false, a
	///                    load and extension already in the same block are
	///                    rejected early.
	bool CodeGenPrepare::canFormExtLd(
	    const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
	    Instruction *&Inst, bool HasPromoted) {
	  for (auto *MovedExtInst : MovedExts) {
	    if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
	      LI = cast<LoadInst>(MovedExtInst->getOperand(0));
	      Inst = MovedExtInst;
	      break;
	    }
	  }
	  if (!LI)
	    return false;

	  // If they're already in the same block, there's nothing to do.
	  // Make the cheap checks first if we did not promote.
	  // If we promoted, we need to check if it is indeed profitable.
	  if (!HasPromoted && LI->getParent() == Inst->getParent())
	    return false;

	  return TLI->isExtLoad(LI, Inst, *DL);
	}

	/// Relocate a zext/sext that is fed by a load so that it sits in the load's
	/// basic block, unless conditions are unfavorable. SelectionDAG can then fold
	/// the extend into the load.
	///
	/// E.g.,
	/// \code
	/// %ld = load i32* %addr
	/// %add = add nuw i32 %ld, 4
	/// %zext = zext i32 %add to i64
	/// \endcode
	/// =>
	/// \code
	/// %ld = load i32* %addr
	/// %zext = zext i32 %ld to i64
	/// %add = add nuw i64 %zext, 4
	/// \endcode
	/// The promotion of %add to i64 is carried out by tryToPromoteExts(), which
	/// is what lets zext(load i32*) to i64 be matched.
	///
	/// In addition, try to promote the computations that produce a sign extended
	/// value used in memory accesses.
	/// E.g.,
	/// \code
	/// a = add nsw i32 b, 3
	/// d = sext i32 a to i64
	/// e = getelementptr ..., i64 d
	/// \endcode
	/// =>
	/// \code
	/// f = sext i32 b to i64
	/// a = add nsw i64 f, 3
	/// e = getelementptr ..., i64 a
	/// \endcode
	///
	/// \p Inst[in/out] the extension may be modified during the process if some
	/// promotions apply.
	bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
	  // Without TLI neither ext(load) formation nor address type promotion can
	  // do anything useful.
	  if (!TLI)
	    return false;

	  // Ask the target whether this looks like a sext worth considering for
	  // address type promotion (e.g. right type, used in memory accesses).
	  bool PromoteWithoutCommonHeader = false;
	  bool WorthAddrTypePromotion = TTI->shouldConsiderAddressTypePromotion(
	      *Inst, PromoteWithoutCommonHeader);

	  TypePromotionTransaction TPT(RemovedInsts);
	  TypePromotionTransaction::ConstRestorationPt Checkpoint =
	      TPT.getRestorationPoint();

	  // Speculatively promote, starting from Inst alone.
	  SmallVector<Instruction *, 1> Seeds;
	  SmallVector<Instruction *, 2> MovedExts;
	  Seeds.push_back(Inst);
	  bool DidPromote = tryToPromoteExts(TPT, Seeds, MovedExts);

	  // Look for a load being extended.
	  LoadInst *Load = nullptr;
	  Instruction *ExtOfLoad;

	  if (!canFormExtLd(MovedExts, Load, ExtOfLoad, DidPromote)) {
	    // No extended load can be formed. Keep promoting SExts when the target
	    // considers it worthwhile; otherwise undo the speculation.
	    if (WorthAddrTypePromotion &&
	        performAddressTypePromotion(Inst, PromoteWithoutCommonHeader,
	                                    DidPromote, TPT, MovedExts))
	      return true;

	    TPT.rollback(Checkpoint);
	    return false;
	  }

	  // The promoted chain lets us form an extended load: keep the changes.
	  assert(Load && ExtOfLoad && "Expect a valid load and extension");
	  TPT.commit();

	  // Move the extend into the same block as the load.
	  ExtOfLoad->moveAfter(Load);

	  // CGP does not check if the zext would be speculatively executed when moved
	  // to the same basic block as the load. Preserving its original location
	  // would pessimize the debugging experience, as well as negatively impact
	  // the quality of sample pgo. We don't want to use "line 0" as that has a
	  // size cost in the line-table section and logically the zext can be seen as
	  // part of the load. Therefore we conservatively reuse the same debug
	  // location for the load and the zext.
	  ExtOfLoad->setDebugLoc(Load->getDebugLoc());

	  ++NumExtsMoved;
	  Inst = ExtOfLoad;
	  return true;
	}

	// Perform address type promotion if doing so is profitable.
	// If AllowPromotionWithoutCommonHeader == false, we should find other sext
	// instructions that sign extended the same initial value. However, if
	// AllowPromotionWithoutCommonHeader == true, we expect promoting the
	// extension is just profitable.
	//
	// Inst is updated to the last entry of SpeculativelyMovedExts when the
	// speculative promotion in TPT is committed. Returns true only if a promotion
	// actually happened, either the one in TPT (HasPromoted) or one performed
	// here for a previously deferred extension.
	bool CodeGenPrepare::performAddressTypePromotion(
	    Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
	    bool HasPromoted, TypePromotionTransaction &TPT,
	    SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
	  bool Promoted = false;
	  SmallPtrSet<Instruction *, 1> UnhandledExts;
	  bool AllSeenFirst = true;
	  for (auto I : SpeculativelyMovedExts) {
	    Value *HeadOfChain = I->getOperand(0);
	    DenseMap<Value *, Instruction *>::iterator AlreadySeen =
	        SeenChainsForSExt.find(HeadOfChain);
	    // If there is an unhandled SExt which has the same header, try to promote
	    // it as well.
	    if (AlreadySeen != SeenChainsForSExt.end()) {
	      if (AlreadySeen->second != nullptr)
	        UnhandledExts.insert(AlreadySeen->second);
	      AllSeenFirst = false;
	    }
	  }

	  if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
	                        SpeculativelyMovedExts.size() == 1)) {
	    TPT.commit();
	    if (HasPromoted)
	      Promoted = true;
	    for (auto I : SpeculativelyMovedExts) {
	      Value *HeadOfChain = I->getOperand(0);
	      // A null entry marks the chain head as already handled.
	      SeenChainsForSExt[HeadOfChain] = nullptr;
	      ValToSExtendedUses[HeadOfChain].push_back(I);
	    }
	    // Update Inst as promotion happen.
	    Inst = SpeculativelyMovedExts.pop_back_val();
	  } else {
	    // This is the first chain visited from the header, keep the current chain
	    // as unhandled. Defer to promote this until we encounter another SExt
	    // chain derived from the same header.
	    for (auto I : SpeculativelyMovedExts) {
	      Value *HeadOfChain = I->getOperand(0);
	      SeenChainsForSExt[HeadOfChain] = Inst;
	    }
	    return false;
	  }

	  if (!AllSeenFirst && !UnhandledExts.empty())
	    for (auto VisitedSExt : UnhandledExts) {
	      if (RemovedInsts.count(VisitedSExt))
	        continue;
	      // Note: this TPT and HasPromoted intentionally shadow the parameters;
	      // each deferred extension is promoted in its own transaction.
	      TypePromotionTransaction TPT(RemovedInsts);
	      SmallVector<Instruction *, 1> Exts;
	      SmallVector<Instruction *, 2> Chains;
	      Exts.push_back(VisitedSExt);
	      bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
	      TPT.commit();
	      if (HasPromoted)
	        Promoted = true;
	      for (auto I : Chains) {
	        Value *HeadOfChain = I->getOperand(0);
	        // Mark this as handled.
	        SeenChainsForSExt[HeadOfChain] = nullptr;
	        ValToSExtendedUses[HeadOfChain].push_back(I);
	      }
	    }
	  return Promoted;
	}

	/// If the extension \p I and its source operand are both used outside of the
	/// block defining \p I, rewrite the out-of-block uses of the source so that
	/// they use a trunc of \p I instead. At most one trunc is inserted per user
	/// block.
	///
	/// \return true if at least one use of the source was rewritten.
	bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
	  BasicBlock *DefBB = I->getParent();

	  // If the result of a {s|z}ext and its source are both live out, rewrite all
	  // other uses of the source with result of extension.
	  Value *Src = I->getOperand(0);
	  if (Src->hasOneUse())
	    return false;

	  // Only do this xform if truncating is free.
	  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
	    return false;

	  // Only safe to perform the optimization if the source is also defined in
	  // this block.
	  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
	    return false;

	  bool DefIsLiveOut = false;
	  for (User *U : I->users()) {
	    Instruction *UI = cast<Instruction>(U);

	    // Figure out which BB this ext is used in.
	    BasicBlock *UserBB = UI->getParent();
	    if (UserBB == DefBB) continue;
	    DefIsLiveOut = true;
	    break;
	  }
	  if (!DefIsLiveOut)
	    return false;

	  // Make sure none of the uses are PHI nodes.
	  for (User *U : Src->users()) {
	    Instruction *UI = cast<Instruction>(U);
	    BasicBlock *UserBB = UI->getParent();
	    if (UserBB == DefBB) continue;
	    // Be conservative. We don't want this xform to end up introducing
	    // reloads just before load / store instructions.
	    if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
	      return false;
	  }

	  // InsertedTruncs - Only insert one trunc in each block once.
	  DenseMap<BasicBlock*, Instruction*> InsertedTruncs;

	  bool MadeChange = false;
	  for (Use &U : Src->uses()) {
	    Instruction *User = cast<Instruction>(U.getUser());

	    // Figure out which BB this ext is used in.
	    BasicBlock *UserBB = User->getParent();
	    if (UserBB == DefBB) continue;

	    // Both src and def are live in this block. Rewrite the use.
	    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];

	    if (!InsertedTrunc) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
	      InsertedInsts.insert(InsertedTrunc);
	    }

	    // Replace a use of the {s|z}ext source with a use of the result.
	    U = InsertedTrunc;
	    ++NumExtUses;
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	// Find loads whose uses only use some of the loaded value's bits. Add an "and"
	// just after the load if the target can fold this into one extload instruction,
	// with the hope of eliminating some of the other later "and" instructions using
	// the loaded value. "and"s that are made trivially redundant by the insertion
	// of the new "and" are removed by this function, while others (e.g. those whose
	// path from the load goes through a phi) are left for isel to potentially
	// remove.
	//
	// For example:
	//
	// b0:
	// x = load i32
	// ...
	// b1:
	// y = and x, 0xff
	// z = use y
	//
	// becomes:
	//
	// b0:
	// x = load i32
	// x' = and x, 0xff
	// ...
	// b1:
	// z = use x'
	//
	// whereas:
	//
	// b0:
	// x1 = load i32
	// ...
	// b1:
	// x2 = load i32
	// ...
	// b2:
	// x = phi x1, x2
	// y = and x, 0xff
	//
	// becomes (after a call to optimizeLoadExt for each load):
	//
	// b0:
	// x1 = load i32
	// x1' = and x1, 0xff
	// ...
	// b1:
	// x2 = load i32
	// x2' = and x2, 0xff
	// ...
	// b2:
	// x = phi x1', x2'
	// y = and x, 0xff
	/// Walks every user of \p Load (looking through PHI nodes) to compute which
	/// bits of the loaded value are demanded. If the demanded bits form a low-bit
	/// mask that matches the widest constant 'and' mask seen, and the target
	/// reports the corresponding ZEXTLOAD as legal, an 'and' with that mask is
	/// inserted right after the load and all other users are redirected to it.
	/// 'and's on the load with exactly that mask are then erased.
	/// \return true if a new 'and' was inserted.
	bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
	  if (!Load->isSimple() ||
	      !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
	    return false;

	  // Skip loads we've already transformed.
	  if (Load->hasOneUse() &&
	      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
	    return false;

	  // Look at all uses of Load, looking through phis, to determine how many bits
	  // of the loaded value are needed.
	  SmallVector<Instruction *, 8> WorkList;
	  SmallPtrSet<Instruction *, 16> Visited;
	  SmallVector<Instruction *, 8> AndsToMaybeRemove;
	  for (auto *U : Load->users())
	    WorkList.push_back(cast<Instruction>(U));

	  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
	  unsigned BitWidth = LoadResultVT.getSizeInBits();
	  APInt DemandBits(BitWidth, 0);
	  APInt WidestAndBits(BitWidth, 0);

	  while (!WorkList.empty()) {
	    Instruction *I = WorkList.back();
	    WorkList.pop_back();

	    // Break use-def graph loops.
	    if (!Visited.insert(I).second)
	      continue;

	    // For a PHI node, push all of its users.
	    if (auto *Phi = dyn_cast<PHINode>(I)) {
	      for (auto *U : Phi->users())
	        WorkList.push_back(cast<Instruction>(U));
	      continue;
	    }

	    switch (I->getOpcode()) {
	    case Instruction::And: {
	      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
	      if (!AndC)
	        return false;
	      APInt AndBits = AndC->getValue();
	      DemandBits |= AndBits;
	      // Keep track of the widest and mask we see.
	      if (AndBits.ugt(WidestAndBits))
	        WidestAndBits = AndBits;
	      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
	        AndsToMaybeRemove.push_back(I);
	      break;
	    }

	    case Instruction::Shl: {
	      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
	      if (!ShlC)
	        return false;
	      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
	      DemandBits.setLowBits(BitWidth - ShiftAmt);
	      break;
	    }

	    case Instruction::Trunc: {
	      EVT TruncVT = TLI->getValueType(*DL, I->getType());
	      unsigned TruncBitWidth = TruncVT.getSizeInBits();
	      DemandBits.setLowBits(TruncBitWidth);
	      break;
	    }

	    default:
	      // Any other kind of user is not understood; give up.
	      return false;
	    }
	  }

	  uint32_t ActiveBits = DemandBits.getActiveBits();
	  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
	  // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
	  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
	  // (and (load x) 1) is not matched as a single instruction, rather as a LDR
	  // followed by an AND.
	  // TODO: Look into removing this restriction by fixing backends to either
	  // return false for isLoadExtLegal for i1 or have them select this pattern to
	  // a single instruction.
	  //
	  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
	  // mask, since these are the only ands that will be removed by isel.
	  if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
	      WidestAndBits != DemandBits)
	    return false;

	  LLVMContext &Ctx = Load->getType()->getContext();
	  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
	  EVT TruncVT = TLI->getValueType(*DL, TruncTy);

	  // Reject cases that won't be matched as extloads.
	  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
	      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
	    return false;

	  IRBuilder<> Builder(Load->getNextNode());
	  // NewAnd is dereferenced unconditionally below, so use cast<> (which
	  // asserts) rather than dyn_cast<> (which could silently yield null).
	  auto *NewAnd = cast<Instruction>(
	      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
	  // Mark this instruction as "inserted by CGP", so that other
	  // optimizations don't touch it.
	  InsertedInsts.insert(NewAnd);

	  // Replace all uses of load with new and (except for the use of load in the
	  // new and itself).
	  Load->replaceAllUsesWith(NewAnd);
	  NewAnd->setOperand(0, Load);

	  // Remove any and instructions that are now redundant.
	  for (auto *And : AndsToMaybeRemove) {
	    // Check that the and mask is the same as the one we decided to put on the
	    // new and.
	    if (cast<ConstantInt>(And->getOperand(1))->getValue() != DemandBits)
	      continue;

	    And->replaceAllUsesWith(NewAnd);
	    // Do not leave the block iterator pointing at an erased instruction.
	    if (&*CurInstIterator == And)
	      CurInstIterator = std::next(And->getIterator());
	    And->eraseFromParent();
	    ++NumAndUses;
	  }

	  ++NumAndsAdded;
	  return true;
	}

	/// Check if V (an operand of a select instruction) is an expensive instruction
	/// that is only used once.
	static bool sinkSelectOperand(const TargetTransformInfo TTI, Value V) {
	auto *I = dyn_cast<Instruction>(V);
	// If it's safe to speculatively execute, then it should not have side
	// effects; therefore, it's safe to sink and possibly not execute.
	return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
	TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
	}

	/// Returns true if a SelectInst should be turned into an explicit branch.
	static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
	                                                const TargetLowering *TLI,
	                                                SelectInst *SI) {
	  // If even a predictable select is cheap, then a branch can't be cheaper.
	  if (!TLI->isPredictableSelectExpensive())
	    return false;

	  // FIXME: This should use the same heuristics as IfConversion to determine
	  // whether a select is better represented as a branch.

	  // If metadata tells us that the select condition is obviously predictable,
	  // then we want to replace the select with a branch.
	  uint64_t TrueWeight, FalseWeight;
	  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
	    uint64_t Max = std::max(TrueWeight, FalseWeight);
	    uint64_t Sum = TrueWeight + FalseWeight;
	    // A zero sum carries no information (and cannot be a denominator).
	    if (Sum != 0) {
	      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
	      if (Probability > TLI->getPredictableBranchThreshold())
	        return true;
	    }
	  }

	  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());

	  // If a branch is predictable, an out-of-order CPU can avoid blocking on its
	  // comparison condition. If the compare has more than one use, there's
	  // probably another cmov or setcc around, so it's not worth emitting a branch.
	  if (!Cmp || !Cmp->hasOneUse())
	    return false;

	  // If either operand of the select is expensive and only needed on one side
	  // of the select, we should form a branch.
	  return sinkSelectOperand(TTI, SI->getTrueValue()) ||
	         sinkSelectOperand(TTI, SI->getFalseValue());
	}

	/// If \p isTrue is true, return the true value of \p SI, otherwise return
	/// false value of \p SI. If the true/false value of \p SI is defined by any
	/// select instructions in \p Selects, look through the defining select
	/// instruction until the true/false value is not defined in \p Selects.
	static Value *getTrueOrFalseValue(
	    SelectInst *SI, bool isTrue,
	    const SmallPtrSet<const Instruction *, 2> &Selects) {
	  // Start from null so that a select which is not a member of Selects cannot
	  // make this function return an uninitialized pointer.
	  Value *V = nullptr;

	  // Follow the chosen side through every select that belongs to Selects.
	  for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
	       DefSI = dyn_cast<SelectInst>(V)) {
	    assert(DefSI->getCondition() == SI->getCondition() &&
	           "The condition of DefSI does not match with SI");
	    V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
	  }

	  assert(V && "SI is expected to be a member of Selects");
	  return V;
	}

	/// If we have a SelectInst that will likely profit from branch prediction,
	/// turn it into a branch.
	bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
	  // Find all consecutive select instructions that share the same condition.
	  SmallVector<SelectInst *, 2> ASI;
	  ASI.push_back(SI);
	  for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
	       It != SI->getParent()->end(); ++It) {
	    SelectInst *I = dyn_cast<SelectInst>(&*It);
	    if (I && SI->getCondition() == I->getCondition()) {
	      ASI.push_back(I);
	    } else {
	      break;
	    }
	  }

	  SelectInst *LastSI = ASI.back();
	  // Increment the current iterator to skip all the rest of select instructions
	  // because they will be either "not lowered" or "all lowered" to branch.
	  CurInstIterator = std::next(LastSI->getIterator());

	  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);

	  // Can we convert the 'select' to CF ?
	  if (DisableSelectToBranch || OptSize || !TLI || VectorCond ||
	      SI->getMetadata(LLVMContext::MD_unpredictable))
	    return false;

	  // Conditions that are not i1 were rejected above, so only the type of the
	  // selected value distinguishes the remaining select kinds.
	  TargetLowering::SelectSupportKind SelectKind =
	      SI->getType()->isVectorTy() ? TargetLowering::ScalarCondVectorVal
	                                  : TargetLowering::ScalarValSelect;

	  if (TLI->isSelectSupported(SelectKind) &&
	      !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
	    return false;

	  ModifiedDT = true;

	  // Transform a sequence like this:
	  //    start:
	  //       %cmp = cmp uge i32 %a, %b
	  //       %sel = select i1 %cmp, i32 %c, i32 %d
	  //
	  // Into:
	  //    start:
	  //       %cmp = cmp uge i32 %a, %b
	  //       br i1 %cmp, label %select.true, label %select.false
	  //    select.true:
	  //       br label %select.end
	  //    select.false:
	  //       br label %select.end
	  //    select.end:
	  //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
	  //
	  // In addition, we may sink instructions that produce %c or %d from
	  // the entry block into the destination(s) of the new branch.
	  // If the true or false blocks do not contain a sunken instruction, that
	  // block and its branch may be optimized away. In that case, one side of the
	  // first branch will point directly to select.end, and the corresponding PHI
	  // predecessor block will be the start block.

	  // First, we split the block containing the select into 2 blocks.
	  BasicBlock *StartBlock = SI->getParent();
	  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
	  BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");

	  // Delete the unconditional branch that was just created by the split.
	  StartBlock->getTerminator()->eraseFromParent();

	  // These are the new basic blocks for the conditional branch.
	  // At least one will become an actual new basic block.
	  BasicBlock *TrueBlock = nullptr;
	  BasicBlock *FalseBlock = nullptr;
	  BranchInst *TrueBranch = nullptr;
	  BranchInst *FalseBranch = nullptr;

	  // Sink expensive instructions into the conditional blocks to avoid executing
	  // them speculatively.
	  for (SelectInst *Sel : ASI) {
	    if (sinkSelectOperand(TTI, Sel->getTrueValue())) {
	      if (TrueBlock == nullptr) {
	        TrueBlock = BasicBlock::Create(Sel->getContext(), "select.true.sink",
	                                       EndBlock->getParent(), EndBlock);
	        TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
	      }
	      auto *TrueInst = cast<Instruction>(Sel->getTrueValue());
	      TrueInst->moveBefore(TrueBranch);
	    }
	    if (sinkSelectOperand(TTI, Sel->getFalseValue())) {
	      if (FalseBlock == nullptr) {
	        FalseBlock = BasicBlock::Create(Sel->getContext(), "select.false.sink",
	                                        EndBlock->getParent(), EndBlock);
	        FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
	      }
	      auto *FalseInst = cast<Instruction>(Sel->getFalseValue());
	      FalseInst->moveBefore(FalseBranch);
	    }
	  }

	  // If there was nothing to sink, then arbitrarily choose the 'false' side
	  // for a new input value to the PHI.
	  if (TrueBlock == FalseBlock) {
	    assert(TrueBlock == nullptr &&
	           "Unexpected basic block transform while optimizing select");

	    FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
	                                    EndBlock->getParent(), EndBlock);
	    BranchInst::Create(EndBlock, FalseBlock);
	  }

	  // Insert the real conditional branch based on the original condition.
	  // If we did not create a new block for one of the 'true' or 'false' paths
	  // of the condition, it means that side of the branch goes to the end block
	  // directly and the path originates from the start block from the point of
	  // view of the new PHI.
	  BasicBlock *TT, *FT;
	  if (TrueBlock == nullptr) {
	    TT = EndBlock;
	    FT = FalseBlock;
	    TrueBlock = StartBlock;
	  } else if (FalseBlock == nullptr) {
	    TT = TrueBlock;
	    FT = EndBlock;
	    FalseBlock = StartBlock;
	  } else {
	    TT = TrueBlock;
	    FT = FalseBlock;
	  }
	  IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI);

	  SmallPtrSet<const Instruction *, 2> INS;
	  INS.insert(ASI.begin(), ASI.end());
	  // Use reverse iterator because later select may use the value of the
	  // earlier select, and we need to propagate value through earlier select
	  // to get the PHI operand.
	  for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
	    SelectInst *Sel = *It;
	    // The select itself is replaced with a PHI Node.
	    PHINode *PN = PHINode::Create(Sel->getType(), 2, "", &EndBlock->front());
	    PN->takeName(Sel);
	    PN->addIncoming(getTrueOrFalseValue(Sel, true, INS), TrueBlock);
	    PN->addIncoming(getTrueOrFalseValue(Sel, false, INS), FalseBlock);

	    Sel->replaceAllUsesWith(PN);
	    Sel->eraseFromParent();
	    INS.erase(Sel);
	    ++NumSelectsExpanded;
	  }

	  // Instruct OptimizeBlock to skip to the next block.
	  CurInstIterator = StartBlock->end();
	  return true;
	}

	/// Return true if every defined lane of the mask of \p SVI selects the same
	/// source element. Undefined lanes (mask value -1) are ignored, so a mask
	/// made only of undefined lanes is also accepted.
	static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
	  SmallVector<int, 16> Mask(SVI->getShuffleMask());
	  int SplatElem = -1;
	  for (unsigned i = 0; i < Mask.size(); ++i) {
	    // Undefined lanes are compatible with any splat element. They must not
	    // overwrite the element recorded so far, otherwise a mask such as
	    // <0, undef, 1> would wrongly be accepted.
	    if (Mask[i] == -1)
	      continue;
	    if (SplatElem != -1 && Mask[i] != SplatElem)
	      return false;
	    SplatElem = Mask[i];
	  }

	  return true;
	}

	/// Some targets have expensive vector shifts if the lanes aren't all the same
	/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
	/// it's often worth sinking a shufflevector splat down to its use so that
	/// codegen can spot all lanes are identical.
	bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
	  BasicBlock *DefBB = SVI->getParent();

	  // Only do this xform if variable vector shifts are particularly expensive.
	  if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
	    return false;

	  // We only expect better codegen by sinking a shuffle if we can recognise a
	  // constant splat.
	  if (!isBroadcastShuffle(SVI))
	    return false;

	  // InsertedShuffles - Only insert a shuffle in each block once.
	  DenseMap<BasicBlock *, Instruction *> InsertedShuffles;

	  // Work on a snapshot of the users: replaceUsesOfWith() below unlinks uses
	  // from SVI's use list, which must not happen while that list is being
	  // iterated.
	  SmallVector<User *, 8> Users(SVI->user_begin(), SVI->user_end());

	  bool MadeChange = false;
	  for (User *U : Users) {
	    Instruction *UI = cast<Instruction>(U);

	    // Figure out which BB this ext is used in.
	    BasicBlock *UserBB = UI->getParent();
	    if (UserBB == DefBB)
	      continue;

	    // For now only apply this when the splat is used by a shift instruction.
	    if (!UI->isShift())
	      continue;

	    // Everything checks out, sink the shuffle if the user's block doesn't
	    // already have a copy.
	    Instruction *&InsertedShuffle = InsertedShuffles[UserBB];

	    if (!InsertedShuffle) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedShuffle =
	          new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
	                                SVI->getOperand(2), "", &*InsertPt);
	    }

	    // A user listed more than once in the snapshot is fully rewritten on its
	    // first visit; later visits find nothing left to replace.
	    UI->replaceUsesOfWith(SVI, InsertedShuffle);
	    MadeChange = true;
	  }

	  // If we removed all uses, nuke the shuffle.
	  if (SVI->use_empty()) {
	    SVI->eraseFromParent();
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	/// Widen the condition of \p SI and all of its case constants to the width of
	/// the register type the target reports for the condition type, when that
	/// register is wider than the condition.
	/// \return true if the switch was rewritten.
	bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
	  if (!TLI || !DL)
	    return false;

	  Value *Cond = SI->getCondition();
	  Type *OldType = Cond->getType();
	  LLVMContext &Context = Cond->getContext();
	  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
	  unsigned RegWidth = RegType.getSizeInBits();

	  if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
	    return false;

	  // If the register width is greater than the type width, expand the condition
	  // of the switch instruction and each case constant to the width of the
	  // register. By widening the type of the switch condition, subsequent
	  // comparisons (for case comparisons) will not need to be extended to the
	  // preferred register width, so we will potentially eliminate N-1 extends,
	  // where N is the number of cases in the switch.
	  auto *NewType = Type::getIntNTy(Context, RegWidth);

	  // Zero-extend the switch condition and case constants unless the switch
	  // condition is a function argument that is already being sign-extended.
	  // In that case, we can avoid an unnecessary mask/extension by sign-extending
	  // everything instead.
	  Instruction::CastOps ExtType = Instruction::ZExt;
	  if (auto *Arg = dyn_cast<Argument>(Cond))
	    if (Arg->hasSExtAttr())
	      ExtType = Instruction::SExt;

	  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
	  ExtInst->insertBefore(SI);
	  SI->setCondition(ExtInst);
	  // Extend every case constant the same way as the condition.
	  for (auto Case : SI->cases()) {
	    APInt NarrowConst = Case.getCaseValue()->getValue();
	    APInt WideConst = (ExtType == Instruction::ZExt)
	                          ? NarrowConst.zext(RegWidth)
	                          : NarrowConst.sext(RegWidth);
	    Case.setValue(ConstantInt::get(Context, WideConst));
	  }

	  return true;
	}


	namespace {

	/// \brief Helper class to promote a scalar operation to a vector one.
	/// This class is used to move downward extractelement transition.
	/// E.g.,
	/// a = vector_op <2 x i32>
	/// b = extractelement <2 x i32> a, i32 0
	/// c = scalar_op b
	/// store c
	///
	/// =>
	/// a = vector_op <2 x i32>
	/// c = vector_op a (equivalent to scalar_op on the related lane)
	/// * d = extractelement <2 x i32> c, i32 0
	/// * store d
	/// Assuming both extractelement and store can be combined, we get rid of the
	/// transition.
	class VectorPromoteHelper {
	/// DataLayout associated with the current module.
	const DataLayout &DL;

	/// Used to perform some checks on the legality of vector operations.
	const TargetLowering &TLI;

	/// Used to estimated the cost of the promoted chain.
	const TargetTransformInfo &TTI;

	/// The transition being moved downwards.
	Instruction *Transition;

	/// The sequence of instructions to be promoted.
	SmallVector<Instruction *, 4> InstsToBePromoted;

	/// Cost of combining a store and an extract.
	unsigned StoreExtractCombineCost;

	/// Instruction that will be combined with the transition.
	Instruction *CombineInst = nullptr;

	/// \brief The instruction that represents the current end of the transition.
	/// Since we are faking the promotion until we reach the end of the chain
	/// of computation, we need a way to get the current end of the transition.
	Instruction *getEndOfTransition() const {
	if (InstsToBePromoted.empty())
	return Transition;
	return InstsToBePromoted.back();
	}

	/// \brief Return the index of the original value in the transition.
	/// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
	/// c, is at index 0.
	unsigned getTransitionOriginalValueIdx() const {
	assert(isa<ExtractElementInst>(Transition) &&
	"Other kind of transitions are not supported yet");
	return 0;
	}

	/// \brief Return the index of the index in the transition.
	/// E.g., for "extractelement <2 x i32> c, i32 0" the index
	/// is at index 1.
	unsigned getTransitionIdx() const {
	assert(isa<ExtractElementInst>(Transition) &&
	"Other kind of transitions are not supported yet");
	return 1;
	}

	/// \brief Get the type of the transition.
	/// This is the type of the original value.
	/// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
	/// transition is <2 x i32>.
	Type *getTransitionType() const {
	return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
	}

	/// \brief Promote \p ToBePromoted by moving \p Def downward through.
	/// I.e., we have the following sequence:
	/// Def = Transition <ty1> a to <ty2>
	/// b = ToBePromoted <ty2> Def, ...
	/// =>
	/// b = ToBePromoted <ty1> a, ...
	/// Def = Transition <ty1> ToBePromoted to <ty2>
	void promoteImpl(Instruction *ToBePromoted);

	/// \brief Check whether or not it is profitable to promote all the
	/// instructions enqueued to be promoted.
	bool isProfitableToPromote() {
	Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
	unsigned Index = isa<ConstantInt>(ValIdx)
	? cast<ConstantInt>(ValIdx)->getZExtValue()
	: -1;
	Type *PromotedType = getTransitionType();

	StoreInst *ST = cast<StoreInst>(CombineInst);
	unsigned AS = ST->getPointerAddressSpace();
	unsigned Align = ST->getAlignment();
	// Check if this store is supported.
	if (!TLI.allowsMisalignedMemoryAccesses(
	TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
	Align)) {
	// If this is not supported, there is no way we can combine
	// the extract with the store.
	return false;
	}

	// The scalar chain of computation has to pay for the transition
	// scalar to vector.
	// The vector chain has to account for the combining cost.
	uint64_t ScalarCost =
	TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
	uint64_t VectorCost = StoreExtractCombineCost;
	for (const auto &Inst : InstsToBePromoted) {
	// Compute the cost.
	// By construction, all instructions being promoted are arithmetic ones.
	// Moreover, one argument is a constant that can be viewed as a splat
	// constant.
	Value *Arg0 = Inst->getOperand(0);
	bool IsArg0Constant = isa<UndefValue>(Arg0) \|\| isa<ConstantInt>(Arg0) \|\|
	isa<ConstantFP>(Arg0);
	TargetTransformInfo::OperandValueKind Arg0OVK =
	IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
	: TargetTransformInfo::OK_AnyValue;
	TargetTransformInfo::OperandValueKind Arg1OVK =
	!IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
	: TargetTransformInfo::OK_AnyValue;
	ScalarCost += TTI.getArithmeticInstrCost(
	Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
	VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
	Arg0OVK, Arg1OVK);
	}
	DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
	<< ScalarCost << "\nVector: " << VectorCost << '\n');
	return ScalarCost > VectorCost;
	}

	/// \brief Generate a constant vector with \p Val with the same
	/// number of elements as the transition.
	/// \p UseSplat defines whether or not \p Val should be replicated
	/// across the whole vector.
	/// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
	/// otherwise we generate a vector with as many undef as possible:
	/// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
	/// used at the index of the extract.
	Value getConstantVector(Constant Val, bool UseSplat) const {
	unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
	if (!UseSplat) {
	// If we cannot determine where the constant must be, we have to
	// use a splat constant.
	Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
	if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
	ExtractIdx = CstVal->getSExtValue();
	else
	UseSplat = true;
	}

	unsigned End = getTransitionType()->getVectorNumElements();
	if (UseSplat)
	return ConstantVector::getSplat(End, Val);

	SmallVector<Constant *, 4> ConstVec;
	UndefValue *UndefVal = UndefValue::get(Val->getType());
	for (unsigned Idx = 0; Idx != End; ++Idx) {
	if (Idx == ExtractIdx)
	ConstVec.push_back(Val);
	else
	ConstVec.push_back(UndefVal);
	}
	return ConstantVector::get(ConstVec);
	}

	/// \brief Check if promoting to a vector type an operand at \p OperandIdx
	/// in \p Use can trigger undefined behavior.
	static bool canCauseUndefinedBehavior(const Instruction *Use,
	unsigned OperandIdx) {
	// This is not safe to introduce undef when the operand is on
	// the right hand side of a division-like instruction.
	if (OperandIdx != 1)
	return false;
	switch (Use->getOpcode()) {
	default:
	return false;
	case Instruction::SDiv:
	case Instruction::UDiv:
	case Instruction::SRem:
	case Instruction::URem:
	return true;
	case Instruction::FDiv:
	case Instruction::FRem:
	return !Use->hasNoNaNs();
	}
	llvm_unreachable(nullptr);
	}

	public:
	VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
	const TargetTransformInfo &TTI, Instruction *Transition,
	unsigned CombineCost)
	: DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
	StoreExtractCombineCost(CombineCost) {
	assert(Transition && "Do not know how to promote null");
	}

	/// \brief Check if we can promote \p ToBePromoted to \p Type.
	bool canPromote(const Instruction *ToBePromoted) const {
	// We could support CastInst too.
	return isa<BinaryOperator>(ToBePromoted);
	}

	/// \brief Check if it is profitable to promote \p ToBePromoted
	/// by moving downward the transition through.
	bool shouldPromote(const Instruction *ToBePromoted) const {
	// Promote only if all the operands can be statically expanded.
	// Indeed, we do not want to introduce any new kind of transitions.
	for (const Use &U : ToBePromoted->operands()) {
	const Value *Val = U.get();
	if (Val == getEndOfTransition()) {
	// If the use is a division and the transition is on the rhs,
	// we cannot promote the operation, otherwise we may create a
	// division by zero.
	if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
	return false;
	continue;
	}
	if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
	!isa<ConstantFP>(Val))
	return false;
	}
	// Check that the resulting operation is legal.
	int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
	if (!ISDOpcode)
	return false;
	return StressStoreExtract \|\|
	TLI.isOperationLegalOrCustom(
	ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
	}

	/// \brief Check whether or not \p Use can be combined
	/// with the transition.
	/// I.e., is it possible to do Use(Transition) => AnotherUse?
	bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }

	/// \brief Record \p ToBePromoted as part of the chain to be promoted.
	void enqueueForPromotion(Instruction *ToBePromoted) {
	InstsToBePromoted.push_back(ToBePromoted);
	}

	/// \brief Set the instruction that will be combined with the transition.
	void recordCombineInstruction(Instruction *ToBeCombined) {
	assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
	CombineInst = ToBeCombined;
	}

	/// \brief Promote all the instructions enqueued for promotion if it is
	/// is profitable.
	/// \return True if the promotion happened, false otherwise.
	bool promote() {
	// Check if there is something to promote.
	// Right now, if we do not have anything to combine with,
	// we assume the promotion is not profitable.
	if (InstsToBePromoted.empty() \|\| !CombineInst)
	return false;

	// Check cost.
	if (!StressStoreExtract && !isProfitableToPromote())
	return false;

	// Promote.
	for (auto &ToBePromoted : InstsToBePromoted)
	promoteImpl(ToBePromoted);
	InstsToBePromoted.clear();
	return true;
	}
	};

	} // end anonymous namespace

	void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
	  // At this point, we know that all the operands of ToBePromoted but Def
	  // can be statically promoted.
	  // For Def, we need to use its parameter in ToBePromoted:
	  // b = ToBePromoted ty1 a
	  // Def = Transition ty1 b to ty2
	  // Move the transition down.
	  // 1. Replace all uses of the promoted operation by the transition.
	  // = ... b => = ... Def.
	  assert(ToBePromoted->getType() == Transition->getType() &&
	         "The type of the result of the transition does not match "
	         "the final type");
	  ToBePromoted->replaceAllUsesWith(Transition);
	  // 2. Update the type of the uses.
	  // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
	  Type *TransitionTy = getTransitionType();
	  ToBePromoted->mutateType(TransitionTy);
	  // 3. Update all the operands of the promoted operation with promoted
	  // operands.
	  // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
	  for (Use &U : ToBePromoted->operands()) {
	    Value *Val = U.get();
	    Value *NewVal = nullptr;
	    if (Val == Transition)
	      NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
	    else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
	             isa<ConstantFP>(Val)) {
	      // Use a splat constant if it is not safe to use undef.
	      NewVal = getConstantVector(
	          cast<Constant>(Val),
	          isa<UndefValue>(Val) ||
	              canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
	    } else
	      llvm_unreachable("Did you modified shouldPromote and forgot to update "
	                       "this?");
	    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
	  }
	  // 4. Re-insert the transition after the promoted operation and make it
	  // consume the promoted (vector) result.
	  Transition->moveAfter(ToBePromoted);
	  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
	}

	/// Some targets can do store(extractelement) with one instruction.
	/// Try to push the extractelement towards the stores when the target
	/// has this feature and this is profitable.
	bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
	  unsigned CombineCost = std::numeric_limits<unsigned>::max();
	  if (DisableStoreExtract || !TLI ||
	      (!StressStoreExtract &&
	       !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
	                                       Inst->getOperand(1), CombineCost)))
	    return false;

	  // At this point we know that Inst is a vector to scalar transition.
	  // Try to move it down the def-use chain, until:
	  // - We can combine the transition with its single use
	  //   => we got rid of the transition.
	  // - We escape the current basic block
	  //   => we would need to check that we are moving it at a cheaper place and
	  //      we do not do that for now.
	  BasicBlock *Parent = Inst->getParent();
	  DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
	  VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
	  // If the transition has more than one use, assume this is not going to be
	  // beneficial.
	  while (Inst->hasOneUse()) {
	    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
	    DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');

	    if (ToBePromoted->getParent() != Parent) {
	      DEBUG(dbgs() << "Instruction to promote is in a different block ("
	                   << ToBePromoted->getParent()->getName()
	                   << ") than the transition (" << Parent->getName() << ").\n");
	      return false;
	    }

	    // Reaching a combinable user ends the chain: promote what was enqueued.
	    if (VPH.canCombine(ToBePromoted)) {
	      DEBUG(dbgs() << "Assume " << *Inst << '\n'
	                   << "will be combined with: " << *ToBePromoted << '\n');
	      VPH.recordCombineInstruction(ToBePromoted);
	      bool Changed = VPH.promote();
	      NumStoreExtractExposed += Changed;
	      return Changed;
	    }

	    DEBUG(dbgs() << "Try promoting.\n");
	    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
	      return false;

	    DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");

	    VPH.enqueueForPromotion(ToBePromoted);
	    Inst = ToBePromoted;
	  }
	  return false;
	}

	/// For the instruction sequence of store below, F and I values
	/// are bundled together as an i64 value before being stored into memory.
	/// Sometimes it is more efficient to generate separate stores for F and I,
	/// which can remove the bitwise instructions or sink them to colder places.
	///
	/// (store (or (zext (bitcast F to i32) to i64),
	/// (shl (zext I to i64), 32)), addr) -->
	/// (store F, addr) and (store I, addr+4)
	///
	/// Similarly, splitting for other merged store can also be beneficial, like:
	/// For pair of {i32, i32}, i64 store --> two i32 stores.
	/// For pair of {i32, i16}, i64 store --> two i32 stores.
	/// For pair of {i16, i16}, i32 store --> two i16 stores.
	/// For pair of {i16, i8}, i32 store --> two i16 stores.
	/// For pair of {i8, i8}, i16 store --> two i8 stores.
	///
	/// We allow each target to determine specifically which kind of splitting is
	/// supported.
	///
	/// The store patterns are commonly seen from the simple code snippet below
	/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
	/// void goo(const std::pair<int, float> &);
	/// hoo() {
	/// ...
	/// goo(std::make_pair(tmp, ftmp));
	/// ...
	/// }
	///
	/// Although we already have similar splitting in DAG Combine, we duplicate
	/// it in CodeGenPrepare to catch the case in which pattern is across
	/// multiple BBs. The logic in DAG Combine is kept to catch case generated
	/// during code expansion.
	static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
	                                const TargetLowering &TLI) {
	  // The replacement stores created below are plain stores; splitting a
	  // volatile or atomic store would silently drop those semantics.
	  if (!SI.isSimple())
	    return false;

	  // Handle simple but common cases only: the stored type must be non-empty
	  // and must not have padding between its bit size and its store size.
	  Type *StoreType = SI.getValueOperand()->getType();
	  if (DL.getTypeStoreSizeInBits(StoreType) != DL.getTypeSizeInBits(StoreType) ||
	      DL.getTypeSizeInBits(StoreType) == 0)
	    return false;

	  // Each half is stored as an integer of half the original width; that
	  // integer type must be padding-free as well.
	  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
	  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
	  if (DL.getTypeStoreSizeInBits(SplitStoreType) !=
	      DL.getTypeSizeInBits(SplitStoreType))
	    return false;

	  // Match the following patterns (the OR is matched commutatively):
	  //   (store (or (zext LValue to iN),
	  //              (shl (zext HValue to iN), HalfValBitSize)), addr)
	  // or
	  //   (store (or (shl (zext HValue to iN), HalfValBitSize),
	  //              (zext LValue to iN)), addr)
	  // Expect both operands of OR and the first operand of SHL have only
	  // one use.
	  Value *LValue, *HValue;
	  if (!match(SI.getValueOperand(),
	             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
	                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
	                                   m_SpecificInt(HalfValBitSize))))))
	    return false;

	  // Check LValue and HValue are integers no wider than half of the store.
	  if (!LValue->getType()->isIntegerTy() ||
	      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
	      !HValue->getType()->isIntegerTy() ||
	      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
	    return false;

	  // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
	  // as the input of target query.
	  auto *LBC = dyn_cast<BitCastInst>(LValue);
	  auto *HBC = dyn_cast<BitCastInst>(HValue);
	  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
	                  : EVT::getEVT(LValue->getType());
	  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
	                   : EVT::getEVT(HValue->getType());
	  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
	    return false;

	  // Start to split store.
	  IRBuilder<> Builder(SI.getContext());
	  Builder.SetInsertPoint(&SI);

	  // If LValue/HValue is a bitcast in another BB, create a new one in current
	  // BB so it may be merged with the splitted stores by dag combiner.
	  if (LBC && LBC->getParent() != SI.getParent())
	    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
	  if (HBC && HBC->getParent() != SI.getParent())
	    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());

	  // The low bits of the merged value live at the lower address only on
	  // little-endian targets; on big-endian targets the high half comes first.
	  const bool IsLE = DL.isLittleEndian();
	  const unsigned HalfValByteSize = HalfValBitSize / 8;

	  auto CreateSplitStore = [&](Value *V, bool Upper) {
	    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
	    Value *Addr = Builder.CreateBitCast(
	        SI.getOperand(1),
	        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
	    // The half stored at the original address keeps the original alignment.
	    unsigned Alignment = SI.getAlignment();
	    const bool IsOffsetStore = (Upper == IsLE);
	    if (IsOffsetStore) {
	      Addr = Builder.CreateGEP(
	          SplitStoreType, Addr,
	          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
	      // The other half sits HalfValByteSize bytes further on, so it is only
	      // aligned to the largest power of two dividing both the original
	      // alignment and that offset. An alignment of 0 on the original store
	      // stands for the ABI alignment of the stored type.
	      unsigned BaseAlign =
	          Alignment ? Alignment : DL.getABITypeAlignment(StoreType);
	      unsigned Combined = BaseAlign | HalfValByteSize;
	      Alignment = Combined & (~Combined + 1);
	    }
	    Builder.CreateAlignedStore(V, Addr, Alignment);
	  };

	  CreateSplitStore(LValue, false);
	  CreateSplitStore(HValue, true);

	  // Delete the old store.
	  SI.eraseFromParent();
	  return true;
	}

	// Returns true when GEP is of the form "gep <ptr>, <constant int>": exactly
	// two operands, the indexed type at the first index position is sequential,
	// and the single index is a ConstantInt.
	static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
	  gep_type_iterator FirstIdxTy = gep_type_begin(*GEP);
	  // Check the operand count first so the iterator is only queried when an
	  // index actually exists.
	  if (GEP->getNumOperands() != 2)
	    return false;
	  if (!FirstIdxTy.isSequential())
	    return false;
	  return isa<ConstantInt>(GEP->getOperand(1));
	}

	// Try unmerging GEPs to reduce liveness interference (register pressure) across
	// IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
	// reducing liveness interference across those edges benefits global register
	// allocation. Currently handles only certain cases.
	//
	// For example, unmerge %GEPI and %UGEPI as below.
	//
	// ---------- BEFORE ----------
	// SrcBlock:
	// ...
	// %GEPIOp = ...
	// ...
	// %GEPI = gep %GEPIOp, Idx
	// ...
	// indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
	// (* %GEPI is alive on the indirectbr edges due to other uses ahead)
	// (* %GEPIOp is alive on the indirectbr edges only because it is used by
	// %UGEPI)
	//
	// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
	// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
	// ...
	//
	// DstBi:
	// ...
	// %UGEPI = gep %GEPIOp, UIdx
	// ...
	// ---------------------------
	//
	// ---------- AFTER ----------
	// SrcBlock:
	// ... (same as above)
	// (* %GEPI is still alive on the indirectbr edges)
	// (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
	// unmerging)
	// ...
	//
	// DstBi:
	// ...
	// %UGEPI = gep %GEPI, (UIdx-Idx)
	// ...
	// ---------------------------
	//
	// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
	// no longer alive on them.
	//
	// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
	// of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
	// not to disable further simplifications and optimizations as a result of GEP
	// merging.
	//
	// Note this unmerging may increase the length of the data flow critical path
	// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
	// between the register pressure and the length of data-flow critical
	// path. Restricting this to the uncommon IndirectBr case would minimize the
	// impact of potentially longer critical path, if any, and the impact on compile
	// time.
	static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
	                                             const TargetTransformInfo *TTI) {
	  BasicBlock *SrcBlock = GEPI->getParent();

	  // Only blocks terminated by an IndirectBr are of interest; everything else
	  // (the common case) leaves immediately.
	  if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
	    return false;

	  // GEPI itself must be "gep <ptr>, <constant>" ...
	  if (!GEPSequentialConstIndexed(GEPI))
	    return false;
	  ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));

	  // ... whose constant index is cheap to materialize.
	  if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType())
	      > TargetTransformInfo::TCC_Basic)
	    return false;

	  // The base pointer has to be an instruction defined in SrcBlock too.
	  Value *GEPIOp = GEPI->getOperand(0);
	  auto *GEPIOpI = dyn_cast<Instruction>(GEPIOp);
	  if (!GEPIOpI)
	    return false;
	  if (GEPIOpI->getParent() != SrcBlock)
	    return false;

	  // GEPI must have at least one instruction user outside SrcBlock, i.e. it
	  // is already live across the IndirectBr edge(s).
	  auto IsInstOutsideSrcBlock = [&](User *Usr) {
	    auto *UserInst = dyn_cast<Instruction>(Usr);
	    return UserInst && UserInst->getParent() != SrcBlock;
	  };
	  if (find_if(GEPI->users(), IsInstOutsideSrcBlock) == GEPI->users().end())
	    return false;

	  // The second elements of the GEP chains to be unmerged.
	  std::vector<GetElementPtrInst *> UGEPIs;

	  // Walk every user of GEPIOp. Unmerging only pays off when all users outside
	  // SrcBlock can be rewritten, so any user that cannot be handled aborts.
	  for (User *Usr : GEPIOp->users()) {
	    if (Usr == GEPI)
	      continue;

	    // Non-instruction users cannot be handled.
	    auto *UI = dyn_cast<Instruction>(Usr);
	    if (!UI)
	      return false;

	    // Users inside SrcBlock do not keep GEPIOp live across the edges.
	    if (UI->getParent() == SrcBlock)
	      continue;

	    // Outside SrcBlock only simple, constant-indexed GEPs based directly on
	    // GEPIOp are accepted.
	    auto *UGEPI = dyn_cast<GetElementPtrInst>(UI);
	    if (!UGEPI)
	      return false;
	    if (!GEPSequentialConstIndexed(UGEPI))
	      return false;
	    if (UGEPI->getOperand(0) != GEPIOp)
	      return false;

	    // Both indices must share a type so they can be subtracted, and the
	    // user's index must be cheap as well.
	    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
	    if (GEPIIdx->getType() != UGEPIIdx->getType())
	      return false;
	    if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType())
	        > TargetTransformInfo::TCC_Basic)
	      return false;

	    UGEPIs.push_back(UGEPI);
	  }

	  if (UGEPIs.empty())
	    return false;

	  // Check the materializing cost of (Uidx-Idx) for every candidate before
	  // touching any of them.
	  for (GetElementPtrInst *UGEPI : UGEPIs) {
	    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
	    APInt Delta = UGEPIIdx->getValue() - GEPIIdx->getValue();
	    unsigned ImmCost = TTI->getIntImmCost(Delta, GEPIIdx->getType());
	    if (ImmCost > TargetTransformInfo::TCC_Basic)
	      return false;
	  }

	  // Now unmerge between GEPI and UGEPIs: rebase each user onto GEPI and
	  // replace its index with the difference of the two indices.
	  for (GetElementPtrInst *UGEPI : UGEPIs) {
	    UGEPI->setOperand(0, GEPI);
	    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
	    Constant *RebasedIdx = ConstantInt::get(
	        GEPIIdx->getType(), UGEPIIdx->getValue() - GEPIIdx->getValue());
	    UGEPI->setOperand(1, RebasedIdx);
	    // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
	    // inbounds to avoid UB.
	    if (!GEPI->isInBounds())
	      UGEPI->setIsInBounds(false);
	  }

	  // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
	  // alive on IndirectBr edges).
	  assert(find_if(GEPIOp->users(), [&](User *Usr) {
	           return cast<Instruction>(Usr)->getParent() != SrcBlock;
	         }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
	  return true;
	}

	/// Dispatch \p I to the CodeGenPrepare optimization that matches its
	/// instruction kind.
	///
	/// \param I          the instruction to optimize; it may be erased or
	///                   replaced by the callee.
	/// \param ModifiedDT passed through to the callees that take it
	///                   (optimizeCallInst and the recursive call made after a
	///                   GEP is turned into a bitcast).
	/// \returns true if any change was made.
	bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
	  // Bail out if we inserted the instruction to prevent optimizations from
	  // stepping on each other's toes.
	  if (InsertedInsts.count(I))
	    return false;

	  if (PHINode *P = dyn_cast<PHINode>(I)) {
	    // It is possible for very late stage optimizations (such as SimplifyCFG)
	    // to introduce PHI nodes too late to be cleaned up. If we detect such a
	    // trivial PHI, go ahead and zap it here.
	    if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
	      P->replaceAllUsesWith(V);
	      P->eraseFromParent();
	      ++NumPHIsElim;
	      return true;
	    }
	    return false;
	  }

	  if (CastInst *CI = dyn_cast<CastInst>(I)) {
	    // If the source of the cast is a constant, then this should have
	    // already been constant folded. The only reason NOT to constant fold
	    // it is if something (e.g. LSR) was careful to place the constant
	    // evaluation in a block other than then one that uses it (e.g. to hoist
	    // the address of globals out of a loop). If this is the case, we don't
	    // want to forward-subst the cast.
	    if (isa<Constant>(CI->getOperand(0)))
	      return false;

	    if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
	      return true;

	    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
	      /// Sink a zext or sext into its user blocks if the target type doesn't
	      /// fit in one register
	      if (TLI &&
	          TLI->getTypeAction(CI->getContext(),
	                             TLI->getValueType(*DL, CI->getType())) ==
	              TargetLowering::TypeExpandInteger) {
	        return SinkCast(CI);
	      } else {
	        bool MadeChange = optimizeExt(I);
	        // Bitwise (non-short-circuit) OR on purpose: optimizeExtUses must
	        // run even when optimizeExt already changed something.
	        return MadeChange | optimizeExtUses(I);
	      }
	    }
	    return false;
	  }

	  // Compares are only handled here when the target lacks multiple condition
	  // registers (or there is no TLI); otherwise fall through to the checks
	  // below.
	  if (CmpInst *CI = dyn_cast<CmpInst>(I))
	    if (!TLI || !TLI->hasMultipleConditionRegisters())
	      return OptimizeCmpExpression(CI, TLI);

	  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
	    LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
	    if (TLI) {
	      bool Modified = optimizeLoadExt(LI);
	      unsigned AS = LI->getPointerAddressSpace();
	      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
	      return Modified;
	    }
	    return false;
	  }

	  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
	    // A successful split erases SI, so nothing below may touch it.
	    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
	      return true;
	    SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
	    if (TLI) {
	      unsigned AS = SI->getPointerAddressSpace();
	      return optimizeMemoryInst(I, SI->getOperand(1),
	                                SI->getOperand(0)->getType(), AS);
	    }
	    return false;
	  }

	  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
	    unsigned AS = RMW->getPointerAddressSpace();
	    return optimizeMemoryInst(I, RMW->getPointerOperand(),
	                              RMW->getType(), AS);
	  }

	  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
	    unsigned AS = CmpX->getPointerAddressSpace();
	    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
	                              CmpX->getCompareOperand()->getType(), AS);
	  }

	  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);

	  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
	      EnableAndCmpSinking && TLI)
	    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);

	  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
	                BinOp->getOpcode() == Instruction::LShr)) {
	    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
	    if (TLI && CI && TLI->hasExtractBitsInsn())
	      return OptimizeExtractBits(BinOp, CI, *TLI, *DL);

	    return false;
	  }

	  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
	    if (GEPI->hasAllZeroIndices()) {
	      /// The GEP operand must be a pointer, so must its result -> BitCast
	      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
	                                        GEPI->getName(), GEPI);
	      GEPI->replaceAllUsesWith(NC);
	      GEPI->eraseFromParent();
	      ++NumGEPsElim;
	      // Give the replacement cast a chance to be optimized right away.
	      optimizeInst(NC, ModifiedDT);
	      return true;
	    }
	    if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
	      return true;
	    }
	    return false;
	  }

	  if (CallInst *CI = dyn_cast<CallInst>(I))
	    return optimizeCallInst(CI, ModifiedDT);

	  if (SelectInst *SI = dyn_cast<SelectInst>(I))
	    return optimizeSelectInst(SI);

	  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
	    return optimizeShuffleVectorInst(SVI);

	  if (auto *Switch = dyn_cast<SwitchInst>(I))
	    return optimizeSwitchInst(Switch);

	  if (isa<ExtractElementInst>(I))
	    return optimizeExtractElementInst(I);

	  return false;
	}

	/// Given an OR instruction, check to see if this is a bitreverse
	/// idiom. If so, insert the new intrinsic and return true.
	///
	/// Nothing is done unless \p I has integer type and the target reports
	/// ISD::BITREVERSE as legal or custom for the corresponding value type.
	/// On success all uses of \p I are redirected to the last instruction the
	/// idiom recognizer inserted, and \p I is deleted if trivially dead.
	static bool makeBitReverse(Instruction &I, const DataLayout &DL,
	                           const TargetLowering &TLI) {
	  if (!I.getType()->isIntegerTy() ||
	      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
	                                    TLI.getValueType(DL, I.getType(), true)))
	    return false;

	  // NOTE(review): the two boolean flags presumably disable bswap matching and
	  // enable bit-reverse matching, in that order; confirm against the
	  // declaration of recognizeBSwapOrBitReverseIdiom.
	  SmallVector<Instruction*, 4> Insts;
	  if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
	    return false;
	  Instruction *LastInst = Insts.back();
	  I.replaceAllUsesWith(LastInst);
	  RecursivelyDeleteTriviallyDeadInstructions(&I);
	  return true;
	}

	// In this pass we look for GEP and cast instructions that are used
	// across basic blocks and rewrite them to improve basic-block-at-a-time
	// selection.
	//
	// Runs optimizeInst over every instruction of BB, then repeatedly tries to
	// form bit-reverse operations, and finally attempts return duplication for
	// tail calls. Returns true if anything changed. Returns early (with true) as
	// soon as an instruction optimization sets ModifiedDT.
	bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
	  SunkAddrs.clear();
	  bool MadeChange = false;

	  CurInstIterator = BB.begin();
	  while (CurInstIterator != BB.end()) {
	    // The iterator is advanced before optimizeInst runs on the current
	    // instruction.
	    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
	    if (ModifiedDT)
	      return true;
	  }

	  // Scan the block bottom-up for bit-reverse idioms. Each successful rewrite
	  // mutates the block, so the scan restarts from scratch until a full pass
	  // finds nothing.
	  bool MadeBitReverse = true;
	  while (TLI && MadeBitReverse) {
	    MadeBitReverse = false;
	    for (auto &I : reverse(BB)) {
	      if (makeBitReverse(I, *DL, *TLI)) {
	        MadeBitReverse = MadeChange = true;
	        ModifiedDT = true;
	        break;
	      }
	    }
	  }
	  MadeChange |= dupRetToEnableTailCallOpts(&BB);

	  return MadeChange;
	}

	// If llvm.dbg.value is far away from the value, then iSel may not be able to
	// handle it properly. iSel will drop llvm.dbg.value if it can not
	// find a node corresponding to the value.
	//
	// Moves each such dbg.value so that it directly follows the instruction
	// defining its value (or sits at the first insertion point of the defining
	// block when that instruction is a PHI). Returns true if any dbg.value was
	// moved.
	bool CodeGenPrepare::placeDbgValues(Function &F) {
	  bool MadeChange = false;
	  for (BasicBlock &BB : F) {
	    Instruction *PrevNonDbgInst = nullptr;
	    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
	      // Advance before any move so the iterator stays valid.
	      Instruction *Insn = &*BI++;
	      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
	      // Leave dbg.values that refer to an alloca alone. These
	      // intrinsics describe the address of a variable (= the alloca)
	      // being taken. They should not be moved next to the alloca
	      // (and to the beginning of the scope), but rather stay close to
	      // where said address is used.
	      if (!DVI || (DVI->getValue() && isa<AllocaInst>(DVI->getValue()))) {
	        PrevNonDbgInst = Insn;
	        continue;
	      }

	      // Only move when the value is defined by a non-terminator instruction
	      // that is not already the instruction preceding this run of dbg.values.
	      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
	      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
	        // If VI is a phi in a block with an EHPad terminator, we can't insert
	        // after it.
	        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
	          continue;
	        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
	        DVI->removeFromParent();
	        if (isa<PHINode>(VI))
	          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
	        else
	          DVI->insertAfter(VI);
	        MadeChange = true;
	        ++NumDbgValueMoved;
	      }
	    }
	  }
	  return MadeChange;
	}

	/// \brief Scale down both weights to fit into uint32_t.
	///
	/// Both weights are divided by the same factor, chosen so that the larger of
	/// the two ends up strictly below the maximum uint32_t value. Weights that
	/// already fit are left unchanged (the factor is 1).
	///
	/// \param NewTrue  in/out: weight of the true edge.
	/// \param NewFalse in/out: weight of the false edge.
	static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
	  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
	  // Keep the factor in 64 bits: the quotient can be as large as 2^32 + 1, and
	  // a 32-bit factor would wrap (to 0 when the quotient is 2^32 - 1, causing a
	  // division by zero below).
	  uint64_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
	  NewTrue = NewTrue / Scale;
	  NewFalse = NewFalse / Scale;
	}

	/// \brief Some targets prefer to split a conditional branch like:
	/// \code
	/// %0 = icmp ne i32 %a, 0
	/// %1 = icmp ne i32 %b, 0
	/// %or.cond = or i1 %0, %1
	/// br i1 %or.cond, label %TrueBB, label %FalseBB
	/// \endcode
	/// into multiple branch instructions like:
	/// \code
	/// bb1:
	/// %0 = icmp ne i32 %a, 0
	/// br i1 %0, label %TrueBB, label %bb2
	/// bb2:
	/// %1 = icmp ne i32 %b, 0
	/// br i1 %1, label %TrueBB, label %FalseBB
	/// \endcode
	/// This usually allows instruction selection to do even further optimizations
	/// and combine the compare with the branch instruction. Currently this is
	/// applied for targets which have "cheap" jump instructions.
	///
	/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
	///
	bool CodeGenPrepare::splitBranchCondition(Function &F) {
	  // Needs a TargetMachine with fast-isel enabled and a target on which jumps
	  // are not expensive.
	  if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
	    return false;

	  bool MadeChange = false;
	  for (auto &BB : F) {
	    // Does this BB end with the following?
	    // %cond1 = icmp|fcmp|binary instruction ...
	    // %cond2 = icmp|fcmp|binary instruction ...
	    // %cond.or = or|and i1 %cond1, cond2
	    // br i1 %cond.or label %dest1, label %dest2"
	    BinaryOperator *LogicOp;
	    BasicBlock *TBB, *FBB;
	    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
	      continue;

	    // Leave branches marked unpredictable alone.
	    auto *Br1 = cast<BranchInst>(BB.getTerminator());
	    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
	      continue;

	    unsigned Opc;
	    Value *Cond1, *Cond2;
	    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
	                             m_OneUse(m_Value(Cond2)))))
	      Opc = Instruction::And;
	    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
	                                 m_OneUse(m_Value(Cond2)))))
	      Opc = Instruction::Or;
	    else
	      continue;

	    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
	        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
	      continue;

	    DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());

	    // Create a new BB.
	    auto *TmpBB =
	        BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
	                           BB.getParent(), BB.getNextNode());

	    // Update original basic block by using the first condition directly by the
	    // branch instruction and removing the no longer needed and/or instruction.
	    Br1->setCondition(Cond1);
	    LogicOp->eraseFromParent();

	    // Depending on the condition we have to either replace the true or the
	    // false successor of the original branch instruction.
	    if (Opc == Instruction::And)
	      Br1->setSuccessor(0, TmpBB);
	    else
	      Br1->setSuccessor(1, TmpBB);

	    // Fill in the new basic block.
	    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
	    if (auto *I = dyn_cast<Instruction>(Cond2)) {
	      I->removeFromParent();
	      I->insertBefore(Br2);
	    }

	    // Update PHI nodes in both successors. The original BB needs to be
	    // replaced in one successor's PHI nodes, because the branch comes now from
	    // the newly generated BB (NewBB). In the other successor we need to add one
	    // incoming edge to the PHI nodes, because both branch instructions target
	    // now the same successor. Depending on the original branch condition
	    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
	    // we perform the correct update for the PHI nodes.
	    // This doesn't change the successor order of the just created branch
	    // instruction (or any other instruction).
	    if (Opc == Instruction::Or)
	      std::swap(TBB, FBB);

	    // Replace the old BB with the new BB.
	    for (PHINode &PN : TBB->phis()) {
	      int i;
	      while ((i = PN.getBasicBlockIndex(&BB)) >= 0)
	        PN.setIncomingBlock(i, TmpBB);
	    }

	    // Add another incoming edge from the new BB.
	    for (PHINode &PN : FBB->phis()) {
	      auto *Val = PN.getIncomingValueForBlock(&BB);
	      PN.addIncoming(Val, TmpBB);
	    }

	    // Update the branch weights (from SelectionDAGBuilder::
	    // FindMergedConditions).
	    if (Opc == Instruction::Or) {
	      // Codegen X | Y as:
	      // BB1:
	      //   jmp_if_X TBB
	      //   jmp TmpBB
	      // TmpBB:
	      //   jmp_if_Y TBB
	      //   jmp FBB
	      //

	      // We have flexibility in setting Prob for BB1 and Prob for NewBB.
	      // The requirement is that
	      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
	      //     = TrueProb for original BB.
	      // Assuming the original weights are A and B, one choice is to set BB1's
	      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
	      // assumes that
	      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
	      // Another choice is to assume TrueProb for BB1 equals to TrueProb for
	      // TmpBB, but the math is more complicated.
	      uint64_t TrueWeight, FalseWeight;
	      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
	        // Attach the recomputed and scaled weights, not the original ones.
	        uint64_t NewTrueWeight = TrueWeight;
	        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
	                         .createBranchWeights(NewTrueWeight, NewFalseWeight));

	        NewTrueWeight = TrueWeight;
	        NewFalseWeight = 2 * FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
	                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
	      }
	    } else {
	      // Codegen X & Y as:
	      // BB1:
	      //   jmp_if_X TmpBB
	      //   jmp FBB
	      // TmpBB:
	      //   jmp_if_Y TBB
	      //   jmp FBB
	      //
	      // This requires creation of TmpBB after CurBB.

	      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
	      // The requirement is that
	      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
	      //     = FalseProb for original BB.
	      // Assuming the original weights are A and B, one choice is to set BB1's
	      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
	      // assumes that
	      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
	      uint64_t TrueWeight, FalseWeight;
	      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
	        // Attach the recomputed and scaled weights, not the original ones.
	        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
	        uint64_t NewFalseWeight = FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
	                         .createBranchWeights(NewTrueWeight, NewFalseWeight));

	        NewTrueWeight = 2 * TrueWeight;
	        NewFalseWeight = FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
	                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
	      }
	    }

	    // Note: No point in getting fancy here, since the DT info is never
	    // available to CodeGenPrepare.
	    ModifiedDT = true;

	    MadeChange = true;

	    DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
	          TmpBB->dump());
	  }
	  return MadeChange;
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/GlobalMerge.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/GlobalMerge.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/GlobalMerge.cpp (revision 328362)
	@@ -1,653 +1,654 @@
	//===- GlobalMerge.cpp - Internal globals merging -------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass merges globals with internal linkage into one. This way all the
	// globals which were merged into a biggest one can be addressed using offsets
	// from the same base pointer (no need for separate base pointer for each of the
	// global). Such a transformation can significantly reduce the register pressure
	// when many globals are involved.
	//
	// For example, consider the code which touches several global variables at
	// once:
	//
	// static int foo[N], bar[N], baz[N];
	//
	// for (i = 0; i < N; ++i) {
	// foo[i] = bar[i] * baz[i];
	// }
	//
	// On ARM the addresses of 3 arrays should be kept in the registers, thus
	// this code has quite large register pressure (loop body):
	//
	// ldr r1, [r5], #4
	// ldr r2, [r6], #4
	// mul r1, r2, r1
	// str r1, [r0], #4
	//
	// Pass converts the code to something like:
	//
	// static struct {
	// int foo[N];
	// int bar[N];
	// int baz[N];
	// } merged;
	//
	// for (i = 0; i < N; ++i) {
	// merged.foo[i] = merged.bar[i] * merged.baz[i];
	// }
	//
	// and in ARM code this becomes:
	//
	// ldr r0, [r5, #40]
	// ldr r1, [r5, #80]
	// mul r0, r1, r0
	// str r0, [r5], #4
	//
	// note that we saved 2 registers here almost "for free".
	//
	// However, merging globals can have tradeoffs:
	// - it confuses debuggers, tools, and users
	// - it makes linker optimizations less useful (order files, LOHs, ...)
	// - it forces usage of indexed addressing (which isn't necessarily "free")
	// - it can increase register pressure when the uses are disparate enough.
	//
	// We use heuristics to discover the best global grouping we can (cf cl::opts).
	//
	// ===---------------------------------------------------------------------===//

	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/TargetLoweringObjectFile.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <string>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "global-merge"

	// Command-line options controlling the global merge pass. All of them are
	// declared cl::Hidden.

	// FIXME: This is only useful as a last-resort way to disable the pass.
	// -enable-global-merge: initialized to true.
	static cl::opt<bool>
	EnableGlobalMerge("enable-global-merge", cl::Hidden,
	cl::desc("Enable the global merge pass"),
	cl::init(true));

	// -global-merge-max-offset: initialized to 0. The default GlobalMerge
	// constructor copies this value into MaxOffset.
	static cl::opt<unsigned>
	GlobalMergeMaxOffset("global-merge-max-offset", cl::Hidden,
	cl::desc("Set maximum offset for global merge pass"),
	cl::init(0));

	// -global-merge-group-by-use: initialized to true. When false, doMerge
	// merges all candidate globals together instead of grouping them by use.
	static cl::opt<bool> GlobalMergeGroupByUse(
	"global-merge-group-by-use", cl::Hidden,
	cl::desc("Improve global merge pass to look at uses"), cl::init(true));

	// -global-merge-ignore-single-use: initialized to true.
	static cl::opt<bool> GlobalMergeIgnoreSingleUse(
	"global-merge-ignore-single-use", cl::Hidden,
	cl::desc("Improve global merge pass to ignore globals only used alone"),
	cl::init(true));

	// -global-merge-on-const: initialized to false.
	static cl::opt<bool>
	EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
	cl::desc("Enable global merge pass on constants"),
	cl::init(false));

	// FIXME: this could be a transitional option, and we probably need to remove
	// it if only we are sure this optimization could always benefit all targets.
	// -global-merge-on-external: declared as cl::boolOrDefault (not bool) and
	// given no cl::init.
	static cl::opt<cl::boolOrDefault>
	EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
	cl::desc("Enable global merge pass on external linkage"));

	// Statistic counter named NumMerged.
	STATISTIC(NumMerged, "Number of globals merged");

	namespace {

	/// Function pass that merges global variables into larger aggregates so they
	/// can be addressed as offsets from one base pointer (see the file header
	/// for the motivation and trade-offs).
	class GlobalMerge : public FunctionPass {
	  /// Target machine supplied to the four-argument constructor; null when the
	  /// pass is default-constructed.
	  const TargetMachine *TM = nullptr;

	  // FIXME: Infer the maximum possible offset depending on the actual users
	  // (these max offsets are different for the users inside Thumb or ARM
	  // functions), see the code that passes in the offset in the ARM backend
	  // for more information.
	  unsigned MaxOffset;

	  /// Whether we should try to optimize for size only.
	  /// Currently, this applies a dead simple heuristic: only consider globals
	  /// used in minsize functions for merging.
	  /// FIXME: This could learn about optsize, and be used in the cost model.
	  bool OnlyOptimizeForSize = false;

	  /// Whether we should merge global variables that have external linkage.
	  bool MergeExternalGlobals = false;

	  /// Neither constructor sets this flag, so give it a defined default like
	  /// the other boolean members instead of leaving it indeterminate.
	  /// NOTE(review): presumably assigned in doInitialization from the module's
	  /// object format; confirm against the definition.
	  bool IsMachO = false;

	  /// Sort \p Globals and merge them, either all together or grouped by use
	  /// depending on the -global-merge-group-by-use option.
	  bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
	               Module &M, bool isConst, unsigned AddrSpace) const;

	  /// \brief Merge everything in \p Globals for which the corresponding bit
	  /// in \p GlobalSet is set.
	  bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
	               const BitVector &GlobalSet, Module &M, bool isConst,
	               unsigned AddrSpace) const;

	  /// \brief Check if the given variable has been identified as must keep
	  /// \pre setMustKeepGlobalVariables must have been called on the Module that
	  /// contains GV
	  bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
	    return MustKeepGlobalVariables.count(GV);
	  }

	  /// Collect every variables marked as "used" or used in a landing pad
	  /// instruction for this Module.
	  void setMustKeepGlobalVariables(Module &M);

	  /// Collect every variables marked as "used"
	  void collectUsedGlobalVariables(Module &M);

	  /// Keep track of the GlobalVariable that must not be merged away
	  SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;

	public:
	  static char ID; // Pass identification, replacement for typeid.

	  /// Default construction: no target machine, and MaxOffset taken from the
	  /// -global-merge-max-offset option.
	  explicit GlobalMerge()
	      : FunctionPass(ID), MaxOffset(GlobalMergeMaxOffset) {
	    initializeGlobalMergePass(*PassRegistry::getPassRegistry());
	  }

	  /// Construction with explicit settings.
	  ///
	  /// \param TM                   target machine stored for later use.
	  /// \param MaximalOffset        value stored as MaxOffset.
	  /// \param OnlyOptimizeForSize  see the member of the same name.
	  /// \param MergeExternalGlobals see the member of the same name.
	  explicit GlobalMerge(const TargetMachine *TM, unsigned MaximalOffset,
	                       bool OnlyOptimizeForSize, bool MergeExternalGlobals)
	      : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset),
	        OnlyOptimizeForSize(OnlyOptimizeForSize),
	        MergeExternalGlobals(MergeExternalGlobals) {
	    initializeGlobalMergePass(*PassRegistry::getPassRegistry());
	  }

	  bool doInitialization(Module &M) override;
	  bool runOnFunction(Function &F) override;
	  bool doFinalization(Module &M) override;

	  StringRef getPassName() const override { return "Merge internal globals"; }

	  /// Declares that the pass preserves the CFG, then defers to FunctionPass.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.setPreservesCFG();
	    FunctionPass::getAnalysisUsage(AU);
	  }
	};

	} // end anonymous namespace

	// Out-of-class definition of the static pass identifier declared in
	// GlobalMerge ("Pass identification, replacement for typeid").
	char GlobalMerge::ID = 0;

	// Pass registration: the argument string is DEBUG_TYPE ("global-merge") and
	// the description is "Merge global variables".
	// NOTE(review): the two trailing `false` arguments are presumably the
	// CFG-only and is-analysis flags of INITIALIZE_PASS; confirm against the
	// macro definition.
	INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false)

	/// Decide which of the candidate \p Globals should be merged together and
	/// hand each chosen set to the per-set doMerge overload.
	///
	/// The candidates are first stable-sorted by allocation size. Depending on the
	/// GlobalMergeGroupByUse / GlobalMergeIgnoreSingleUse options, either all
	/// globals are merged blindly, all globals that are used together with at
	/// least one other global are merged, or sets of globals used in the same
	/// function are picked greedily by a crude profitability metric.
	///
	/// \param Globals   Merge candidates; reordered in place by the sort.
	/// \param M         Module that owns the globals.
	/// \param isConst   Forwarded unchanged to the per-set overload.
	/// \param AddrSpace Forwarded unchanged to the per-set overload.
	/// \returns true if any per-set merge reported a change.
	bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
	                          Module &M, bool isConst, unsigned AddrSpace) const {
	  auto &DL = M.getDataLayout();
	  // FIXME: Find better heuristics
	  std::stable_sort(Globals.begin(), Globals.end(),
	                   [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
	                     return DL.getTypeAllocSize(GV1->getValueType()) <
	                            DL.getTypeAllocSize(GV2->getValueType());
	                   });

	  // If we want to just blindly group all globals together, do so.
	  if (!GlobalMergeGroupByUse) {
	    BitVector AllGlobals(Globals.size());
	    AllGlobals.set();
	    return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
	  }

	  // If we want to be smarter, look at all uses of each global, to try to
	  // discover all sets of globals used together, and how many times each of
	  // these sets occurred.
	  //
	  // Keep this reasonably efficient, by having an append-only list of all sets
	  // discovered so far (UsedGlobalSet), and mapping each "together-ness" unit of
	  // code (currently, a Function) to the set of globals seen so far that are
	  // used together in that unit (GlobalUsesByFunction).
	  //
	  // When we look at the Nth global, we know that any new set is either:
	  // - the singleton set {N}, containing this global only, or
	  // - the union of {N} and a previously-discovered set, containing some
	  //   combination of the previous N-1 globals.
	  // Using that knowledge, when looking at the Nth global, we can keep:
	  // - a reference to the singleton set {N} (CurGVOnlySetIdx)
	  // - a list mapping each previous set to its union with {N} (EncounteredUGS),
	  //   if it actually occurs.

	  // We keep track of the sets of globals used together "close enough".
	  struct UsedGlobalSet {
	    BitVector Globals;
	    unsigned UsageCount = 1;

	    UsedGlobalSet(size_t Size) : Globals(Size) {}
	  };

	  // Each set is unique in UsedGlobalSets.
	  std::vector<UsedGlobalSet> UsedGlobalSets;

	  // Avoid repeating the create-global-set pattern.
	  auto CreateGlobalSet = [&]() -> UsedGlobalSet & {
	    UsedGlobalSets.emplace_back(Globals.size());
	    return UsedGlobalSets.back();
	  };

	  // The first set is the empty set.
	  CreateGlobalSet().UsageCount = 0;

	  // We define "close enough" to be "in the same function".
	  // FIXME: Grouping uses by function is way too aggressive, so we should have
	  // a better metric for distance between uses.
	  // The obvious alternative would be to group by BasicBlock, but that's in
	  // turn too conservative..
	  // Anything in between wouldn't be trivial to compute, so just stick with
	  // per-function grouping.

	  // The value type is an index into UsedGlobalSets.
	  // The default (0) conveniently points to the empty set.
	  DenseMap<Function *, size_t /*UsedGlobalSetIdx*/> GlobalUsesByFunction;

	  // Now, look at each merge-eligible global in turn.

	  // Keep track of the sets we already encountered to which we added the
	  // current global.
	  // Each element matches the same-index element in UsedGlobalSets.
	  // This lets us efficiently tell whether a set has already been expanded to
	  // include the current global.
	  std::vector<size_t> EncounteredUGS;

	  for (size_t GI = 0, GE = Globals.size(); GI != GE; ++GI) {
	    GlobalVariable *GV = Globals[GI];

	    // Reset the encountered sets for this global...
	    std::fill(EncounteredUGS.begin(), EncounteredUGS.end(), 0);
	    // ...and grow it in case we created new sets for the previous global.
	    EncounteredUGS.resize(UsedGlobalSets.size());

	    // We might need to create a set that only consists of the current global.
	    // Keep track of its index into UsedGlobalSets.
	    size_t CurGVOnlySetIdx = 0;

	    // For each global, look at all its Uses.
	    for (auto &U : GV->uses()) {
	      // This Use might be a ConstantExpr.  We're interested in Instruction
	      // users, so look through ConstantExpr...
	      Use *UI, *UE;
	      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
	        if (CE->use_empty())
	          continue;
	        UI = &*CE->use_begin();
	        UE = nullptr;
	      } else if (isa<Instruction>(U.getUser())) {
	        UI = &U;
	        UE = UI->getNext();
	      } else {
	        continue;
	      }

	      // ...to iterate on all the instruction users of the global.
	      // Note that we iterate on Uses and not on Users to be able to getNext().
	      for (; UI != UE; UI = UI->getNext()) {
	        Instruction *I = dyn_cast<Instruction>(UI->getUser());
	        if (!I)
	          continue;

	        Function *ParentFn = I->getParent()->getParent();

	        // If we're only optimizing for size, ignore non-minsize functions.
	        if (OnlyOptimizeForSize && !ParentFn->optForMinSize())
	          continue;

	        size_t UGSIdx = GlobalUsesByFunction[ParentFn];

	        // If this is the first global the function uses, map it to the set
	        // consisting of this global only.
	        if (!UGSIdx) {
	          // If that set doesn't exist yet, create it.
	          if (!CurGVOnlySetIdx) {
	            CurGVOnlySetIdx = UsedGlobalSets.size();
	            CreateGlobalSet().Globals.set(GI);
	          } else {
	            ++UsedGlobalSets[CurGVOnlySetIdx].UsageCount;
	          }

	          GlobalUsesByFunction[ParentFn] = CurGVOnlySetIdx;
	          continue;
	        }

	        // If the function's current set already contains this global, just
	        // increment the counter.
	        if (UsedGlobalSets[UGSIdx].Globals.test(GI)) {
	          ++UsedGlobalSets[UGSIdx].UsageCount;
	          continue;
	        }

	        // If not, the previous set wasn't actually used in this function.
	        --UsedGlobalSets[UGSIdx].UsageCount;

	        // If we already expanded the previous set to include this global, just
	        // reuse that expanded set.
	        if (size_t ExpandedIdx = EncounteredUGS[UGSIdx]) {
	          ++UsedGlobalSets[ExpandedIdx].UsageCount;
	          GlobalUsesByFunction[ParentFn] = ExpandedIdx;
	          continue;
	        }

	        // If not, create a new set consisting of the union of the previous set
	        // and this global.  Mark it as encountered, so we can reuse it later.
	        GlobalUsesByFunction[ParentFn] = EncounteredUGS[UGSIdx] =
	            UsedGlobalSets.size();

	        UsedGlobalSet &NewUGS = CreateGlobalSet();
	        NewUGS.Globals.set(GI);
	        NewUGS.Globals |= UsedGlobalSets[UGSIdx].Globals;
	      }
	    }
	  }

	  // Now we found a bunch of sets of globals used together.  We accumulated
	  // a usage count for each set while walking the uses above.
	  //
	  // Multiply that by the size of the set to give us a crude profitability
	  // metric.
	  std::stable_sort(UsedGlobalSets.begin(), UsedGlobalSets.end(),
	                   [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
	                     return UGS1.Globals.count() * UGS1.UsageCount <
	                            UGS2.Globals.count() * UGS2.UsageCount;
	                   });

	  // We can choose to merge all globals together, but ignore globals never used
	  // with another global.  This catches the obviously non-profitable cases of
	  // having a single global, but is aggressive enough for any other case.
	  if (GlobalMergeIgnoreSingleUse) {
	    BitVector AllGlobals(Globals.size());
	    for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) {
	      const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1];
	      if (UGS.UsageCount == 0)
	        continue;
	      if (UGS.Globals.count() > 1)
	        AllGlobals |= UGS.Globals;
	    }
	    return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
	  }

	  // Starting from the sets with the best (=biggest) profitability, find a
	  // good combination.
	  // The ideal (and expensive) solution can only be found by trying all
	  // combinations, looking for the one with the best profitability.
	  // Don't be smart about it, and just pick the first compatible combination,
	  // starting with the sets with the best profitability.
	  BitVector PickedGlobals(Globals.size());
	  bool Changed = false;

	  // The sort above is ascending, so walk the vector back to front.
	  for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) {
	    const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1];
	    if (UGS.UsageCount == 0)
	      continue;
	    if (PickedGlobals.anyCommon(UGS.Globals))
	      continue;
	    PickedGlobals |= UGS.Globals;
	    // If the set only contains one global, there's no point in merging.
	    // Ignore the global for inclusion in other sets though, so keep it in
	    // PickedGlobals.
	    if (UGS.Globals.count() < 2)
	      continue;
	    Changed |= doMerge(Globals, UGS.Globals, M, isConst, AddrSpace);
	  }

	  return Changed;
	}

	/// Merge the globals selected by \p GlobalSet into one or more
	/// "_MergedGlobals" struct variables.
	///
	/// The selected globals are consumed in index order. Each merged struct takes
	/// as many consecutive selected globals as fit before the accumulated
	/// allocation size exceeds MaxOffset. Every original global is replaced by an
	/// in-bounds GEP into the merged variable and erased; an alias carrying the
	/// original name is emitted unless the global had internal linkage on Mach-O.
	///
	/// \param Globals   All merge candidates; indexed by the bits of \p GlobalSet.
	/// \param GlobalSet Bit i set means Globals[i] takes part in this merge.
	/// \param M         Module receiving the merged variables and aliases.
	/// \param isConst   Whether the merged variable is created as constant.
	/// \param AddrSpace Address space of the merged variable and the aliases.
	/// \returns true unconditionally.
	bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
	                          const BitVector &GlobalSet, Module &M, bool isConst,
	                          unsigned AddrSpace) const {
	  assert(Globals.size() > 1 && "Need at least two globals to merge");

	  Type *Int32Ty = Type::getInt32Ty(M.getContext());
	  auto &DL = M.getDataLayout();

	  DEBUG(dbgs() << " Trying to merge set, starts with #"
	               << GlobalSet.find_first() << "\n");

	  ssize_t i = GlobalSet.find_first();
	  while (i != -1) {
	    ssize_t j = 0;
	    uint64_t MergedSize = 0;
	    std::vector<Type*> Tys;
	    std::vector<Constant*> Inits;

	    bool HasExternal = false;
	    StringRef FirstExternalName;
	    // Collect selected globals [i, j) until the size budget is exhausted.
	    for (j = i; j != -1; j = GlobalSet.find_next(j)) {
	      Type *Ty = Globals[j]->getValueType();
	      MergedSize += DL.getTypeAllocSize(Ty);
	      if (MergedSize > MaxOffset) {
	        break;
	      }
	      Tys.push_back(Ty);
	      Inits.push_back(Globals[j]->getInitializer());

	      if (Globals[j]->hasExternalLinkage() && !HasExternal) {
	        HasExternal = true;
	        FirstExternalName = Globals[j]->getName();
	      }
	    }

	    // If merged variables doesn't have external linkage, we needn't to expose
	    // the symbol after merging.
	    GlobalValue::LinkageTypes Linkage = HasExternal
	                                            ? GlobalValue::ExternalLinkage
	                                            : GlobalValue::InternalLinkage;
	    StructType *MergedTy = StructType::get(M.getContext(), Tys);
	    Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);

	    // On Darwin external linkage needs to be preserved, otherwise
	    // dsymutil cannot preserve the debug info for the merged
	    // variables.  If they have external linkage, use the symbol name
	    // of the first variable merged as the suffix of global symbol
	    // name.  This avoids a link-time naming conflict for the
	    // _MergedGlobals symbols.
	    Twine MergedName =
	        (IsMachO && HasExternal)
	            ? "_MergedGlobals_" + FirstExternalName
	            : "_MergedGlobals";
	    auto MergedLinkage = IsMachO ? Linkage : GlobalValue::PrivateLinkage;
	    auto *MergedGV = new GlobalVariable(
	        M, MergedTy, isConst, MergedLinkage, MergedInit, MergedName, nullptr,
	        GlobalVariable::NotThreadLocal, AddrSpace);

	    const StructLayout *MergedLayout = DL.getStructLayout(MergedTy);

	    for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) {
	      // Capture linkage and name now: the global is erased below, before the
	      // alias that reuses them is created.
	      GlobalValue::LinkageTypes OrigLinkage = Globals[k]->getLinkage();
	      std::string Name = Globals[k]->getName().str();

	      // Copy metadata while adjusting any debug info metadata by the original
	      // global's offset within the merged global.
	      MergedGV->copyMetadata(Globals[k], MergedLayout->getElementOffset(idx));

	      Constant *Idx[2] = {
	        ConstantInt::get(Int32Ty, 0),
	        ConstantInt::get(Int32Ty, idx),
	      };
	      Constant *GEP =
	          ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx);
	      Globals[k]->replaceAllUsesWith(GEP);
	      Globals[k]->eraseFromParent();

	      // When the linkage is not internal we must emit an alias for the original
	      // variable name as it may be accessed from another object. On non-Mach-O
	      // we can also emit an alias for internal linkage as it's safe to do so.
	      // It's not safe on Mach-O as the alias (and thus the portion of the
	      // MergedGlobals variable) may be dead stripped at link time.
	      if (OrigLinkage != GlobalValue::InternalLinkage || !IsMachO) {
	        GlobalAlias::create(Tys[idx], AddrSpace, OrigLinkage, Name, GEP, &M);
	      }

	      NumMerged++;
	    }
	    // Continue with the first selected global that did not fit (or -1).
	    i = j;
	  }

	  return true;
	}

	/// Record every global variable referenced from the module's "llvm.used"
	/// array in MustKeepGlobalVariables.
	///
	/// Does nothing when the module has no "llvm.used" variable or that variable
	/// has no initializer.
	void GlobalMerge::collectUsedGlobalVariables(Module &M) {
	  // Extract global variables from llvm.used array
	  const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
	  if (!GV || !GV->hasInitializer()) return;

	  // Should be an array of 'i8*'.
	  const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer());

	  // Entries that are not global variables after stripping pointer casts are
	  // ignored.
	  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
	    if (const GlobalVariable *G =
	        dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
	      MustKeepGlobalVariables.insert(G);
	}

	/// Populate MustKeepGlobalVariables for module \p M: first with the globals
	/// gathered by collectUsedGlobalVariables, then with every global variable
	/// that appears (after stripping pointer casts) as an operand of an EH pad
	/// that is the first non-PHI instruction of a basic block.
	void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
	  collectUsedGlobalVariables(M);

	  for (Function &Fn : M)
	    for (BasicBlock &Block : Fn) {
	      Instruction *FirstInst = Block.getFirstNonPHI();
	      if (FirstInst->isEHPad()) {
	        // Keep globals used by landingpads and catchpads.
	        for (const Use &Operand : FirstInst->operands()) {
	          const GlobalVariable *Referenced =
	              dyn_cast<GlobalVariable>(Operand->stripPointerCasts());
	          if (Referenced)
	            MustKeepGlobalVariables.insert(Referenced);
	        }
	      }
	    }
	}

	// Module-level entry point of the pass. Collects the merge-eligible globals of
	// M, bucketed per address space into BSS-local, constant and other globals,
	// and merges every bucket that holds more than one global.
	// Returns false immediately when EnableGlobalMerge is off; otherwise returns
	// whether any doMerge call reported a change.
	bool GlobalMerge::doInitialization(Module &M) {
	if (!EnableGlobalMerge)
	return false;

	IsMachO = Triple(M.getTargetTriple()).isOSBinFormatMachO();

	auto &DL = M.getDataLayout();
	// Candidate lists, keyed by address space.
	DenseMap<unsigned, SmallVector<GlobalVariable *, 16>> Globals, ConstGlobals,
	BSSGlobals;
	bool Changed = false;
	setMustKeepGlobalVariables(M);

	// Walk every global of the module and sort the eligible ones into the three
	// candidate maps.
	for (auto &GV : M.globals()) {
	// Merge is safe for "normal" internal or external globals only
	// NOTE(review): the three lines after the next one carry '-'/'+' patch
	// markers; the '+' side additionally skips globals that have DLL export
	// storage class.
	if (GV.isDeclaration() \|\| GV.isThreadLocal() \|\|
	- GV.hasSection() \|\| GV.hasImplicitSection())
	+ GV.hasSection() \|\| GV.hasImplicitSection() \|\|
	+ GV.hasDLLExportStorageClass())
	continue;

	// It's not safe to merge globals that may be preempted
	if (TM && !TM->shouldAssumeDSOLocal(M, &GV))
	continue;

	// Only internal globals are considered, plus external ones when
	// MergeExternalGlobals is set.
	if (!(MergeExternalGlobals && GV.hasExternalLinkage()) &&
	!GV.hasInternalLinkage())
	continue;

	PointerType *PT = dyn_cast<PointerType>(GV.getType());
	assert(PT && "Global variable is not a pointer!");

	unsigned AddressSpace = PT->getAddressSpace();

	// Ignore fancy-aligned globals for now.
	unsigned Alignment = DL.getPreferredAlignment(&GV);
	Type *Ty = GV.getValueType();
	if (Alignment > DL.getABITypeAlignment(Ty))
	continue;

	// Ignore all 'special' globals.
	if (GV.getName().startswith("llvm.") \|\|
	GV.getName().startswith(".llvm."))
	continue;

	// Ignore all "required" globals:
	if (isMustKeepGlobalVariable(&GV))
	continue;

	// Only globals whose allocation size is below MaxOffset are candidates.
	if (DL.getTypeAllocSize(Ty) < MaxOffset) {
	if (TM &&
	TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal())
	BSSGlobals[AddressSpace].push_back(&GV);
	else if (GV.isConstant())
	ConstGlobals[AddressSpace].push_back(&GV);
	else
	Globals[AddressSpace].push_back(&GV);
	}
	}

	// Merge each address space's candidates; a single candidate is left alone.
	for (auto &P : Globals)
	if (P.second.size() > 1)
	Changed \|= doMerge(P.second, M, false, P.first);

	for (auto &P : BSSGlobals)
	if (P.second.size() > 1)
	Changed \|= doMerge(P.second, M, false, P.first);

	// Constant globals are merged only when EnableGlobalMergeOnConst is set.
	if (EnableGlobalMergeOnConst)
	for (auto &P : ConstGlobals)
	if (P.second.size() > 1)
	Changed \|= doMerge(P.second, M, true, P.first);

	return Changed;
	}

	// Per-function hook: does nothing and always reports "no change". The merging
	// itself is performed from doInitialization.
	bool GlobalMerge::runOnFunction(Function &F) {
	return false;
	}

	// Drops the set of must-keep globals collected for this module and reports
	// "no change".
	bool GlobalMerge::doFinalization(Module &M) {
	MustKeepGlobalVariables.clear();
	return false;
	}

	/// Create a new GlobalMerge pass instance.
	///
	/// Whether external globals are merged is taken from
	/// EnableGlobalMergeOnExternal when that option was set explicitly, and from
	/// \p MergeExternalByDefault when it is still cl::BOU_UNSET.
	///
	/// \param TM                     Forwarded to the GlobalMerge constructor.
	/// \param Offset                 Forwarded to the GlobalMerge constructor.
	/// \param OnlyOptimizeForSize    Forwarded to the GlobalMerge constructor.
	/// \param MergeExternalByDefault Fallback for the external-merge decision.
	/// \returns A heap-allocated pass object.
	Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset,
	                                  bool OnlyOptimizeForSize,
	                                  bool MergeExternalByDefault) {
	  bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ?
	    MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE);
	  return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal);
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/PeepholeOptimizer.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/PeepholeOptimizer.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/PeepholeOptimizer.cpp (revision 328362)
	@@ -1,2166 +1,2163 @@
	//===- PeepholeOptimizer.cpp - Peephole Optimizations ---------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Perform peephole optimizations on the machine code:
	//
	// - Optimize Extensions
	//
	// Optimization of sign / zero extension instructions. It may be extended to
	// handle other instructions with similar properties.
	//
	// On some targets, some instructions, e.g. X86 sign / zero extension, may
	// leave the source value in the lower part of the result. This optimization
	// will replace some uses of the pre-extension value with uses of the
	// sub-register of the results.
	//
	// - Optimize Comparisons
	//
	// Optimization of comparison instructions. For instance, in this code:
	//
	// sub r1, 1
	// cmp r1, 0
	// bz L1
	//
	// If the "sub" instruction already sets (or could be modified to set) the
	// same flag that the "cmp" instruction sets and that "bz" uses, then we can
	// eliminate the "cmp" instruction.
	//
	// Another instance, in this code:
	//
	// sub r1, r3 | sub r1, imm
	// cmp r3, r1 or cmp r1, r3 | cmp r1, imm
	// bge L1
	//
	// If the branch instruction can use flag from "sub", then we can replace
	// "sub" with "subs" and eliminate the "cmp" instruction.
	//
	// - Optimize Loads:
	//
	// Loads that can be folded into a later instruction. A load is foldable
	// if it loads to virtual registers and the virtual register defined has
	// a single use.
	//
	// - Optimize Copies and Bitcast (more generally, target specific copies):
	//
	// Rewrite copies and bitcasts to avoid cross register bank copies
	// when possible.
	// E.g., Consider the following example, where capital and lower
	// letters denote different register file:
	// b = copy A <-- cross-bank copy
	// C = copy b <-- cross-bank copy
	// =>
	// b = copy A <-- cross-bank copy
	// C = copy A <-- same-bank copy
	//
	// E.g., for bitcast:
	// b = bitcast A <-- cross-bank copy
	// C = bitcast b <-- cross-bank copy
	// =>
	// b = bitcast A <-- cross-bank copy
	// C = copy A <-- same-bank copy
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineDominators.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/MC/LaneBitmask.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cassert>
	#include <cstdint>
	#include <memory>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "peephole-opt"

	// Optimize Extensions
	// When set, the pass also requires and preserves MachineDominatorTree (see
	// PeepholeOptimizer::getAnalysisUsage).
	static cl::opt<bool>
	Aggressive("aggressive-ext-opt", cl::Hidden,
	cl::desc("Aggressive extension optimization"));

	// Hidden switch, off by default, described as disabling the whole pass.
	static cl::opt<bool>
	DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
	cl::desc("Disable the peephole optimizer"));

	// Hidden switch, off by default. When set, isCoalescableCopy only accepts
	// plain copies and isUncoalescableCopy only accepts bitcasts.
	static cl::opt<bool>
	DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
	cl::desc("Disable advanced copy optimization"));

	// Hidden switch, off by default, for the non-allocatable physical register
	// copy optimization.
	static cl::opt<bool> DisableNAPhysCopyOpt(
	"disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false),
	cl::desc("Disable non-allocatable physical register copy optimization"));

	// Limit the number of PHI instructions to process
	// in PeepholeOptimizer::getNextSource.
	static cl::opt<unsigned> RewritePHILimit(
	"rewrite-phi-limit", cl::Hidden, cl::init(10),
	cl::desc("Limit the length of PHI chains to lookup"));

	// Limit the length of recurrence chain when evaluating the benefit of
	// commuting operands.
	static cl::opt<unsigned> MaxRecurrenceChain(
	"recurrence-chain-limit", cl::Hidden, cl::init(3),
	cl::desc("Maximum length of recurrence chain when evaluating the benefit "
	"of commuting operands"));


	// Pass statistics; each description states what the counter counts.
	STATISTIC(NumReuse, "Number of extension results reused");
	STATISTIC(NumCmps, "Number of compares eliminated");
	STATISTIC(NumImmFold, "Number of move immediate folded");
	STATISTIC(NumLoadFold, "Number of loads folded");
	STATISTIC(NumSelects, "Number of selects optimized");
	STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
	STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
	STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");

	namespace {

	class ValueTrackerResult;
	class RecurrenceInstr;

	class PeepholeOptimizer : public MachineFunctionPass {
	const TargetInstrInfo *TII;
	const TargetRegisterInfo *TRI;
	MachineRegisterInfo *MRI;
	MachineDominatorTree *DT; // Machine dominator tree
	MachineLoopInfo *MLI;

	public:
	static char ID; // Pass identification

	PeepholeOptimizer() : MachineFunctionPass(ID) {
	initializePeepholeOptimizerPass(*PassRegistry::getPassRegistry());
	}

	bool runOnMachineFunction(MachineFunction &MF) override;

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	MachineFunctionPass::getAnalysisUsage(AU);
	AU.addRequired<MachineLoopInfo>();
	AU.addPreserved<MachineLoopInfo>();
	if (Aggressive) {
	AU.addRequired<MachineDominatorTree>();
	AU.addPreserved<MachineDominatorTree>();
	}
	}

	/// \brief Track Def -> Use info used for rewriting copies.
	using RewriteMapTy =
	SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>;

	/// \brief Sequence of instructions that formulate recurrence cycle.
	using RecurrenceCycle = SmallVector<RecurrenceInstr, 4>;

	private:
	bool optimizeCmpInstr(MachineInstr MI, MachineBasicBlock MBB);
	bool optimizeExtInstr(MachineInstr MI, MachineBasicBlock MBB,
	SmallPtrSetImpl<MachineInstr*> &LocalMIs);
	bool optimizeSelect(MachineInstr *MI,
	SmallPtrSetImpl<MachineInstr *> &LocalMIs);
	bool optimizeCondBranch(MachineInstr *MI);
	bool optimizeCoalescableCopy(MachineInstr *MI);
	bool optimizeUncoalescableCopy(MachineInstr *MI,
	SmallPtrSetImpl<MachineInstr *> &LocalMIs);
	bool optimizeRecurrence(MachineInstr &PHI);
	bool findNextSource(unsigned Reg, unsigned SubReg,
	RewriteMapTy &RewriteMap);
	bool isMoveImmediate(MachineInstr *MI,
	SmallSet<unsigned, 4> &ImmDefRegs,
	DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
	bool foldImmediate(MachineInstr MI, MachineBasicBlock MBB,
	SmallSet<unsigned, 4> &ImmDefRegs,
	DenseMap<unsigned, MachineInstr*> &ImmDefMIs);

	/// \brief Finds recurrence cycles, but only ones that formulated around
	/// a def operand and a use operand that are tied. If there is a use
	/// operand commutable with the tied use operand, find recurrence cycle
	/// along that operand as well.
	bool findTargetRecurrence(unsigned Reg,
	const SmallSet<unsigned, 2> &TargetReg,
	RecurrenceCycle &RC);

	/// \brief If copy instruction \p MI is a virtual register copy, track it in
	/// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
	/// previously seen as a copy, replace the uses of this copy with the
	/// previously seen copy's destination register.
	bool foldRedundantCopy(MachineInstr *MI,
	SmallSet<unsigned, 4> &CopySrcRegs,
	DenseMap<unsigned, MachineInstr *> &CopyMIs);

	/// \brief Is the register \p Reg a non-allocatable physical register?
	bool isNAPhysCopy(unsigned Reg);

	/// \brief If copy instruction \p MI is a non-allocatable virtual<->physical
	/// register copy, track it in the \p NAPhysToVirtMIs map. If this
	/// non-allocatable physical register was previously copied to a virtual
	/// registered and hasn't been clobbered, the virt->phys copy can be
	/// deleted.
	bool foldRedundantNAPhysCopy(
	MachineInstr *MI,
	DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs);

	bool isLoadFoldable(MachineInstr *MI,
	SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);

	/// \brief Check whether \p MI is understood by the register coalescer
	/// but may require some rewriting.
	bool isCoalescableCopy(const MachineInstr &MI) {
	// SubregToRegs are not interesting, because they are already register
	// coalescer friendly.
	return MI.isCopy() \|\| (!DisableAdvCopyOpt &&
	(MI.isRegSequence() \|\| MI.isInsertSubreg() \|\|
	MI.isExtractSubreg()));
	}

	/// \brief Check whether \p MI is a copy like instruction that is
	/// not recognized by the register coalescer.
	bool isUncoalescableCopy(const MachineInstr &MI) {
	return MI.isBitcast() \|\|
	(!DisableAdvCopyOpt &&
	(MI.isRegSequenceLike() \|\| MI.isInsertSubregLike() \|\|
	MI.isExtractSubregLike()));
	}
	};

	/// \brief One instruction of a recurrence cycle.
	///
	/// A recurrence cycle is built either around a def operand and the use
	/// operand tied to it, or around a def operand and a use operand that can be
	/// commuted with the tied use operand. Only in the second case is the
	/// commute pair populated; it then holds the indices of the tied use operand
	/// and of the commutable use operand.
	class RecurrenceInstr {
	public:
	  using IndexPair = std::pair<unsigned, unsigned>;

	  /// Instruction without a commute pair.
	  RecurrenceInstr(MachineInstr *MI) : Instr(MI) {}

	  /// Instruction together with the operand index pair (\p Idx1, \p Idx2).
	  RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
	      : Instr(MI), OperandPair(IndexPair(Idx1, Idx2)) {}

	  MachineInstr *getMI() const { return Instr; }
	  Optional<IndexPair> getCommutePair() const { return OperandPair; }

	private:
	  MachineInstr *Instr;
	  Optional<IndexPair> OperandPair;
	};

	/// \brief Helper class to hold a reply for ValueTracker queries. Contains the
	/// returned sources for a given search and the instructions where the sources
	/// were tracked from.
	///
	/// A default-constructed result has no sources and is therefore not valid.
	class ValueTrackerResult {
	private:
	  /// Track all sources found by one ValueTracker query.
	  SmallVector<TargetInstrInfo::RegSubRegPair, 2> RegSrcs;

	  /// Instruction using the sources in 'RegSrcs'.
	  const MachineInstr *Inst = nullptr;

	public:
	  ValueTrackerResult() = default;

	  /// Build a result holding the single source (\p Reg, \p SubReg).
	  ValueTrackerResult(unsigned Reg, unsigned SubReg) {
	    addSource(Reg, SubReg);
	  }

	  /// A result is valid as soon as it holds at least one source.
	  bool isValid() const { return getNumSources() > 0; }

	  void setInst(const MachineInstr *I) { Inst = I; }
	  const MachineInstr *getInst() const { return Inst; }

	  /// Drop all sources and forget the instruction.
	  void clear() {
	    RegSrcs.clear();
	    Inst = nullptr;
	  }

	  /// Append the source (\p SrcReg, \p SrcSubReg).
	  void addSource(unsigned SrcReg, unsigned SrcSubReg) {
	    RegSrcs.push_back(TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg));
	  }

	  /// Overwrite the source at position \p Idx, which must already exist.
	  void setSource(int Idx, unsigned SrcReg, unsigned SrcSubReg) {
	    assert(Idx < getNumSources() && "Reg pair source out of index");
	    RegSrcs[Idx] = TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg);
	  }

	  int getNumSources() const { return RegSrcs.size(); }

	  unsigned getSrcReg(int Idx) const {
	    assert(Idx < getNumSources() && "Reg source out of index");
	    return RegSrcs[Idx].Reg;
	  }

	  unsigned getSrcSubReg(int Idx) const {
	    assert(Idx < getNumSources() && "SubReg source out of index");
	    return RegSrcs[Idx].SubReg;
	  }

	  /// Two results are equal when they refer to the same instruction and hold
	  /// the same (register, sub-register) sources in the same order.
	  /// Const-qualified so that const results can be compared as well.
	  bool operator==(const ValueTrackerResult &Other) const {
	    if (Other.getInst() != getInst())
	      return false;

	    if (Other.getNumSources() != getNumSources())
	      return false;

	    for (int i = 0, e = Other.getNumSources(); i != e; ++i)
	      if (Other.getSrcReg(i) != getSrcReg(i) ||
	          Other.getSrcSubReg(i) != getSrcSubReg(i))
	        return false;
	    return true;
	  }
	};

	/// \brief Follows the use-def chain of a value defined by a (chain of)
	/// copy-related instructions and reports its possible sources.
	///
	/// Starting from a definition (instruction plus definition index), each
	/// query steps to the next suitable source. Any source found this way can be
	/// used to rewrite the definition as
	///   def = COPY src.
	///
	/// Example:
	///   v0 =
	///   v2 = INSERT_SUBREG v1, v0, sub0
	///   def = COPY v2.sub0
	///
	/// Tracking def = COPY v2.sub0 yields the suitable sources v2.sub0 and v0,
	/// so def can be rewritten into def = COPY v0.
	class ValueTracker {
	private:
	  /// Instruction currently reached in the use-def chain.
	  const MachineInstr *Def = nullptr;

	  /// Operand index of the definition inside Def.
	  unsigned DefIdx = 0;

	  /// Sub-register index of the definition.
	  unsigned DefSubReg;

	  /// Register in which the value can be found.
	  unsigned Reg;

	  /// Specify whether the tracking looks through complex instructions.
	  /// When false, the tracker gives up on anything that is neither a copy
	  /// nor a bitcast.
	  ///
	  /// Note: a specialized ValueTracker subclass could have expressed this,
	  /// but it would have made the code of this class's users more complicated.
	  bool UseAdvancedTracking;

	  /// Register information the tracking is performed with.
	  const MachineRegisterInfo &MRI;

	  /// Optional target instruction information, used for some of the complex
	  /// tracking.
	  const TargetInstrInfo *TII;

	  /// \brief Selects the getNextSource implementation that matches the
	  /// current definition.
	  ValueTrackerResult getNextSourceImpl();

	  /// \brief getNextSource variant for Copy instructions.
	  ValueTrackerResult getNextSourceFromCopy();

	  /// \brief getNextSource variant for Bitcast instructions.
	  ValueTrackerResult getNextSourceFromBitcast();

	  /// \brief getNextSource variant for RegSequence instructions.
	  ValueTrackerResult getNextSourceFromRegSequence();

	  /// \brief getNextSource variant for InsertSubreg instructions.
	  ValueTrackerResult getNextSourceFromInsertSubreg();

	  /// \brief getNextSource variant for ExtractSubreg instructions.
	  ValueTrackerResult getNextSourceFromExtractSubreg();

	  /// \brief getNextSource variant for SubregToReg instructions.
	  ValueTrackerResult getNextSourceFromSubregToReg();

	  /// \brief getNextSource variant for PHI instructions.
	  ValueTrackerResult getNextSourceFromPHI();

	public:
	  /// \brief Track the value defined by register \p Reg.
	  ///
	  /// \p DefSubReg is the sub-register index to track; it need not be the
	  /// one used in the definition of \p Reg.
	  /// \p UseAdvancedTracking selects whether complex instructions are looked
	  /// through; the default (false) restricts tracking to copy and bitcast
	  /// instructions.
	  ///
	  /// For a physical \p Reg this constructor cannot tell which definition
	  /// to follow, so the resulting tracker finds no alternative source.
	  /// Use the instruction-based constructor to track a physical register.
	  ValueTracker(unsigned Reg, unsigned DefSubReg,
	               const MachineRegisterInfo &MRI,
	               bool UseAdvancedTracking = false,
	               const TargetInstrInfo *TII = nullptr)
	      : DefSubReg(DefSubReg), Reg(Reg),
	        UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) {
	    // Physical registers keep the defaults: no Def, DefIdx of 0.
	    if (TargetRegisterInfo::isPhysicalRegister(Reg))
	      return;

	    Def = MRI.getVRegDef(Reg);
	    DefIdx = MRI.def_begin(Reg).getOperandNo();
	  }

	  /// \brief Track the value defined by operand \p DefIdx of \p MI.
	  ///
	  /// In contrast to the register-based constructor, a tracker built this
	  /// way may find a new source even when the definition is a physical
	  /// register. That can help rewriting target specific instructions into
	  /// generic copy instructions.
	  ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg,
	               const MachineRegisterInfo &MRI,
	               bool UseAdvancedTracking = false,
	               const TargetInstrInfo *TII = nullptr)
	      : Def(&MI), DefIdx(DefIdx), DefSubReg(DefSubReg),
	        UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) {
	    assert(DefIdx < Def->getDesc().getNumDefs() &&
	           Def->getOperand(DefIdx).isReg() && "Invalid definition");
	    Reg = Def->getOperand(DefIdx).getReg();
	  }

	  /// \brief Step along the use-def chain to the next available source of
	  /// the tracked value.
	  /// \return A ValueTrackerResult with the registers and sub-registers that
	  /// hold the tracked value. An empty register set means that no source
	  /// was found.
	  ValueTrackerResult getNextSource();

	  /// \brief Last register in which the initial value can be found.
	  /// This starts out as the register of the definition and, after every
	  /// successful getNextSource call, becomes the register of the last source.
	  unsigned getReg() const { return Reg; }
	};

	} // end anonymous namespace

	// Storage for the pass's static ID member, initialised to 0.
	char PeepholeOptimizer::ID = 0;

	// Public handle referring to the same object as PeepholeOptimizer::ID.
	char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;

	// Registers the pass under DEBUG_TYPE with the description
	// "Peephole Optimizations", declaring MachineDominatorTree and
	// MachineLoopInfo as pass dependencies.
	INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
	"Peephole Optimizations", false, false)
	INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
	INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
	"Peephole Optimizations", false, false)

	/// If instruction is a copy-like instruction, i.e. it reads a single register
	/// and writes a single register and it does not modify the source, and if the
	/// source value is preserved as a sub-register of the result, then replace all
	/// reachable uses of the source with the subreg of the result.
	///
	/// Do not generate an EXTRACT that is used only in a debug use, as this changes
	/// the code. Since this code does not currently share EXTRACTs, just ignore all
	/// debug uses.
	bool PeepholeOptimizer::
	optimizeExtInstr(MachineInstr MI, MachineBasicBlock MBB,
	SmallPtrSetImpl<MachineInstr*> &LocalMIs) {
	unsigned SrcReg, DstReg, SubIdx;
	if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
	return false;

	if (TargetRegisterInfo::isPhysicalRegister(DstReg) \|\|
	TargetRegisterInfo::isPhysicalRegister(SrcReg))
	return false;

	if (MRI->hasOneNonDBGUse(SrcReg))
	// No other uses.
	return false;

	// Ensure DstReg can get a register class that actually supports
	// sub-registers. Don't change the class until we commit.
	const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
	DstRC = TRI->getSubClassWithSubReg(DstRC, SubIdx);
	if (!DstRC)
	return false;

	// The ext instr may be operating on a sub-register of SrcReg as well.
	// PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit
	// register.
	// If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of
	// SrcReg:SubIdx should be replaced.
	bool UseSrcSubIdx =
	TRI->getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr;

	// The source has other uses. See if we can replace the other uses with use of
	// the result of the extension.
	SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs;
	for (MachineInstr &UI : MRI->use_nodbg_instructions(DstReg))
	ReachedBBs.insert(UI.getParent());

	// Uses that are in the same BB of uses of the result of the instruction.
	SmallVector<MachineOperand*, 8> Uses;

	// Uses that the result of the instruction can reach.
	SmallVector<MachineOperand*, 8> ExtendedUses;

	bool ExtendLife = true;
	for (MachineOperand &UseMO : MRI->use_nodbg_operands(SrcReg)) {
	MachineInstr *UseMI = UseMO.getParent();
	if (UseMI == MI)
	continue;

	if (UseMI->isPHI()) {
	ExtendLife = false;
	continue;
	}

	// Only accept uses of SrcReg:SubIdx.
	if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx)
	continue;

	// It's an error to translate this:
	//
	// %reg1025 = <sext> %reg1024
	// ...
	// %reg1026 = SUBREG_TO_REG 0, %reg1024, 4
	//
	// into this:
	//
	// %reg1025 = <sext> %reg1024
	// ...
	// %reg1027 = COPY %reg1025:4
	// %reg1026 = SUBREG_TO_REG 0, %reg1027, 4
	//
	// The problem here is that SUBREG_TO_REG is there to assert that an
	// implicit zext occurs. It doesn't insert a zext instruction. If we allow
	// the COPY here, it will give us the value after the <sext>, not the
	// original value of %reg1024 before <sext>.
	if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG)
	continue;

	MachineBasicBlock *UseMBB = UseMI->getParent();
	if (UseMBB == MBB) {
	// Local uses that come after the extension.
	if (!LocalMIs.count(UseMI))
	Uses.push_back(&UseMO);
	} else if (ReachedBBs.count(UseMBB)) {
	// Non-local uses where the result of the extension is used. Always
	// replace these unless it's a PHI.
	Uses.push_back(&UseMO);
	} else if (Aggressive && DT->dominates(MBB, UseMBB)) {
	// We may want to extend the live range of the extension result in order
	// to replace these uses.
	ExtendedUses.push_back(&UseMO);
	} else {
	// Both will be live out of the def MBB anyway. Don't extend live range of
	// the extension result.
	ExtendLife = false;
	break;
	}
	}

	if (ExtendLife && !ExtendedUses.empty())
	// Extend the liveness of the extension result.
	Uses.append(ExtendedUses.begin(), ExtendedUses.end());

	// Now replace all uses.
	bool Changed = false;
	if (!Uses.empty()) {
	SmallPtrSet<MachineBasicBlock*, 4> PHIBBs;

	// Look for PHI uses of the extended result, we don't want to extend the
	// liveness of a PHI input. It breaks all kinds of assumptions down
	// stream. A PHI use is expected to be the kill of its source values.
	for (MachineInstr &UI : MRI->use_nodbg_instructions(DstReg))
	if (UI.isPHI())
	PHIBBs.insert(UI.getParent());

	const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
	for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
	MachineOperand *UseMO = Uses[i];
	MachineInstr *UseMI = UseMO->getParent();
	MachineBasicBlock *UseMBB = UseMI->getParent();
	if (PHIBBs.count(UseMBB))
	continue;

	// About to add uses of DstReg, clear DstReg's kill flags.
	if (!Changed) {
	MRI->clearKillFlags(DstReg);
	MRI->constrainRegClass(DstReg, DstRC);
	}

	unsigned NewVR = MRI->createVirtualRegister(RC);
	MachineInstr Copy = BuildMI(UseMBB, UseMI, UseMI->getDebugLoc(),
	TII->get(TargetOpcode::COPY), NewVR)
	.addReg(DstReg, 0, SubIdx);
	// SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set.
	if (UseSrcSubIdx) {
	Copy->getOperand(0).setSubReg(SubIdx);
	Copy->getOperand(0).setIsUndef();
	}
	UseMO->setReg(NewVR);
	++NumReuse;
	Changed = true;
	}
	}

	return Changed;
	}

	/// If the instruction is a compare and the previous instruction it's comparing
	/// against already sets (or could be modified to set) the same flag as the
	/// compare, then we can remove the comparison and use the flag from the
	/// previous instruction.
	///
	/// \param MI the candidate compare instruction.
	/// \param MBB not read by this function.
	/// \return True if the target optimized the compare, false otherwise.
	bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
	                                         MachineBasicBlock *MBB) {
	  // If this instruction is a comparison against zero and isn't comparing a
	  // physical register, we can try to optimize it.
	  unsigned SrcReg, SrcReg2;
	  int CmpMask, CmpValue;
	  if (!TII->analyzeCompare(*MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
	      TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
	      (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2)))
	    return false;

	  // Attempt to optimize the comparison instruction.
	  if (TII->optimizeCompareInstr(*MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
	    ++NumCmps;
	    return true;
	  }

	  return false;
	}

	/// Let the target try to optimize the select instruction \p MI.
	///
	/// \param MI the candidate select instruction; it is erased from its parent
	///        when the target reports success.
	/// \param LocalMIs forwarded unchanged to TargetInstrInfo::optimizeSelect.
	/// \return True if \p MI was optimized (and erased), false otherwise.
	bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI,
	                            SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
	  SmallVector<MachineOperand, 4> Cond;
	  unsigned TrueOp = 0, FalseOp = 0;
	  bool Optimizable = false;

	  // Give up when analyzeSelect returns true, or when it does not flag the
	  // instruction as optimizable.
	  const bool Bail =
	      TII->analyzeSelect(*MI, Cond, TrueOp, FalseOp, Optimizable);
	  if (Bail || !Optimizable)
	    return false;

	  if (TII->optimizeSelect(*MI, LocalMIs)) {
	    MI->eraseFromParent();
	    ++NumSelects;
	    return true;
	  }
	  return false;
	}

	/// \brief Ask the target whether a simpler conditional branch can be
	/// generated for \p MI.
	/// \return The result of TargetInstrInfo::optimizeCondBranch for \p MI.
	bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) {
	  MachineInstr &Branch = *MI;
	  return TII->optimizeCondBranch(Branch);
	}

	/// \brief Try to find the next source that share the same register file
	/// for the value defined by \p Reg and \p SubReg.
	/// When true is returned, the \p RewriteMap can be used by the client to
	/// retrieve all Def -> Use along the way up to the next source. Any found
	/// Use that is not itself a key for another entry, is the next source to
	/// use. During the search for the next source, multiple sources can be found
	/// given multiple incoming sources of a PHI instruction. In this case, we
	/// look in each PHI source for the next source; all found next sources must
	/// share the same register file as \p Reg and \p SubReg. The client should
	/// then be capable to rewrite all intermediate PHIs to get the next source.
	/// \param Reg register whose definition is tracked; physical registers are
	/// rejected immediately.
	/// \param SubReg sub-register index paired with \p Reg.
	/// \param RewriteMap filled with one (Def -> ValueTrackerResult) entry per
	/// step followed during the search.
	/// \return False if no alternative sources are available. True otherwise.
	// NOTE(review): the lines below that start with '-' or '+' are removal and
	// addition markers of the enclosing patch, not C++ operators.
	bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
	RewriteMapTy &RewriteMap) {
	// Do not try to find a new source for a physical register.
	// So far we do not have any motivating example for doing that.
	// Thus, instead of maintaining untested code, we will revisit that if
	// that changes at some point.
	if (TargetRegisterInfo::isPhysicalRegister(Reg))
	return false;
	const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);

	// Worklist of (register, sub-register) pairs still to be followed; it starts
	// with the pair passed in and grows with the incoming values of each PHI.
	SmallVector<TargetInstrInfo::RegSubRegPair, 4> SrcToLook;
	TargetInstrInfo::RegSubRegPair CurSrcPair(Reg, SubReg);
	SrcToLook.push_back(CurSrcPair);

	// Number of multi-source (PHI) results traversed so far; the search stops
	// once it reaches RewritePHILimit.
	unsigned PHICount = 0;
	while (!SrcToLook.empty() && PHICount < RewritePHILimit) {
	TargetInstrInfo::RegSubRegPair Pair = SrcToLook.pop_back_val();
	// As explained above, do not handle physical registers
	if (TargetRegisterInfo::isPhysicalRegister(Pair.Reg))
	return false;

	CurSrcPair = Pair;
	ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
	!DisableAdvCopyOpt, TII);
	- ValueTrackerResult Res;
	- bool ShouldRewrite = false;

	- do {
	- // Follow the chain of copies until we reach the top of the use-def chain
	- // or find a more suitable source.
	- Res = ValTracker.getNextSource();
	+ // Follow the chain of copies until we find a more suitable source, a phi
	+ // or have to abort.
	+ while (true) {
	+ ValueTrackerResult Res = ValTracker.getNextSource();
	+ // Abort at the end of a chain (without finding a suitable source).
	if (!Res.isValid())
	- break;
	+ return false;

	// Insert the Def -> Use entry for the recently found source.
	ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
	if (CurSrcRes.isValid()) {
	assert(CurSrcRes == Res && "ValueTrackerResult found must match");
	// An existent entry with multiple sources is a PHI cycle we must avoid.
	// Otherwise it's an entry with a valid next source we already found.
	if (CurSrcRes.getNumSources() > 1) {
	DEBUG(dbgs() << "findNextSource: found PHI cycle, aborting...\n");
	return false;
	}
	break;
	}
	RewriteMap.insert(std::make_pair(CurSrcPair, Res));

	// ValueTrackerResult usually have one source unless it's the result from
	// a PHI instruction. Add the found PHI edges to be looked up further.
	unsigned NumSrcs = Res.getNumSources();
	if (NumSrcs > 1) {
	PHICount++;
	for (unsigned i = 0; i < NumSrcs; ++i)
	SrcToLook.push_back(TargetInstrInfo::RegSubRegPair(
	Res.getSrcReg(i), Res.getSrcSubReg(i)));
	break;
	}

	CurSrcPair.Reg = Res.getSrcReg(0);
	CurSrcPair.SubReg = Res.getSrcSubReg(0);
	// Do not extend the live-ranges of physical registers as they add
	// constraints to the register allocator. Moreover, if we want to extend
	// the live-range of a physical register, unlike SSA virtual register,
	// we will have to check that they aren't redefine before the related use.
	if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
	return false;

	+ // Keep following the chain if the value isn't any better yet.
	const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
	- ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
	- CurSrcPair.SubReg);
	- } while (!ShouldRewrite);
	+ if (!TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC, CurSrcPair.SubReg))
	+ continue;

	- // Continue looking for new sources...
	- if (Res.isValid())
	- continue;
	+ // We currently cannot deal with subreg operands on PHI instructions
	+ // (see insertPHI()).
	+ if (PHICount > 0 && CurSrcPair.SubReg != 0)
	+ continue;

	- // Do not continue searching for a new source if the there's at least
	- // one use-def which cannot be rewritten.
	- if (!ShouldRewrite)
	- return false;
	+ // We found a suitable source, and are done with this chain.
	+ break;
	+ }
	}

	- if (PHICount >= RewritePHILimit) {
	- DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
	- return false;
	- }
	-
	// If we did not find a more suitable source, there is nothing to optimize.
	return CurSrcPair.Reg != Reg;
	}

	/// \brief Insert a PHI instruction with incoming edges \p SrcRegs that are
	/// guaranteed to have the same register class. This is necessary whenever we
	/// successfully traverse a PHI instruction and find suitable sources coming
	/// from its edges. By inserting a new PHI, we provide a rewritten PHI def
	/// suitable to be used in a new COPY instruction.
	/// \param MRI register info used to create the new virtual register and to
	/// clear the kill flags of every incoming register.
	/// \param TII provides the descriptor of TargetOpcode::PHI.
	/// \param SrcRegs incoming values of the new PHI, in the same order as the
	/// incoming blocks of \p OrigPHI; must not be empty.
	/// \param OrigPHI the PHI whose incoming blocks are reused; the new PHI is
	/// inserted right before it.
	/// \return The newly created PHI instruction.
	static MachineInstr *
	insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
	const SmallVectorImpl<TargetInstrInfo::RegSubRegPair> &SrcRegs,
	MachineInstr *OrigPHI) {
	assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");

	const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
	+ // NewRC is only correct if no subregisters are involved. findNextSource()
	+ // should have rejected those cases already.
	+ assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand");
	unsigned NewVR = MRI->createVirtualRegister(NewRC);
	MachineBasicBlock *MBB = OrigPHI->getParent();
	MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
	TII->get(TargetOpcode::PHI), NewVR);

	// Operands of OrigPHI are read at indices 2, 4, 6, ... to obtain the
	// incoming block that goes with each entry of SrcRegs.
	unsigned MBBOpIdx = 2;
	for (auto RegPair : SrcRegs) {
	MIB.addReg(RegPair.Reg, 0, RegPair.SubReg);
	MIB.addMBB(OrigPHI->getOperand(MBBOpIdx).getMBB());
	// Since we're extending the lifetime of RegPair.Reg, clear the
	// kill flags to account for that and make RegPair.Reg reach
	// the new PHI.
	MRI->clearKillFlags(RegPair.Reg);
	MBBOpIdx += 2;
	}

	return MIB;
	}

	namespace {

	/// \brief Helper class to rewrite the arguments of a copy-like instruction.
	///
	/// This base class handles a plain COPY; subclasses override the virtual
	/// methods for other copy-like instructions.
	class CopyRewriter {
	protected:
	  /// The copy-like instruction.
	  MachineInstr &CopyLike;

	  /// The index of the source being rewritten.
	  unsigned CurrentSrcIdx = 0;

	public:
	  CopyRewriter(MachineInstr &MI) : CopyLike(MI) {}
	  virtual ~CopyRewriter() = default;

	  /// \brief Get the next rewritable source (SrcReg, SrcSubReg) and
	  /// the related value that it affects (TrackReg, TrackSubReg).
	  /// A source is considered rewritable if its register class and the
	  /// register class of the related TrackReg may not be register
	  /// coalescer friendly. In other words, given a copy-like instruction
	  /// not all the arguments may be returned as rewritable sources, since
	  /// some arguments are known to be register coalescer friendly.
	  ///
	  /// Each call of this method moves the current source to the next
	  /// rewritable source.
	  /// For instance, let CopyLike be the instruction to rewrite.
	  /// CopyLike has one definition and one source:
	  /// dst.dstSubIdx = CopyLike src.srcSubIdx.
	  ///
	  /// The first call will give the first rewritable source, i.e.,
	  /// the only source this instruction has:
	  /// (SrcReg, SrcSubReg) = (src, srcSubIdx).
	  /// This source defines the whole definition, i.e.,
	  /// (TrackReg, TrackSubReg) = (dst, dstSubIdx).
	  ///
	  /// The second and subsequent calls will return false, as there is only one
	  /// rewritable source.
	  ///
	  /// \return True if a rewritable source has been found, false otherwise.
	  /// The output arguments are valid if and only if true is returned.
	  virtual bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
	                                       unsigned &TrackReg,
	                                       unsigned &TrackSubReg) {
	    // If CurrentSrcIdx == 1, this means this function has already been called
	    // once. CopyLike has one definition and one argument, thus, there is
	    // nothing else to rewrite.
	    if (!CopyLike.isCopy() || CurrentSrcIdx == 1)
	      return false;
	    // This is the first call to getNextRewritableSource.
	    // Move the CurrentSrcIdx to remember that we made that call.
	    CurrentSrcIdx = 1;
	    // The rewritable source is the argument.
	    const MachineOperand &MOSrc = CopyLike.getOperand(1);
	    SrcReg = MOSrc.getReg();
	    SrcSubReg = MOSrc.getSubReg();
	    // What we track are the alternative sources of the definition.
	    const MachineOperand &MODef = CopyLike.getOperand(0);
	    TrackReg = MODef.getReg();
	    TrackSubReg = MODef.getSubReg();
	    return true;
	  }

	  /// \brief Rewrite the current source with \p NewReg and \p NewSubReg
	  /// if possible.
	  /// \return True if the rewriting was possible, false otherwise.
	  virtual bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) {
	    if (!CopyLike.isCopy() || CurrentSrcIdx != 1)
	      return false;
	    MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx);
	    MOSrc.setReg(NewReg);
	    MOSrc.setSubReg(NewSubReg);
	    return true;
	  }

	  /// \brief Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
	  /// the new source to use for rewrite. If \p HandleMultipleSources is true and
	  /// multiple sources for a given \p Def are found along the way, we found a
	  /// PHI instruction that needs to be rewritten.
	  /// TODO: HandleMultipleSources should be removed once we test PHI handling
	  /// with coalescable copies.
	  ///
	  /// \return The pair to use as the new source. The pair (0, 0) is returned
	  /// when a multi-source entry is reached while \p HandleMultipleSources is
	  /// false.
	  TargetInstrInfo::RegSubRegPair
	  getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
	               TargetInstrInfo::RegSubRegPair Def,
	               PeepholeOptimizer::RewriteMapTy &RewriteMap,
	               bool HandleMultipleSources = true) {
	    TargetInstrInfo::RegSubRegPair LookupSrc(Def.Reg, Def.SubReg);
	    do {
	      ValueTrackerResult Res = RewriteMap.lookup(LookupSrc);
	      // If there are no entries on the map, LookupSrc is the new source.
	      if (!Res.isValid())
	        return LookupSrc;

	      // There's only one source for this definition, keep searching...
	      unsigned NumSrcs = Res.getNumSources();
	      if (NumSrcs == 1) {
	        LookupSrc.Reg = Res.getSrcReg(0);
	        LookupSrc.SubReg = Res.getSrcSubReg(0);
	        continue;
	      }

	      // TODO: Remove once multiple srcs w/ coalescable copies are supported.
	      if (!HandleMultipleSources)
	        break;

	      // Multiple sources, recurse into each source to find a new source
	      // for it. Then, rewrite the PHI accordingly to its new edges.
	      SmallVector<TargetInstrInfo::RegSubRegPair, 4> NewPHISrcs;
	      for (unsigned i = 0; i < NumSrcs; ++i) {
	        TargetInstrInfo::RegSubRegPair PHISrc(Res.getSrcReg(i),
	                                              Res.getSrcSubReg(i));
	        NewPHISrcs.push_back(
	            getNewSource(MRI, TII, PHISrc, RewriteMap, HandleMultipleSources));
	      }

	      // Build the new PHI node and return its def register as the new source.
	      MachineInstr *OrigPHI = const_cast<MachineInstr *>(Res.getInst());
	      MachineInstr *NewPHI = insertPHI(MRI, TII, NewPHISrcs, OrigPHI);
	      DEBUG(dbgs() << "-- getNewSource\n");
	      DEBUG(dbgs() << " Replacing: " << *OrigPHI);
	      DEBUG(dbgs() << " With: " << *NewPHI);
	      const MachineOperand &MODef = NewPHI->getOperand(0);
	      return TargetInstrInfo::RegSubRegPair(MODef.getReg(), MODef.getSubReg());

	    } while (true);

	    // Only reached through the break above.
	    return TargetInstrInfo::RegSubRegPair(0, 0);
	  }

	  /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap
	  /// and create a new COPY instruction. More info about RewriteMap in
	  /// PeepholeOptimizer::findNextSource. Right now this is only used to handle
	  /// Uncoalescable copies, since they are copy like instructions that aren't
	  /// recognized by the register allocator.
	  ///
	  /// \return The base implementation creates nothing and returns nullptr.
	  virtual MachineInstr *
	  RewriteSource(TargetInstrInfo::RegSubRegPair Def,
	                PeepholeOptimizer::RewriteMapTy &RewriteMap) {
	    return nullptr;
	  }
	};

	/// \brief Helper class to rewrite uncoalescable copy like instructions
	/// into new COPY (coalescable friendly) instructions.
	class UncoalescableRewriter : public CopyRewriter {
	protected:
	const TargetInstrInfo &TII;
	MachineRegisterInfo &MRI;

	/// The number of defs in the bitcast
	unsigned NumDefs;

	public:
	UncoalescableRewriter(MachineInstr &MI, const TargetInstrInfo &TII,
	MachineRegisterInfo &MRI)
	: CopyRewriter(MI), TII(TII), MRI(MRI) {
	NumDefs = MI.getDesc().getNumDefs();
	}

	/// \brief Get the next rewritable def source (TrackReg, TrackSubReg)
	/// All such sources need to be considered rewritable in order to
	/// rewrite a uncoalescable copy-like instruction. This method return
	/// each definition that must be checked if rewritable.
	bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
	unsigned &TrackReg,
	unsigned &TrackSubReg) override {
	// Find the next non-dead definition and continue from there.
	if (CurrentSrcIdx == NumDefs)
	return false;

	while (CopyLike.getOperand(CurrentSrcIdx).isDead()) {
	++CurrentSrcIdx;
	if (CurrentSrcIdx == NumDefs)
	return false;
	}

	// What we track are the alternative sources of the definition.
	const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx);
	TrackReg = MODef.getReg();
	TrackSubReg = MODef.getSubReg();

	CurrentSrcIdx++;
	return true;
	}

	/// \brief Rewrite the source found through \p Def, by using the \p RewriteMap
	/// and create a new COPY instruction. More info about RewriteMap in
	/// PeepholeOptimizer::findNextSource. Right now this is only used to handle
	/// Uncoalescable copies, since they are copy like instructions that aren't
	/// recognized by the register allocator.
	MachineInstr *
	RewriteSource(TargetInstrInfo::RegSubRegPair Def,
	PeepholeOptimizer::RewriteMapTy &RewriteMap) override {
	assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) &&
	"We do not rewrite physical registers");

	// Find the new source to use in the COPY rewrite.
	TargetInstrInfo::RegSubRegPair NewSrc =
	getNewSource(&MRI, &TII, Def, RewriteMap);

	// Insert the COPY.
	const TargetRegisterClass *DefRC = MRI.getRegClass(Def.Reg);
	unsigned NewVR = MRI.createVirtualRegister(DefRC);

	MachineInstr *NewCopy =
	BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
	TII.get(TargetOpcode::COPY), NewVR)
	.addReg(NewSrc.Reg, 0, NewSrc.SubReg);

	NewCopy->getOperand(0).setSubReg(Def.SubReg);
	if (Def.SubReg)
	NewCopy->getOperand(0).setIsUndef();

	DEBUG(dbgs() << "-- RewriteSource\n");
	DEBUG(dbgs() << " Replacing: " << CopyLike);
	DEBUG(dbgs() << " With: " << *NewCopy);
	MRI.replaceRegWith(Def.Reg, NewVR);
	MRI.clearKillFlags(NewVR);

	// We extended the lifetime of NewSrc.Reg, clear the kill flags to
	// account for that.
	MRI.clearKillFlags(NewSrc.Reg);

	return NewCopy;
	}
	};

	/// \brief Specialized rewriter for INSERT_SUBREG instruction.
	class InsertSubregRewriter : public CopyRewriter {
	public:
	InsertSubregRewriter(MachineInstr &MI) : CopyRewriter(MI) {
	assert(MI.isInsertSubreg() && "Invalid instruction");
	}

	/// \brief See CopyRewriter::getNextRewritableSource.
	/// Here CopyLike has the following form:
	/// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx.
	/// Src1 has the same register class has dst, hence, there is
	/// nothing to rewrite.
	/// Src2.src2SubIdx, may not be register coalescer friendly.
	/// Therefore, the first call to this method returns:
	/// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
	/// (TrackReg, TrackSubReg) = (dst, subIdx).
	///
	/// Subsequence calls will return false.
	bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
	unsigned &TrackReg,
	unsigned &TrackSubReg) override {
	// If we already get the only source we can rewrite, return false.
	if (CurrentSrcIdx == 2)
	return false;
	// We are looking at v2 = INSERT_SUBREG v0, v1, sub0.
	CurrentSrcIdx = 2;
	const MachineOperand &MOInsertedReg = CopyLike.getOperand(2);
	SrcReg = MOInsertedReg.getReg();
	SrcSubReg = MOInsertedReg.getSubReg();
	const MachineOperand &MODef = CopyLike.getOperand(0);

	// We want to track something that is compatible with the
	// partial definition.
	TrackReg = MODef.getReg();
	if (MODef.getSubReg())
	// Bail if we have to compose sub-register indices.
	return false;
	TrackSubReg = (unsigned)CopyLike.getOperand(3).getImm();
	return true;
	}

	bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
	if (CurrentSrcIdx != 2)
	return false;
	// We are rewriting the inserted reg.
	MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
	MO.setReg(NewReg);
	MO.setSubReg(NewSubReg);
	return true;
	}
	};

	/// \brief Specialized rewriter for EXTRACT_SUBREG instruction.
	class ExtractSubregRewriter : public CopyRewriter {
	const TargetInstrInfo &TII;

	public:
	ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII)
	: CopyRewriter(MI), TII(TII) {
	assert(MI.isExtractSubreg() && "Invalid instruction");
	}

	/// \brief See CopyRewriter::getNextRewritableSource.
	/// Here CopyLike has the following form:
	/// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx.
	/// There is only one rewritable source: Src.subIdx,
	/// which defines dst.dstSubIdx.
	bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
	unsigned &TrackReg,
	unsigned &TrackSubReg) override {
	// If we already get the only source we can rewrite, return false.
	if (CurrentSrcIdx == 1)
	return false;
	// We are looking at v1 = EXTRACT_SUBREG v0, sub0.
	CurrentSrcIdx = 1;
	const MachineOperand &MOExtractedReg = CopyLike.getOperand(1);
	SrcReg = MOExtractedReg.getReg();
	// If we have to compose sub-register indices, bail out.
	if (MOExtractedReg.getSubReg())
	return false;

	SrcSubReg = CopyLike.getOperand(2).getImm();

	// We want to track something that is compatible with the definition.
	const MachineOperand &MODef = CopyLike.getOperand(0);
	TrackReg = MODef.getReg();
	TrackSubReg = MODef.getSubReg();
	return true;
	}

	bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
	// The only source we can rewrite is the input register.
	if (CurrentSrcIdx != 1)
	return false;

	CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg);

	// If we find a source that does not require to extract something,
	// rewrite the operation with a copy.
	if (!NewSubReg) {
	// Move the current index to an invalid position.
	// We do not want another call to this method to be able
	// to do any change.
	CurrentSrcIdx = -1;
	// Rewrite the operation as a COPY.
	// Get rid of the sub-register index.
	CopyLike.RemoveOperand(2);
	// Morph the operation into a COPY.
	CopyLike.setDesc(TII.get(TargetOpcode::COPY));
	return true;
	}
	CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg);
	return true;
	}
	};

	/// \brief Specialized rewriter for REG_SEQUENCE instruction.
	class RegSequenceRewriter : public CopyRewriter {
	public:
	  RegSequenceRewriter(MachineInstr &MI) : CopyRewriter(MI) {
	    assert(MI.isRegSequence() && "Invalid instruction");
	  }

	  /// \brief See CopyRewriter::getNextRewritableSource.
	  /// Here CopyLike has the following form:
	  /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2.
	  /// Each call will return a different source, walking all the available
	  /// sources.
	  ///
	  /// The first call returns:
	  /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx).
	  /// (TrackReg, TrackSubReg) = (dst, subIdx1).
	  ///
	  /// The second call returns:
	  /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
	  /// (TrackReg, TrackSubReg) = (dst, subIdx2).
	  ///
	  /// And so on, until all the sources have been traversed, then
	  /// it returns false. False is also returned for a source (or a definition)
	  /// that carries a sub-register index, since composing indices is not
	  /// supported.
	  bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
	                               unsigned &TrackReg,
	                               unsigned &TrackSubReg) override {
	    // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc.

	    // If this is the first call, move to the first argument.
	    if (CurrentSrcIdx == 0) {
	      CurrentSrcIdx = 1;
	    } else {
	      // Otherwise, move to the next argument and check that it is valid.
	      CurrentSrcIdx += 2;
	      if (CurrentSrcIdx >= CopyLike.getNumOperands())
	        return false;
	    }
	    const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx);
	    SrcReg = MOInsertedReg.getReg();
	    // If we have to compose sub-register indices, bail out.
	    if ((SrcSubReg = MOInsertedReg.getSubReg()))
	      return false;

	    // We want to track something that is compatible with the related
	    // partial definition.
	    TrackSubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm();

	    const MachineOperand &MODef = CopyLike.getOperand(0);
	    TrackReg = MODef.getReg();
	    // If we have to compose sub-registers, bail.
	    return MODef.getSubReg() == 0;
	  }

	  /// \brief Replace the currently selected source operand by
	  /// \p NewReg.\p NewSubReg.
	  /// \return False when no valid source operand is currently selected.
	  bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
	    // We cannot rewrite out of bound operands.
	    // Moreover, rewritable sources are at odd positions.
	    // Valid operand indices are [0, getNumOperands()), so an index equal to
	    // the operand count must be rejected as well.
	    if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx >= CopyLike.getNumOperands())
	      return false;

	    MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
	    MO.setReg(NewReg);
	    MO.setSubReg(NewSubReg);
	    return true;
	  }
	};

	} // end anonymous namespace

	/// \brief Get the appropriate CopyRewriter for \p MI.
	/// Uncoalescable copy-like instructions (bitcast, and the "like" variants of
	/// REG_SEQUENCE, INSERT_SUBREG and EXTRACT_SUBREG) get an
	/// UncoalescableRewriter; otherwise the rewriter is chosen by opcode.
	/// \return A pointer to a dynamically allocated CopyRewriter or nullptr
	/// if no rewriter works for \p MI. The caller owns the returned object.
	static CopyRewriter *getCopyRewriter(MachineInstr &MI,
	                                     const TargetInstrInfo &TII,
	                                     MachineRegisterInfo &MRI) {
	  // Handle uncoalescable copy-like instructions.
	  if (MI.isBitcast() || (MI.isRegSequenceLike() || MI.isInsertSubregLike() ||
	                         MI.isExtractSubregLike()))
	    return new UncoalescableRewriter(MI, TII, MRI);

	  switch (MI.getOpcode()) {
	  default:
	    return nullptr;
	  case TargetOpcode::COPY:
	    return new CopyRewriter(MI);
	  case TargetOpcode::INSERT_SUBREG:
	    return new InsertSubregRewriter(MI);
	  case TargetOpcode::EXTRACT_SUBREG:
	    return new ExtractSubregRewriter(MI, TII);
	  case TargetOpcode::REG_SEQUENCE:
	    return new RegSequenceRewriter(MI);
	  }
	  llvm_unreachable(nullptr);
	}

	/// \brief Optimize generic copy instructions to avoid cross
	/// register bank copy. The optimization looks through a chain of
	/// copies and tries to find a source that has a compatible register
	/// class.
	/// Two register classes are considered to be compatible if they share
	/// the same register bank.
	/// New copies issued by this optimization are register allocator
	/// friendly. This optimization does not remove any copy as it may
	/// overconstrain the register allocator, but replaces some operands
	/// when possible.
	/// \pre isCoalescableCopy(*MI) is true.
	/// \return True, when \p MI has been rewritten. False otherwise.
	bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr *MI) {
	  assert(MI && isCoalescableCopy(*MI) && "Invalid argument");
	  assert(MI->getDesc().getNumDefs() == 1 &&
	         "Coalescer can understand multiple defs?!");
	  const MachineOperand &MODef = MI->getOperand(0);
	  // Do not rewrite physical definitions.
	  if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
	    return false;

	  bool Changed = false;
	  // Get the right rewriter for the current copy. getCopyRewriter takes
	  // references, so the pointers held here are dereferenced.
	  std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI));
	  // If none exists, bail out.
	  if (!CpyRewriter)
	    return false;
	  // Rewrite each rewritable source.
	  unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg;
	  while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg,
	                                              TrackSubReg)) {
	    // Keep track of PHI nodes and its incoming edges when looking for sources.
	    RewriteMapTy RewriteMap;
	    // Try to find a more suitable source. If we failed to do so, or get the
	    // actual source, move to the next source.
	    if (!findNextSource(TrackReg, TrackSubReg, RewriteMap))
	      continue;

	    // Get the new source to rewrite. TODO: Only enable handling of multiple
	    // sources (PHIs) once we have a motivating example and testcases for it.
	    TargetInstrInfo::RegSubRegPair TrackPair(TrackReg, TrackSubReg);
	    TargetInstrInfo::RegSubRegPair NewSrc = CpyRewriter->getNewSource(
	        MRI, TII, TrackPair, RewriteMap, false /* multiple sources */);
	    // Skip when the source would not change, or when no single new source
	    // could be determined (register 0).
	    if (SrcReg == NewSrc.Reg || NewSrc.Reg == 0)
	      continue;

	    // Rewrite source.
	    if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) {
	      // We may have extended the live-range of NewSrc, account for that.
	      MRI->clearKillFlags(NewSrc.Reg);
	      Changed = true;
	    }
	  }
	  // TODO: We could have a clean-up method to tidy the instruction.
	  // E.g., v0 = INSERT_SUBREG v1, v1.sub0, sub0
	  // => v0 = COPY v1
	  // Currently we haven't seen motivating example for that and we
	  // want to avoid untested code.
	  NumRewrittenCopies += Changed;
	  return Changed;
	}

	/// \brief Optimize copy-like instructions to create
	/// register coalescer friendly instruction.
	/// The optimization tries to kill-off the \p MI by looking
	/// through a chain of copies to find a source that has a compatible
	/// register class.
	/// If such a source is found, it replace \p MI by a generic COPY
	/// operation.
	/// \pre isUncoalescableCopy(*MI) is true.
	/// \return True, when \p MI has been optimized. In that case, \p MI has
	/// been removed from its parent.
	/// All COPY instructions created, are inserted in \p LocalMIs.
	bool PeepholeOptimizer::optimizeUncoalescableCopy(
	MachineInstr MI, SmallPtrSetImpl<MachineInstr > &LocalMIs) {
	assert(MI && isUncoalescableCopy(*MI) && "Invalid argument");

	// Check if we can rewrite all the values defined by this instruction.
	SmallVector<TargetInstrInfo::RegSubRegPair, 4> RewritePairs;
	// Get the right rewriter for the current copy.
	std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(MI, TII, *MRI));
	// If none exists, bail out.
	if (!CpyRewriter)
	return false;

	// Rewrite each rewritable source by generating new COPYs. This works
	// differently from optimizeCoalescableCopy since it first makes sure that all
	// definitions can be rewritten.
	RewriteMapTy RewriteMap;
	unsigned Reg, SubReg, CopyDefReg, CopyDefSubReg;
	while (CpyRewriter->getNextRewritableSource(Reg, SubReg, CopyDefReg,
	CopyDefSubReg)) {
	// If a physical register is here, this is probably for a good reason.
	// Do not rewrite that.
	if (TargetRegisterInfo::isPhysicalRegister(CopyDefReg))
	return false;

	// If we do not know how to rewrite this definition, there is no point
	// in trying to kill this instruction.
	TargetInstrInfo::RegSubRegPair Def(CopyDefReg, CopyDefSubReg);
	if (!findNextSource(Def.Reg, Def.SubReg, RewriteMap))
	return false;

	RewritePairs.push_back(Def);
	}

	// The change is possible for all defs, do it.
	for (const auto &Def : RewritePairs) {
	// Rewrite the "copy" in a way the register coalescer understands.
	MachineInstr *NewCopy = CpyRewriter->RewriteSource(Def, RewriteMap);
	assert(NewCopy && "Should be able to always generate a new copy");
	LocalMIs.insert(NewCopy);
	}

	// MI is now dead.
	MI->eraseFromParent();
	++NumUncoalescableCopies;
	return true;
	}

	/// Check whether MI is a candidate for folding into a later instruction.
	/// We only fold loads to virtual registers and the virtual register defined
	/// has a single use.
	///
	/// \param MI the instruction to inspect.
	/// \param FoldAsLoadDefCandidates receives the register defined by \p MI when
	///        it qualifies.
	/// \return True if \p MI was recorded as a candidate, false otherwise.
	bool PeepholeOptimizer::isLoadFoldable(
	    MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) {
	  if (!MI->canFoldAsLoad() || !MI->mayLoad())
	    return false;
	  const MCInstrDesc &MCID = MI->getDesc();
	  if (MCID.getNumDefs() != 1)
	    return false;

	  unsigned Reg = MI->getOperand(0).getReg();
	  // To reduce compilation time, we check MRI->hasOneNonDBGUse when inserting
	  // loads. It should be checked when processing uses of the load, since
	  // uses can be removed during peephole.
	  if (!MI->getOperand(0).getSubReg() &&
	      TargetRegisterInfo::isVirtualRegister(Reg) &&
	      MRI->hasOneNonDBGUse(Reg)) {
	    FoldAsLoadDefCandidates.insert(Reg);
	    return true;
	  }
	  return false;
	}

	/// Record \p MI as an immediate definition when it is a move-immediate with
	/// exactly one def whose destination (operand 0) is a virtual register.
	///
	/// On success the destination register is added to \p ImmDefRegs, the
	/// (register, instruction) pair is inserted into \p ImmDefMIs, and true is
	/// returned. Otherwise nothing is recorded and false is returned.
	bool PeepholeOptimizer::isMoveImmediate(
	    MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs,
	    DenseMap<unsigned, MachineInstr *> &ImmDefMIs) {
	  // Only single-def move-immediate instructions are of interest.
	  if (!MI->isMoveImmediate() || MI->getDesc().getNumDefs() != 1)
	    return false;

	  const unsigned DstReg = MI->getOperand(0).getReg();
	  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
	    return false;

	  ImmDefMIs.insert(std::make_pair(DstReg, MI));
	  ImmDefRegs.insert(DstReg);
	  return true;
	}

	/// Try folding register operands that are defined by move immediate
	/// instructions, i.e. a trivial constant folding optimization, if
	/// and only if the def and use are in the same BB.
	bool PeepholeOptimizer::foldImmediate(
	MachineInstr MI, MachineBasicBlock MBB, SmallSet<unsigned, 4> &ImmDefRegs,
	DenseMap<unsigned, MachineInstr *> &ImmDefMIs) {
	for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
	MachineOperand &MO = MI->getOperand(i);
	if (!MO.isReg() \|\| MO.isDef())
	continue;
	// Ignore dead implicit defs.
	if (MO.isImplicit() && MO.isDead())
	continue;
	unsigned Reg = MO.getReg();
	if (!TargetRegisterInfo::isVirtualRegister(Reg))
	continue;
	if (ImmDefRegs.count(Reg) == 0)
	continue;
	DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg);
	assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
	if (TII->FoldImmediate(MI, II->second, Reg, MRI)) {
	++NumImmFold;
	return true;
	}
	}
	return false;
	}

	// Fold a virtual-to-virtual COPY that duplicates an earlier COPY of the same
	// source register: all uses of the later destination are redirected to the
	// earlier destination. Returns true when \p MI became redundant this way.
	//
	// FIXME: This is deliberately simple and misses cases that should be handled
	// once motivating examples are found.
	//
	// The copy rewriting logic should look at uses as well as defs and be able to
	// eliminate copies across blocks.
	//
	// Later copies that are subregister extracts are not eliminated either,
	// because only the first copy of a source register is remembered.
	//
	// e.g.
	// %1 = COPY %0
	// %2 = COPY %0:sub1
	//
	// Should replace %2 uses with %1:sub1
	bool PeepholeOptimizer::foldRedundantCopy(
	    MachineInstr *MI, SmallSet<unsigned, 4> &CopySrcRegs,
	    DenseMap<unsigned, MachineInstr *> &CopyMIs) {
	  assert(MI->isCopy() && "expected a COPY machine instruction");

	  // Both ends of the copy must be virtual registers.
	  const unsigned SrcReg = MI->getOperand(1).getReg();
	  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
	    return false;

	  const unsigned DstReg = MI->getOperand(0).getReg();
	  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
	    return false;

	  // The first copy seen for a source register is only remembered.
	  const bool FirstCopyOfSrc = CopySrcRegs.insert(SrcReg).second;
	  if (FirstCopyOfSrc) {
	    CopyMIs.insert(std::make_pair(SrcReg, MI));
	    return false;
	  }

	  MachineInstr *EarlierCopy = CopyMIs.find(SrcReg)->second;

	  // Copies extracting different subregisters are not interchangeable.
	  const unsigned ThisSubReg = MI->getOperand(1).getSubReg();
	  const unsigned EarlierSubReg = EarlierCopy->getOperand(1).getSubReg();
	  if (ThisSubReg != EarlierSubReg)
	    return false;

	  const unsigned EarlierDstReg = EarlierCopy->getOperand(0).getReg();

	  // The two destinations must share a register class.
	  //
	  // TODO: If we have multiple copies to different register classes, we may want
	  // to track multiple copies of the same source register.
	  if (MRI->getRegClass(DstReg) != MRI->getRegClass(EarlierDstReg))
	    return false;

	  MRI->replaceRegWith(DstReg, EarlierDstReg);

	  // The earlier destination now lives longer, so its kill flags are stale.
	  MRI->clearKillFlags(EarlierDstReg);
	  return true;
	}

	/// Returns true when \p Reg is a physical register that MRI reports as not
	/// allocatable.
	bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) {
	  if (!TargetRegisterInfo::isPhysicalRegister(Reg))
	    return false;
	  return !MRI->isAllocatable(Reg);
	}

	bool PeepholeOptimizer::foldRedundantNAPhysCopy(
	MachineInstr MI, DenseMap<unsigned, MachineInstr > &NAPhysToVirtMIs) {
	assert(MI->isCopy() && "expected a COPY machine instruction");

	if (DisableNAPhysCopyOpt)
	return false;

	unsigned DstReg = MI->getOperand(0).getReg();
	unsigned SrcReg = MI->getOperand(1).getReg();
	if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) {
	// %vreg = COPY %physreg
	// Avoid using a datastructure which can track multiple live non-allocatable
	// phys->virt copies since LLVM doesn't seem to do this.
	NAPhysToVirtMIs.insert({SrcReg, MI});
	return false;
	}

	if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
	return false;

	// %physreg = COPY %vreg
	auto PrevCopy = NAPhysToVirtMIs.find(DstReg);
	if (PrevCopy == NAPhysToVirtMIs.end()) {
	// We can't remove the copy: there was an intervening clobber of the
	// non-allocatable physical register after the copy to virtual.
	DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI
	<< '\n');
	return false;
	}

	unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg();
	if (PrevDstReg == SrcReg) {
	// Remove the virt->phys copy: we saw the virtual register definition, and
	// the non-allocatable physical register's state hasn't changed since then.
	DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n');
	++NumNAPhysCopies;
	return true;
	}

	// Potential missed optimization opportunity: we saw a different virtual
	// register get a copy of the non-allocatable physical register, and we only
	// track one such copy. Avoid getting confused by this new non-allocatable
	// physical register definition, and remove it from the tracked copies.
	DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n');
	NAPhysToVirtMIs.erase(PrevCopy);
	return false;
	}

	/// \brief Tells whether \p MO is a register operand holding a virtual
	/// register.
	static bool isVirtualRegisterOperand(MachineOperand &MO) {
	  return MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg());
	}

	bool PeepholeOptimizer::findTargetRecurrence(
	unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
	RecurrenceCycle &RC) {
	// Recurrence found if Reg is in TargetRegs.
	if (TargetRegs.count(Reg))
	return true;

	// TODO: Curerntly, we only allow the last instruction of the recurrence
	// cycle (the instruction that feeds the PHI instruction) to have more than
	// one uses to guarantee that commuting operands does not tie registers
	// with overlapping live range. Once we have actual live range info of
	// each register, this constraint can be relaxed.
	if (!MRI->hasOneNonDBGUse(Reg))
	return false;

	// Give up if the reccurrence chain length is longer than the limit.
	if (RC.size() >= MaxRecurrenceChain)
	return false;

	MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
	unsigned Idx = MI.findRegisterUseOperandIdx(Reg);

	// Only interested in recurrences whose instructions have only one def, which
	// is a virtual register.
	if (MI.getDesc().getNumDefs() != 1)
	return false;

	MachineOperand &DefOp = MI.getOperand(0);
	if (!isVirtualRegisterOperand(DefOp))
	return false;

	// Check if def operand of MI is tied to any use operand. We are only
	// interested in the case that all the instructions in the recurrence chain
	// have there def operand tied with one of the use operand.
	unsigned TiedUseIdx;
	if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
	return false;

	if (Idx == TiedUseIdx) {
	RC.push_back(RecurrenceInstr(&MI));
	return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
	} else {
	// If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
	unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
	if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
	RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
	return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
	}
	}

	return false;
	}

	/// \brief Phi instructions will eventually be lowered to copy instructions. If
	/// phi is in a loop header, a recurrence may be formulated around the source
	/// and destination of the phi. For such case commuting operands of the
	/// instructions in the recurrence may enable coalescing of the copy
	/// instruction generated from the phi. For example, if there is a recurrence of
	///
	/// LoopHeader:
	/// %1 = phi(%0, %100)
	/// LoopLatch:
	/// %0<def, tied1> = ADD %2<def, tied0>, %1
	///
	/// , the fact that %0 and %2 are in the same tied operands set makes
	/// the coalescing of copy instruction generated from the phi in
	/// LoopHeader(i.e. %1 = COPY %0) impossible, because %1 and
	/// %2 have overlapping live range. This introduces additional move
	/// instruction to the final assembly. However, if we commute %2 and
	/// %1 of ADD instruction, the redundant move instruction can be
	/// avoided.
	///
	/// \param PHI the PHI instruction whose incoming registers are the targets
	///        of the recurrence search.
	/// \returns true when at least one instruction was selected for commuting.
	bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
	  // Incoming values sit at the odd operand indices of the PHI.
	  SmallSet<unsigned, 2> TargetRegs;
	  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
	    MachineOperand &MO = PHI.getOperand(Idx);
	    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
	    TargetRegs.insert(MO.getReg());
	  }

	  bool Changed = false;
	  RecurrenceCycle RC;
	  if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
	    // Commutes operands of instructions in RC if necessary so that the copy to
	    // be generated from PHI can be coalesced.
	    DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
	    for (auto &RI : RC) {
	      DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
	      auto CP = RI.getCommutePair();
	      // Only entries that recorded a commute pair need rewriting.
	      if (CP) {
	        Changed = true;
	        TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
	                                (*CP).second);
	        DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
	      }
	    }
	  }

	  return Changed;
	}

	/// Pass entry point: scan every basic block of \p MF front to back and apply
	/// the peephole optimizations implemented by this pass to each instruction.
	///
	/// Tracking state (seen instructions, move-immediate defs, foldable load
	/// candidates, non-allocatable physreg copies, copy sources) is local to each
	/// basic block.
	///
	/// \returns true when any optimization reported a change; false when the
	///          function is skipped, the pass is disabled, or nothing changed.
	bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
	  if (skipFunction(MF.getFunction()))
	    return false;

	  DEBUG(dbgs() << "******** PEEPHOLE OPTIMIZER ********\n");
	  DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n');

	  if (DisablePeephole)
	    return false;

	  TII = MF.getSubtarget().getInstrInfo();
	  TRI = MF.getSubtarget().getRegisterInfo();
	  MRI = &MF.getRegInfo();
	  // The dominator tree is only fetched in aggressive mode.
	  DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
	  MLI = &getAnalysis<MachineLoopInfo>();

	  bool Changed = false;

	  for (MachineBasicBlock &MBB : MF) {
	    bool SeenMoveImm = false;

	    // During this forward scan, at some point it needs to answer the question
	    // "given a pointer to an MI in the current BB, is it located before or
	    // after the current instruction".
	    // To perform this, the following set keeps track of the MIs already seen
	    // during the scan, if a MI is not in the set, it is assumed to be located
	    // after. Newly created MIs have to be inserted in the set as well.
	    SmallPtrSet<MachineInstr*, 16> LocalMIs;
	    SmallSet<unsigned, 4> ImmDefRegs;
	    DenseMap<unsigned, MachineInstr*> ImmDefMIs;
	    SmallSet<unsigned, 16> FoldAsLoadDefCandidates;

	    // Track when a non-allocatable physical register is copied to a virtual
	    // register so that useless moves can be removed.
	    //
	    // %physreg is the map index; MI is the last valid `%vreg = COPY %physreg`
	    // without any intervening re-definition of %physreg.
	    DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs;

	    // Set of virtual registers that are copied from.
	    SmallSet<unsigned, 4> CopySrcRegs;
	    DenseMap<unsigned, MachineInstr *> CopySrcMIs;

	    bool IsLoopHeader = MLI->isLoopHeader(&MBB);

	    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
	         MII != MIE; ) {
	      MachineInstr *MI = &*MII;
	      // We may be erasing MI below, increment MII now.
	      ++MII;
	      LocalMIs.insert(MI);

	      // Skip debug values. They should not affect this peephole optimization.
	      if (MI->isDebugValue())
	        continue;

	      if (MI->isPosition())
	        continue;

	      if (IsLoopHeader && MI->isPHI()) {
	        if (optimizeRecurrence(*MI)) {
	          Changed = true;
	          continue;
	        }
	      }

	      if (!MI->isCopy()) {
	        for (const auto &Op : MI->operands()) {
	          // Visit all operands: definitions can be implicit or explicit.
	          if (Op.isReg()) {
	            unsigned Reg = Op.getReg();
	            if (Op.isDef() && isNAPhysCopy(Reg)) {
	              const auto &Def = NAPhysToVirtMIs.find(Reg);
	              if (Def != NAPhysToVirtMIs.end()) {
	                // A new definition of the non-allocatable physical register
	                // invalidates previous copies.
	                DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
	                             << '\n');
	                NAPhysToVirtMIs.erase(Def);
	              }
	            }
	          } else if (Op.isRegMask()) {
	            // A register mask may clobber any number of tracked registers.
	            const uint32_t *RegMask = Op.getRegMask();
	            for (auto &RegMI : NAPhysToVirtMIs) {
	              unsigned Def = RegMI.first;
	              if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
	                DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
	                             << '\n');
	                NAPhysToVirtMIs.erase(Def);
	              }
	            }
	          }
	        }
	      }

	      if (MI->isImplicitDef() || MI->isKill())
	        continue;

	      if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) {
	        // Blow away all non-allocatable physical registers knowledge since we
	        // don't know what's correct anymore.
	        //
	        // FIXME: handle explicit asm clobbers.
	        DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
	                     << '\n');
	        NAPhysToVirtMIs.clear();
	      }

	      if ((isUncoalescableCopy(*MI) &&
	           optimizeUncoalescableCopy(MI, LocalMIs)) ||
	          (MI->isCompare() && optimizeCmpInstr(MI, &MBB)) ||
	          (MI->isSelect() && optimizeSelect(MI, LocalMIs))) {
	        // MI is deleted.
	        LocalMIs.erase(MI);
	        Changed = true;
	        continue;
	      }

	      if (MI->isConditionalBranch() && optimizeCondBranch(MI)) {
	        Changed = true;
	        continue;
	      }

	      if (isCoalescableCopy(*MI) && optimizeCoalescableCopy(MI)) {
	        // MI is just rewritten.
	        Changed = true;
	        continue;
	      }

	      // The redundant-copy helpers only report redundancy; MI is erased here.
	      if (MI->isCopy() &&
	          (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) ||
	           foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) {
	        LocalMIs.erase(MI);
	        MI->eraseFromParent();
	        Changed = true;
	        continue;
	      }

	      if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
	        SeenMoveImm = true;
	      } else {
	        Changed |= optimizeExtInstr(MI, &MBB, LocalMIs);
	        // optimizeExtInstr might have created new instructions after MI
	        // and before the already incremented MII. Adjust MII so that the
	        // next iteration sees the new instructions.
	        MII = MI;
	        ++MII;
	        if (SeenMoveImm)
	          Changed |= foldImmediate(MI, &MBB, ImmDefRegs, ImmDefMIs);
	      }

	      // Check whether MI is a load candidate for folding into a later
	      // instruction. If MI is not a candidate, check whether we can fold an
	      // earlier load into MI.
	      if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
	          !FoldAsLoadDefCandidates.empty()) {

	        // We visit each operand even after successfully folding a previous
	        // one. This allows us to fold multiple loads into a single
	        // instruction. We do assume that optimizeLoadInstr doesn't insert
	        // foldable uses earlier in the argument list. Since we don't restart
	        // iteration, we'd miss such cases.
	        const MCInstrDesc &MIDesc = MI->getDesc();
	        for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
	             ++i) {
	          const MachineOperand &MOp = MI->getOperand(i);
	          if (!MOp.isReg())
	            continue;
	          unsigned FoldAsLoadDefReg = MOp.getReg();
	          if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
	            // We need to fold load after optimizeCmpInstr, since
	            // optimizeCmpInstr can enable folding by converting SUB to CMP.
	            // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
	            // we need it for markUsesInDebugValueAsUndef().
	            unsigned FoldedReg = FoldAsLoadDefReg;
	            MachineInstr *DefMI = nullptr;
	            if (MachineInstr *FoldMI =
	                    TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
	              // Update LocalMIs since we replaced MI with FoldMI and deleted
	              // DefMI.
	              DEBUG(dbgs() << "Replacing: " << *MI);
	              DEBUG(dbgs() << " With: " << *FoldMI);
	              LocalMIs.erase(MI);
	              LocalMIs.erase(DefMI);
	              LocalMIs.insert(FoldMI);
	              MI->eraseFromParent();
	              DefMI->eraseFromParent();
	              MRI->markUsesInDebugValueAsUndef(FoldedReg);
	              FoldAsLoadDefCandidates.erase(FoldedReg);
	              ++NumLoadFold;

	              // MI is replaced with FoldMI so we can continue trying to fold
	              Changed = true;
	              MI = FoldMI;
	            }
	          }
	        }
	      }

	      // If we run into an instruction we can't fold across, discard
	      // the load candidates. Note: We might be able to fold into this
	      // instruction, so this needs to be after the folding logic.
	      if (MI->isLoadFoldBarrier()) {
	        DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n");
	        FoldAsLoadDefCandidates.clear();
	      }
	    }
	  }

	  return Changed;
	}

	/// Look through a COPY: the tracked value comes from the copy's source
	/// operand (operand 1). Returns an empty result when the tracked
	/// sub-register differs from the one written by the definition.
	ValueTrackerResult ValueTracker::getNextSourceFromCopy() {
	  assert(Def->isCopy() && "Invalid definition");
	  // A COPY is expected to have the shape: Def = Src.
	  // If someone breaks this assumption, bad things will happen everywhere.
	  assert(Def->getNumOperands() == 2 && "Invalid number of operands");

	  // Asking for a different subreg means asking for a subreg of the source;
	  // composing subregs is not supported yet, so give up.
	  const bool WantsOtherSubReg =
	      Def->getOperand(DefIdx).getSubReg() != DefSubReg;
	  if (WantsOtherSubReg)
	    return ValueTrackerResult();

	  // Otherwise the whole source is the answer.
	  const MachineOperand &SrcOp = Def->getOperand(1);
	  return ValueTrackerResult(SrcOp.getReg(), SrcOp.getSubReg());
	}

	/// Look through a bitcast-like instruction with a single def and a single
	/// register source.
	///
	/// \returns the source register and sub-register, or an empty result when
	///          the bitcast has unmodeled side effects, more than one def, a
	///          mismatching sub-register, several register sources, or a
	///          SUBREG_TO_REG user.
	ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
	  assert(Def->isBitcast() && "Invalid definition");

	  // Bail if there are effects that a plain copy will not expose.
	  if (Def->hasUnmodeledSideEffects())
	    return ValueTrackerResult();

	  // Bitcasts with more than one def are not supported.
	  if (Def->getDesc().getNumDefs() != 1)
	    return ValueTrackerResult();
	  // Def is not modified below, so a reference to the operand is sufficient.
	  const MachineOperand &DefOp = Def->getOperand(DefIdx);
	  if (DefOp.getSubReg() != DefSubReg)
	    // If we look for a different subreg, it means we want a subreg of the src.
	    // Bails as we do not support composing subregs yet.
	    return ValueTrackerResult();

	  // SrcIdx starts at the end sentinel and is set once a source is found.
	  unsigned SrcIdx = Def->getNumOperands();
	  for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx;
	       ++OpIdx) {
	    const MachineOperand &MO = Def->getOperand(OpIdx);
	    if (!MO.isReg() || !MO.getReg())
	      continue;
	    // Ignore dead implicit defs.
	    if (MO.isImplicit() && MO.isDead())
	      continue;
	    assert(!MO.isDef() && "We should have skipped all the definitions by now");
	    if (SrcIdx != EndOpIdx)
	      // Multiple sources?
	      return ValueTrackerResult();
	    SrcIdx = OpIdx;
	  }

	  // Stop when any user of the bitcast is a SUBREG_TO_REG, replacing with a COPY
	  // will break the assumed guarantees for the upper bits.
	  for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) {
	    if (UseMI.isSubregToReg())
	      return ValueTrackerResult();
	  }

	  const MachineOperand &Src = Def->getOperand(SrcIdx);
	  return ValueTrackerResult(Src.getReg(), Src.getSubReg());
	}

	/// Look through a REG_SEQUENCE (or REG_SEQUENCE-like) definition.
	///
	/// \returns the input register that is inserted at the tracked sub-register
	///          index, or an empty result when no input matches or when
	///          sub-registers would have to be composed.
	ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() {
	  assert((Def->isRegSequence() || Def->isRegSequenceLike()) &&
	         "Invalid definition");

	  if (Def->getOperand(DefIdx).getSubReg())
	    // If we are composing subregs, bail out.
	    // The case we are checking is Def.<subreg> = REG_SEQUENCE.
	    // This should almost never happen as the SSA property is tracked at
	    // the register level (as opposed to the subreg level).
	    // I.e.,
	    // Def.sub0 =
	    // Def.sub1 =
	    // is a valid SSA representation for Def.sub0 and Def.sub1, but not for
	    // Def. Thus, it must not be generated.
	    // However, some code could theoretically generates a single
	    // Def.sub0 (i.e, not defining the other subregs) and we would
	    // have this case.
	    // If we can ascertain (or force) that this never happens, we could
	    // turn that into an assertion.
	    return ValueTrackerResult();

	  if (!TII)
	    // We could handle the REG_SEQUENCE here, but we do not want to
	    // duplicate the code from the generic TII.
	    return ValueTrackerResult();

	  SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> RegSeqInputRegs;
	  if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs))
	    return ValueTrackerResult();

	  // We are looking at:
	  // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
	  // Check if one of the operand defines the subreg we are interested in.
	  for (auto &RegSeqInput : RegSeqInputRegs) {
	    if (RegSeqInput.SubIdx == DefSubReg) {
	      if (RegSeqInput.SubReg)
	        // Bail if we have to compose sub registers.
	        return ValueTrackerResult();

	      return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg);
	    }
	  }

	  // If the subreg we are tracking is super-defined by another subreg,
	  // we could follow this value. However, this would require to compose
	  // the subreg and we do not do that for now.
	  return ValueTrackerResult();
	}

	/// Look through an INSERT_SUBREG (or INSERT_SUBREG-like) definition.
	///
	/// \returns the inserted register when the tracked sub-register is the one
	///          being inserted; the base register (with the tracked
	///          sub-register) when the inserted lanes do not overlap the tracked
	///          ones; an empty result otherwise.
	ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() {
	  assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) &&
	         "Invalid definition");

	  if (Def->getOperand(DefIdx).getSubReg())
	    // If we are composing subreg, bail out.
	    // Same remark as getNextSourceFromRegSequence.
	    // I.e., this may be turned into an assert.
	    return ValueTrackerResult();

	  if (!TII)
	    // We could handle the INSERT_SUBREG here, but we do not want to
	    // duplicate the code from the generic TII.
	    return ValueTrackerResult();

	  TargetInstrInfo::RegSubRegPair BaseReg;
	  TargetInstrInfo::RegSubRegPairAndIdx InsertedReg;
	  if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg))
	    return ValueTrackerResult();

	  // We are looking at:
	  // Def = INSERT_SUBREG v0, v1, sub1
	  // There are two cases:
	  // 1. DefSubReg == sub1, get v1.
	  // 2. DefSubReg != sub1, the value may be available through v0.

	  // #1 Check if the inserted register matches the required sub index.
	  if (InsertedReg.SubIdx == DefSubReg) {
	    return ValueTrackerResult(InsertedReg.Reg, InsertedReg.SubReg);
	  }
	  // #2 Otherwise, if the sub register we are looking for is not partial
	  // defined by the inserted element, we can look through the main
	  // register (v0).
	  const MachineOperand &MODef = Def->getOperand(DefIdx);
	  // If the result register (Def) and the base register (v0) do not
	  // have the same register class or if we have to compose
	  // subregisters, bail out.
	  if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) ||
	      BaseReg.SubReg)
	    return ValueTrackerResult();

	  // Get the TRI and check if the inserted sub-register overlaps with the
	  // sub-register we are tracking.
	  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
	  if (!TRI ||
	      !(TRI->getSubRegIndexLaneMask(DefSubReg) &
	        TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)).none())
	    return ValueTrackerResult();
	  // At this point, the value is available in v0 via the same subreg
	  // we used for Def.
	  return ValueTrackerResult(BaseReg.Reg, DefSubReg);
	}

	/// Look through an EXTRACT_SUBREG (or EXTRACT_SUBREG-like) definition.
	///
	/// \returns the extracted-from register together with the extracted
	///          sub-register index, or an empty result when sub-registers would
	///          have to be composed or the target cannot describe the inputs.
	ValueTrackerResult ValueTracker::getNextSourceFromExtractSubreg() {
	  assert((Def->isExtractSubreg() ||
	          Def->isExtractSubregLike()) && "Invalid definition");
	  // We are looking at:
	  // Def = EXTRACT_SUBREG v0, sub0

	  // Bail if we have to compose sub registers.
	  // Indeed, if DefSubReg != 0, we would have to compose it with sub0.
	  if (DefSubReg)
	    return ValueTrackerResult();

	  if (!TII)
	    // We could handle the EXTRACT_SUBREG here, but we do not want to
	    // duplicate the code from the generic TII.
	    return ValueTrackerResult();

	  TargetInstrInfo::RegSubRegPairAndIdx ExtractSubregInputReg;
	  if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg))
	    return ValueTrackerResult();

	  // Bail if we have to compose sub registers.
	  // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0.
	  if (ExtractSubregInputReg.SubReg)
	    return ValueTrackerResult();
	  // Otherwise, the value is available in the v0.sub0.
	  return ValueTrackerResult(ExtractSubregInputReg.Reg,
	                            ExtractSubregInputReg.SubIdx);
	}

	/// Look through a SUBREG_TO_REG definition of the shape
	///   Def = SUBREG_TO_REG Imm, v0, sub0
	/// The value is only followed when the tracked sub-register is exactly sub0
	/// and v0 carries no sub-register of its own; otherwise an empty result is
	/// returned.
	ValueTrackerResult ValueTracker::getNextSourceFromSubregToReg() {
	  assert(Def->isSubregToReg() && "Invalid definition");

	  // Tracking anything other than sub0 would require checking that all the
	  // tracked bits are included in sub0 and then finding the matching subreg
	  // in v0, i.e. composing sub registers. Bail instead.
	  const bool TracksInsertedSubReg = DefSubReg == Def->getOperand(3).getImm();
	  if (!TracksInsertedSubReg)
	    return ValueTrackerResult();

	  // Likewise, a non-zero v0.subreg would have to be composed with sub0.
	  const MachineOperand &SrcOp = Def->getOperand(2);
	  if (SrcOp.getSubReg())
	    return ValueTrackerResult();

	  return ValueTrackerResult(SrcOp.getReg(), Def->getOperand(3).getImm());
	}

	/// \brief Collect every incoming register of a PHI definition as a source.
	///
	/// An empty result is returned when the tracked sub-register differs from
	/// the one on the PHI's def operand.
	ValueTrackerResult ValueTracker::getNextSourceFromPHI() {
	  assert(Def->isPHI() && "Invalid definition");
	  ValueTrackerResult Sources;

	  // Composing subregs is not supported yet, so a different subreg means
	  // there is nothing to report.
	  if (Def->getOperand(0).getSubReg() != DefSubReg)
	    return ValueTrackerResult();

	  // Incoming values sit at the odd operand indices.
	  const unsigned NumOps = Def->getNumOperands();
	  unsigned OpIdx = 1;
	  while (OpIdx < NumOps) {
	    auto &Incoming = Def->getOperand(OpIdx);
	    assert(Incoming.isReg() && "Invalid PHI instruction");
	    Sources.addSource(Incoming.getReg(), Incoming.getSubReg());
	    OpIdx += 2;
	  }

	  return Sources;
	}

	/// Dispatch on the kind of the current definition and return the source(s)
	/// of the tracked value.
	///
	/// COPY and bitcast definitions are always handled. REG_SEQUENCE,
	/// INSERT_SUBREG, EXTRACT_SUBREG, SUBREG_TO_REG and PHI are only handled
	/// when UseAdvancedTracking is set. Any other definition yields an empty
	/// result.
	ValueTrackerResult ValueTracker::getNextSourceImpl() {
	  assert(Def && "This method needs a valid definition");

	  // DefIdx must name either an explicit def (within the declared defs, or
	  // anywhere for variadic instructions) or an implicit operand.
	  assert(((Def->getOperand(DefIdx).isDef() &&
	           (DefIdx < Def->getDesc().getNumDefs() ||
	            Def->getDesc().isVariadic())) ||
	          Def->getOperand(DefIdx).isImplicit()) &&
	         "Invalid DefIdx");
	  if (Def->isCopy())
	    return getNextSourceFromCopy();
	  if (Def->isBitcast())
	    return getNextSourceFromBitcast();
	  // All the remaining cases involve "complex" instructions.
	  // Bail if we did not ask for the advanced tracking.
	  if (!UseAdvancedTracking)
	    return ValueTrackerResult();
	  if (Def->isRegSequence() || Def->isRegSequenceLike())
	    return getNextSourceFromRegSequence();
	  if (Def->isInsertSubreg() || Def->isInsertSubregLike())
	    return getNextSourceFromInsertSubreg();
	  if (Def->isExtractSubreg() || Def->isExtractSubregLike())
	    return getNextSourceFromExtractSubreg();
	  if (Def->isSubregToReg())
	    return getNextSourceFromSubregToReg();
	  if (Def->isPHI())
	    return getNextSourceFromPHI();
	  return ValueTrackerResult();
	}

	/// Return the next source(s) of the tracked value and advance the tracker
	/// one step up the use-def chain.
	///
	/// \returns an empty result once the chain has been cut (Def is null). The
	///          chain is cut when the current definition yields no valid
	///          result, when the result has more than one source, when the
	///          single source is a physical register, or when the single source
	///          is a virtual register without a definition.
	ValueTrackerResult ValueTracker::getNextSource() {
	  // If we reach a point where we cannot move up in the use-def chain,
	  // there is nothing we can get.
	  if (!Def)
	    return ValueTrackerResult();

	  ValueTrackerResult Res = getNextSourceImpl();
	  if (Res.isValid()) {
	    // Update definition, definition index, and subregister for the
	    // next call of getNextSource.
	    // Update the current register.
	    bool OneRegSrc = Res.getNumSources() == 1;
	    if (OneRegSrc)
	      Reg = Res.getSrcReg(0);
	    // Update the result before moving up in the use-def chain
	    // with the instruction containing the last found sources.
	    Res.setInst(Def);

	    // If we can still move up in the use-def chain, move to the next
	    // definition.
	    if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) {
	      Def = MRI.getVRegDef(Reg);
	      // A virtual register may have no definition at all. In that case
	      // def_begin(Reg) is the end iterator and must not be queried; Def is
	      // left null so the next call bails out early.
	      if (Def) {
	        DefIdx = MRI.def_begin(Reg).getOperandNo();
	        DefSubReg = Res.getSrcSubReg(0);
	      }
	      return Res;
	    }
	  }
	  // If we end up here, this means we will not be able to find another source
	  // for the next iteration. Make sure any new call to getNextSource bails out
	  // early by cutting the use-def chain.
	  Def = nullptr;
	  return Res;
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 328362)
	@@ -1,17733 +1,17751 @@
	//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
	// both before and after the DAG is legalized.
	//
	// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
	// primarily intended to handle simplification opportunities that are implicit
	// in the LLVM IR and exposed by the various codegen lowering phases.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/CodeGen/DAGCombine.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <functional>
	#include <iterator>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "dagcombine"

	// Pass statistics; each counter carries its own description string.
	STATISTIC(NodesCombined , "Number of dag nodes combined");
	STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
	STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
	STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
	STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
	STATISTIC(SlicedLoads, "Number of load sliced");

	// Hidden flag "combiner-global-alias-analysis"; no explicit cl::init is given.
	static cl::opt<bool>
	CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
	cl::desc("Enable DAG combiner's use of IR alias analysis"));

	// Hidden flag "combiner-use-tbaa"; initialized to true.
	static cl::opt<bool>
	UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
	cl::desc("Enable DAG combiner's use of TBAA"));

	// Only declared when NDEBUG is not defined: hidden string option
	// "combiner-aa-only-func" naming a single function.
	#ifndef NDEBUG
	static cl::opt<std::string>
	CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
	cl::desc("Only use DAG-combiner alias analysis in this"
	" function"));
	#endif

	/// Hidden option to stress test load slicing, i.e., when this option
	/// is enabled, load slicing bypasses most of its profitability guards.
	/// Initialized to false.
	static cl::opt<bool>
	StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
	cl::desc("Bypass the profitability model of load slicing"),
	cl::init(false));

	// Hidden flag "combiner-split-load-index"; initialized to true.
	static cl::opt<bool>
	MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
	cl::desc("DAG combiner may split indexing from loads"));

	namespace {

	/// Worklist-driven combiner over a SelectionDAG.
	///
	/// The class owns the worklist bookkeeping (Worklist / WorklistMap /
	/// CombinedNodes), the replacement primitives (CombineTo,
	/// CommitTargetLoweringOpt, deleteAndRecombine) and the per-opcode visit*
	/// routines that are declared here and defined out of line.
	class DAGCombiner {
	  SelectionDAG &DAG;
	  const TargetLowering &TLI;
	  CombineLevel Level;
	  CodeGenOpt::Level OptLevel;
	  bool LegalOperations = false;
	  bool LegalTypes = false;
	  // Set in the constructor from the current function's optForSize().
	  bool ForCodeSize;

	  /// \brief Worklist of all of the nodes that need to be simplified.
	  ///
	  /// This must behave as a stack -- new nodes to process are pushed onto the
	  /// back and when processing we pop off of the back.
	  ///
	  /// The worklist will not contain duplicates but may contain null entries
	  /// due to nodes being deleted from the underlying DAG.
	  SmallVector<SDNode *, 64> Worklist;

	  /// \brief Mapping from an SDNode to its position on the worklist.
	  ///
	  /// This is used to find and remove nodes from the worklist (by nulling
	  /// them) when they are deleted from the underlying DAG. It relies on
	  /// stable indices of nodes within the worklist.
	  DenseMap<SDNode *, unsigned> WorklistMap;

	  /// \brief Set of nodes which have been combined (at least once).
	  ///
	  /// This is used to allow us to reliably add any operands of a DAG node
	  /// which have not yet been combined to the worklist.
	  SmallPtrSet<SDNode *, 32> CombinedNodes;

	  // AA - Used for DAG load/store alias analysis.
	  AliasAnalysis *AA;

	  /// When an instruction is simplified, add all users of the instruction to
	  /// the work lists because they might get more simplified now.
	  void AddUsersToWorklist(SDNode *N) {
	    for (SDNode *Node : N->uses())
	      AddToWorklist(Node);
	  }

	  /// Call the node-specific routine that folds each particular type of node.
	  SDValue visit(SDNode *N);

	public:
	  /// Bind the combiner to \p D, starting at the BeforeLegalizeTypes level,
	  /// and record the widest legal simple value type in
	  /// MaximumLegalStoreInBits.
	  DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
	      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
	        OptLevel(OL), AA(AA) {
	    ForCodeSize = DAG.getMachineFunction().getFunction().optForSize();

	    // Scan every simple value type (except MVT::Other) that the target
	    // reports as legal and keep the largest bit size seen.
	    MaximumLegalStoreInBits = 0;
	    for (MVT VT : MVT::all_valuetypes())
	      if (EVT(VT).isSimple() && VT != MVT::Other &&
	          TLI.isTypeLegal(EVT(VT)) &&
	          VT.getSizeInBits() >= MaximumLegalStoreInBits)
	        MaximumLegalStoreInBits = VT.getSizeInBits();
	  }

	  /// Add to the worklist making sure its instance is at the back (next to be
	  /// processed.)
	  void AddToWorklist(SDNode *N) {
	    assert(N->getOpcode() != ISD::DELETED_NODE &&
	           "Deleted Node added to Worklist");

	    // Skip handle nodes as they can't usefully be combined and confuse the
	    // zero-use deletion strategy.
	    if (N->getOpcode() == ISD::HANDLENODE)
	      return;

	    // Only push when the node was not already recorded in the map; the map
	    // value is the index the node is about to occupy.
	    if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
	      Worklist.push_back(N);
	  }

	  /// Remove all instances of N from the worklist.
	  void removeFromWorklist(SDNode *N) {
	    CombinedNodes.erase(N);

	    auto It = WorklistMap.find(N);
	    if (It == WorklistMap.end())
	      return; // Not in the worklist.

	    // Null out the entry rather than erasing it to avoid a linear operation.
	    Worklist[It->second] = nullptr;
	    WorklistMap.erase(It);
	  }

	  void deleteAndRecombine(SDNode *N);
	  bool recursivelyDeleteUnusedNodes(SDNode *N);

	  /// Replaces all uses of the results of one DAG node with new values.
	  /// \p To points at \p NumTo replacement values, one per result of \p N.
	  SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
	                    bool AddTo = true);

	  /// Replaces all uses of the results of one DAG node with new values.
	  /// Single-result convenience form.
	  SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
	    return CombineTo(N, &Res, 1, AddTo);
	  }

	  /// Replaces all uses of the results of one DAG node with new values.
	  /// Two-result convenience form.
	  SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
	                    bool AddTo = true) {
	    SDValue To[] = { Res0, Res1 };
	    return CombineTo(N, To, 2, AddTo);
	  }

	  void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);

	private:
	  // Largest bit size among the legal simple value types; computed once in
	  // the constructor.
	  unsigned MaximumLegalStoreInBits;

	  /// Check the specified integer node value to see if it can be simplified or
	  /// if things it uses can be simplified by bit propagation.
	  /// If so, return true. This overload demands every bit of the scalar
	  /// value.
	  bool SimplifyDemandedBits(SDValue Op) {
	    unsigned BitWidth = Op.getScalarValueSizeInBits();
	    APInt Demanded = APInt::getAllOnesValue(BitWidth);
	    return SimplifyDemandedBits(Op, Demanded);
	  }

	  bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);

	  bool CombineToPreIndexedLoadStore(SDNode *N);
	  bool CombineToPostIndexedLoadStore(SDNode *N);
	  SDValue SplitIndexingFromLoad(LoadSDNode *LD);
	  bool SliceUpLoad(SDNode *N);

	  /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
	  /// load.
	  ///
	  /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
	  /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
	  /// \param EltNo index of the vector element to load.
	  /// \param OriginalLoad load that EVE came from to be replaced.
	  /// \returns EVE on success SDValue() on failure.
	  SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
	      SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad);
	  void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
	  SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
	  SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
	  SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
	  SDValue PromoteIntBinOp(SDValue Op);
	  SDValue PromoteIntShiftOp(SDValue Op);
	  SDValue PromoteExtend(SDValue Op);
	  bool PromoteLoad(SDValue Op);

	  void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, SDValue Trunc,
	                       SDValue ExtLoad, const SDLoc &DL,
	                       ISD::NodeType ExtType);

	  /// Call the node-specific routine that knows how to fold each
	  /// particular type of node. If that doesn't do anything, try the
	  /// target-specific DAG combines.
	  SDValue combine(SDNode *N);

	  // Visitation implementation - Implement dag node combining for different
	  // node types. The semantics are as follows:
	  // Return Value:
	  // SDValue.getNode() == 0 - No change was made
	  // SDValue.getNode() == N - N was replaced, is dead and has been handled.
	  // otherwise - N should be replaced by the returned Operand.
	  //
	  SDValue visitTokenFactor(SDNode *N);
	  SDValue visitMERGE_VALUES(SDNode *N);
	  SDValue visitADD(SDNode *N);
	  SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
	  SDValue visitSUB(SDNode *N);
	  SDValue visitADDC(SDNode *N);
	  SDValue visitUADDO(SDNode *N);
	  SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
	  SDValue visitSUBC(SDNode *N);
	  SDValue visitUSUBO(SDNode *N);
	  SDValue visitADDE(SDNode *N);
	  SDValue visitADDCARRY(SDNode *N);
	  SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
	  SDValue visitSUBE(SDNode *N);
	  SDValue visitSUBCARRY(SDNode *N);
	  SDValue visitMUL(SDNode *N);
	  SDValue useDivRem(SDNode *N);
	  SDValue visitSDIV(SDNode *N);
	  SDValue visitUDIV(SDNode *N);
	  SDValue visitREM(SDNode *N);
	  SDValue visitMULHU(SDNode *N);
	  SDValue visitMULHS(SDNode *N);
	  SDValue visitSMUL_LOHI(SDNode *N);
	  SDValue visitUMUL_LOHI(SDNode *N);
	  SDValue visitSMULO(SDNode *N);
	  SDValue visitUMULO(SDNode *N);
	  SDValue visitIMINMAX(SDNode *N);
	  SDValue visitAND(SDNode *N);
	  SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference);
	  SDValue visitOR(SDNode *N);
	  SDValue visitORLike(SDValue N0, SDValue N1, SDNode *LocReference);
	  SDValue visitXOR(SDNode *N);
	  SDValue SimplifyVBinOp(SDNode *N);
	  SDValue visitSHL(SDNode *N);
	  SDValue visitSRA(SDNode *N);
	  SDValue visitSRL(SDNode *N);
	  SDValue visitRotate(SDNode *N);
	  SDValue visitABS(SDNode *N);
	  SDValue visitBSWAP(SDNode *N);
	  SDValue visitBITREVERSE(SDNode *N);
	  SDValue visitCTLZ(SDNode *N);
	  SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
	  SDValue visitCTTZ(SDNode *N);
	  SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
	  SDValue visitCTPOP(SDNode *N);
	  SDValue visitSELECT(SDNode *N);
	  SDValue visitVSELECT(SDNode *N);
	  SDValue visitSELECT_CC(SDNode *N);
	  SDValue visitSETCC(SDNode *N);
	  SDValue visitSETCCE(SDNode *N);
	  SDValue visitSETCCCARRY(SDNode *N);
	  SDValue visitSIGN_EXTEND(SDNode *N);
	  SDValue visitZERO_EXTEND(SDNode *N);
	  SDValue visitANY_EXTEND(SDNode *N);
	  SDValue visitAssertExt(SDNode *N);
	  SDValue visitSIGN_EXTEND_INREG(SDNode *N);
	  SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
	  SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
	  SDValue visitTRUNCATE(SDNode *N);
	  SDValue visitBITCAST(SDNode *N);
	  SDValue visitBUILD_PAIR(SDNode *N);
	  SDValue visitFADD(SDNode *N);
	  SDValue visitFSUB(SDNode *N);
	  SDValue visitFMUL(SDNode *N);
	  SDValue visitFMA(SDNode *N);
	  SDValue visitFDIV(SDNode *N);
	  SDValue visitFREM(SDNode *N);
	  SDValue visitFSQRT(SDNode *N);
	  SDValue visitFCOPYSIGN(SDNode *N);
	  SDValue visitSINT_TO_FP(SDNode *N);
	  SDValue visitUINT_TO_FP(SDNode *N);
	  SDValue visitFP_TO_SINT(SDNode *N);
	  SDValue visitFP_TO_UINT(SDNode *N);
	  SDValue visitFP_ROUND(SDNode *N);
	  SDValue visitFP_ROUND_INREG(SDNode *N);
	  SDValue visitFP_EXTEND(SDNode *N);
	  SDValue visitFNEG(SDNode *N);
	  SDValue visitFABS(SDNode *N);
	  SDValue visitFCEIL(SDNode *N);
	  SDValue visitFTRUNC(SDNode *N);
	  SDValue visitFFLOOR(SDNode *N);
	  SDValue visitFMINNUM(SDNode *N);
	  SDValue visitFMAXNUM(SDNode *N);
	  SDValue visitBRCOND(SDNode *N);
	  SDValue visitBR_CC(SDNode *N);
	  SDValue visitLOAD(SDNode *N);

	  SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
	  SDValue replaceStoreOfFPConstant(StoreSDNode *ST);

	  SDValue visitSTORE(SDNode *N);
	  SDValue visitINSERT_VECTOR_ELT(SDNode *N);
	  SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
	  SDValue visitBUILD_VECTOR(SDNode *N);
	  SDValue visitCONCAT_VECTORS(SDNode *N);
	  SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
	  SDValue visitVECTOR_SHUFFLE(SDNode *N);
	  SDValue visitSCALAR_TO_VECTOR(SDNode *N);
	  SDValue visitINSERT_SUBVECTOR(SDNode *N);
	  SDValue visitMLOAD(SDNode *N);
	  SDValue visitMSTORE(SDNode *N);
	  SDValue visitMGATHER(SDNode *N);
	  SDValue visitMSCATTER(SDNode *N);
	  SDValue visitFP_TO_FP16(SDNode *N);
	  SDValue visitFP16_TO_FP(SDNode *N);

	  SDValue visitFADDForFMACombine(SDNode *N);
	  SDValue visitFSUBForFMACombine(SDNode *N);
	  SDValue visitFMULForFMADistributiveCombine(SDNode *N);

	  SDValue XformToShuffleWithZero(SDNode *N);
	  SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue LHS,
	                         SDValue RHS);

	  SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);

	  SDValue foldSelectOfConstants(SDNode *N);
	  SDValue foldVSelectOfConstants(SDNode *N);
	  SDValue foldBinOpIntoSelect(SDNode *BO);
	  bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
	  SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
	  SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
	  SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
	                           SDValue N2, SDValue N3, ISD::CondCode CC,
	                           bool NotExtCompare = false);
	  SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
	                                 SDValue N2, SDValue N3, ISD::CondCode CC);
	  SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
	                            const SDLoc &DL);
	  SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
	                        const SDLoc &DL, bool foldBooleans = true);

	  bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
	                         SDValue &CC) const;
	  bool isOneUseSetCC(SDValue N) const;

	  SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
	                                     unsigned HiOp);
	  SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
	  SDValue CombineExtLoad(SDNode *N);
	  SDValue combineRepeatedFPDivisors(SDNode *N);
	  SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
	  SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
	  SDValue BuildSDIV(SDNode *N);
	  SDValue BuildSDIVPow2(SDNode *N);
	  SDValue BuildUDIV(SDNode *N);
	  SDValue BuildLogBase2(SDValue Op, const SDLoc &DL);
	  SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
	  SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
	  SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
	  SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
	  SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
	                              SDNodeFlags Flags, bool Reciprocal);
	  SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
	                              SDNodeFlags Flags, bool Reciprocal);
	  SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
	                             bool DemandHighBits = true);
	  SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
	  SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
	                            SDValue InnerPos, SDValue InnerNeg,
	                            unsigned PosOpcode, unsigned NegOpcode,
	                            const SDLoc &DL);
	  SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
	  SDValue MatchLoadCombine(SDNode *N);
	  SDValue ReduceLoadWidth(SDNode *N);
	  SDValue ReduceLoadOpStoreWidth(SDNode *N);
	  SDValue splitMergedValStore(StoreSDNode *ST);
	  SDValue TransformFPLoadStorePair(SDNode *N);
	  SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
	  SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
	  SDValue reduceBuildVecToShuffle(SDNode *N);
	  SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
	                                ArrayRef<int> VectorMask, SDValue VecIn1,
	                                SDValue VecIn2, unsigned LeftIdx);
	  SDValue matchVSelectOpSizesWithSetCC(SDNode *N);

	  /// Walk up chain skipping non-aliasing memory nodes,
	  /// looking for aliasing nodes and adding them to the Aliases vector.
	  void GatherAllAliases(SDNode *N, SDValue OriginalChain,
	                        SmallVectorImpl<SDValue> &Aliases);

	  /// Return true if there is any possibility that the two addresses overlap.
	  bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const;

	  /// Walk up chain skipping non-aliasing memory nodes, looking for a better
	  /// chain (aliasing node.)
	  SDValue FindBetterChain(SDNode *N, SDValue Chain);

	  /// Try to replace a store and any possibly adjacent stores on
	  /// consecutive chains with better chains. Return true only if St is
	  /// replaced.
	  ///
	  /// Notice that other chains may still be replaced even if the function
	  /// returns false.
	  bool findBetterNeighborChains(StoreSDNode *St);

	  /// Match "(X shl/srl V1) & V2" where V2 may not be present.
	  bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask);

	  /// Holds a pointer to an LSBaseSDNode as well as information on where it
	  /// is located in a sequence of memory operations connected by a chain.
	  struct MemOpLink {
	    // Ptr to the mem node.
	    LSBaseSDNode *MemNode;

	    // Offset from the base ptr.
	    int64_t OffsetFromBase;

	    MemOpLink(LSBaseSDNode *N, int64_t Offset)
	        : MemNode(N), OffsetFromBase(Offset) {}
	  };

	  /// This is a helper function for visitMUL to check the profitability
	  /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
	  /// MulNode is the original multiply, AddNode is (add x, c1),
	  /// and ConstNode is c2.
	  bool isMulAddWithConstProfitable(SDNode *MulNode,
	                                   SDValue &AddNode,
	                                   SDValue &ConstNode);

	  /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
	  /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
	  /// the type of the loaded value to be extended.
	  bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
	                        EVT LoadResultTy, EVT &ExtVT);

	  /// Helper function to calculate whether the given Load can have its
	  /// width reduced to ExtVT.
	  bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
	                         EVT &ExtVT, unsigned ShAmt = 0);

	  /// Used by BackwardsPropagateMask to find suitable loads.
	  bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
	                         SmallPtrSetImpl<SDNode*> &NodeWithConsts,
	                         ConstantSDNode *Mask, SDNode *&UncombinedNode);
	  /// Attempt to propagate a given AND node back to load leaves so that they
	  /// can be combined into narrow loads.
	  bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);

	  /// Helper function for MergeConsecutiveStores which merges the
	  /// component store chains.
	  SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
	                              unsigned NumStores);

	  /// This is a helper function for MergeConsecutiveStores. When the
	  /// source elements of the consecutive stores are all constants or
	  /// all extracted vector elements, try to merge them into one
	  /// larger store introducing bitcasts if necessary. \return True
	  /// if a merged store was created.
	  bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
	                                       EVT MemVT, unsigned NumStores,
	                                       bool IsConstantSrc, bool UseVector,
	                                       bool UseTrunc);

	  /// This is a helper function for MergeConsecutiveStores. Stores
	  /// that potentially may be merged with St are placed in
	  /// StoreNodes.
	  void getStoreMergeCandidates(StoreSDNode *St,
	                               SmallVectorImpl<MemOpLink> &StoreNodes);

	  /// Helper function for MergeConsecutiveStores. Checks if
	  /// candidate stores have indirect dependency through their
	  /// operands. \return True if safe to merge.
	  bool checkMergeStoreCandidatesForDependencies(
	      SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);

	  /// Merge consecutive store operations into a wide store.
	  /// This optimization uses wide integers or vectors when possible.
	  /// NOTE(review): the return type is bool, so the historical wording
	  /// "number of stores that were merged" cannot be taken literally;
	  /// presumably true means at least one merged store was created — confirm
	  /// against the definition.
	  bool MergeConsecutiveStores(StoreSDNode *N);

	  /// \brief Try to transform a truncation where C is a constant:
	  /// (trunc (and X, C)) -> (and (trunc X), (trunc C))
	  ///
	  /// \p N needs to be a truncation and its first operand an AND. Other
	  /// requirements are checked by the function (e.g. that trunc is
	  /// single-use) and if missed an empty SDValue is returned.
	  SDValue distributeTruncateThroughAnd(SDNode *N);

	public:
	  /// Runs the dag combiner on all nodes in the work list
	  void Run(CombineLevel AtLevel);

	  SelectionDAG &getDAG() const { return DAG; }

	  /// Returns a type large enough to hold any valid shift amount - before type
	  /// legalization these can be huge.
	  EVT getShiftAmountTy(EVT LHSTy) {
	    assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
	    // Vector shifts use the vector type itself for the amount.
	    if (LHSTy.isVector())
	      return LHSTy;
	    auto &DL = DAG.getDataLayout();
	    return LegalTypes ? TLI.getScalarShiftAmountTy(DL, LHSTy)
	                      : TLI.getPointerTy(DL);
	  }

	  /// This method returns true if we are running before type legalization or
	  /// if the specified VT is legal.
	  bool isTypeLegal(const EVT &VT) {
	    if (!LegalTypes) return true;
	    return TLI.isTypeLegal(VT);
	  }

	  /// Convenience wrapper around TargetLowering::getSetCCResultType
	  EVT getSetCCResultType(EVT VT) const {
	    return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	  }
	};

	/// This class is a DAGUpdateListener that removes any deleted
	/// nodes from the worklist.
	class WorklistRemover : public SelectionDAG::DAGUpdateListener {
	DAGCombiner &DC;

	public:
	explicit WorklistRemover(DAGCombiner &dc)
	: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}

	void NodeDeleted(SDNode N, SDNode E) override {
	DC.removeFromWorklist(N);
	}
	};

	} // end anonymous namespace

	//===----------------------------------------------------------------------===//
	// TargetLowering::DAGCombinerInfo implementation
	//===----------------------------------------------------------------------===//

	/// Forward a worklist insertion to the DAGCombiner behind the opaque DC
	/// handle.
	void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
	  DAGCombiner *Combiner = static_cast<DAGCombiner *>(DC);
	  Combiner->AddToWorklist(N);
	}

	/// Forward a multi-result replacement to the underlying DAGCombiner, passing
	/// the array's first element address and its length.
	SDValue TargetLowering::DAGCombinerInfo::
	CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
	  DAGCombiner *Combiner = static_cast<DAGCombiner *>(DC);
	  const SDValue *Replacements = &To[0];
	  return Combiner->CombineTo(N, Replacements, To.size(), AddTo);
	}

	/// Forward a single-result replacement to the underlying DAGCombiner.
	SDValue TargetLowering::DAGCombinerInfo::
	CombineTo(SDNode *N, SDValue Res, bool AddTo) {
	  DAGCombiner *Combiner = static_cast<DAGCombiner *>(DC);
	  return Combiner->CombineTo(N, Res, AddTo);
	}

	/// Forward a two-result replacement to the underlying DAGCombiner.
	SDValue TargetLowering::DAGCombinerInfo::
	CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
	  DAGCombiner *Combiner = static_cast<DAGCombiner *>(DC);
	  return Combiner->CombineTo(N, Res0, Res1, AddTo);
	}

	/// Forward a target-lowering optimization commit to the underlying
	/// DAGCombiner.
	void TargetLowering::DAGCombinerInfo::
	CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
	  DAGCombiner *Combiner = static_cast<DAGCombiner *>(DC);
	  Combiner->CommitTargetLoweringOpt(TLO);
	}

	//===----------------------------------------------------------------------===//
	// Helper Functions
	//===----------------------------------------------------------------------===//

	/// Drop \p N from the worklist, queue its operands that may now be dead or
	/// further simplifiable, then delete \p N from the DAG.
	void DAGCombiner::deleteAndRecombine(SDNode *N) {
	  removeFromWorklist(N);

	  // If the operands of this node are only used by the node, they will now be
	  // dead. Make sure to re-visit them and recursively delete dead nodes.
	  for (const SDValue &Op : N->ops()) {
	    // For an operand generating multiple values, one of the values may
	    // become dead allowing further simplification (e.g. split index
	    // arithmetic from an indexed load).
	    if (Op->hasOneUse() || Op->getNumValues() > 1)
	      AddToWorklist(Op.getNode());
	  }

	  DAG.DeleteNode(N);
	}

	/// Return 1 if we can compute the negated form of the specified expression for
	/// the same cost as the expression itself, or 2 if we can compute the negated
	/// form more cheaply than the expression itself. Return 0 when the
	/// expression cannot be negated for free.
	///
	/// \p Depth is the current recursion depth; the search gives up past 6.
	static char isNegatibleForFree(SDValue Op, bool LegalOperations,
	                               const TargetLowering &TLI,
	                               const TargetOptions *Options,
	                               unsigned Depth = 0) {
	  // fneg is removable even if it has multiple uses.
	  if (Op.getOpcode() == ISD::FNEG) return 2;

	  // Don't allow anything with multiple uses.
	  if (!Op.hasOneUse()) return 0;

	  // Don't recurse exponentially.
	  if (Depth > 6) return 0;

	  switch (Op.getOpcode()) {
	  default: return 0;
	  case ISD::ConstantFP: {
	    if (!LegalOperations)
	      return 1;

	    // Don't invert constant FP values after legalization unless the target says
	    // the negated constant is legal.
	    EVT VT = Op.getValueType();
	    return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
	           TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT);
	  }
	  case ISD::FADD:
	    // FIXME: determine better conditions for this xform.
	    if (!Options->UnsafeFPMath) return 0;

	    // After operation legalization, it might not be legal to create new FSUBs.
	    if (LegalOperations &&
	        !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType()))
	      return 0;

	    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
	    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
	                                    Options, Depth + 1))
	      return V;
	    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
	    return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
	                              Depth + 1);
	  case ISD::FSUB:
	    // We can't turn -(A-B) into B-A when we honor signed zeros.
	    if (!Options->NoSignedZerosFPMath &&
	        !Op.getNode()->getFlags().hasNoSignedZeros())
	      return 0;

	    // fold (fneg (fsub A, B)) -> (fsub B, A)
	    return 1;

	  case ISD::FMUL:
	  case ISD::FDIV:
	    if (Options->HonorSignDependentRoundingFPMath()) return 0;

	    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
	    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
	                                    Options, Depth + 1))
	      return V;

	    return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
	                              Depth + 1);

	  case ISD::FP_EXTEND:
	  case ISD::FP_ROUND:
	  case ISD::FSIN:
	    // These are negatible exactly when their first operand is.
	    return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
	                              Depth + 1);
	  }
	}

	/// Build the negated form of \p Op. The caller is expected to have received
	/// a non-zero answer from isNegatibleForFree for the same operand; the cases
	/// handled here mirror that predicate.
	static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
	                                    bool LegalOperations, unsigned Depth = 0) {
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const unsigned Opcode = Op.getOpcode();

	  // An existing fneg is simply peeled off, however many uses it has.
	  if (Opcode == ISD::FNEG)
	    return Op.getOperand(0);

	  // Everything else must be single-use and within the recursion budget.
	  assert(Op.hasOneUse() && "Unknown reuse!");
	  assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");

	  const SDNodeFlags Flags = Op.getNode()->getFlags();
	  const SDLoc Loc(Op);
	  const EVT ResultVT = Op.getValueType();
	  const unsigned NextDepth = Depth + 1;

	  switch (Opcode) {
	  default:
	    llvm_unreachable("Unknown code");

	  case ISD::ConstantFP: {
	    // Flip the sign of a copy of the constant.
	    APFloat Negated = cast<ConstantFPSDNode>(Op)->getValueAPF();
	    Negated.changeSign();
	    return DAG.getConstantFP(Negated, Loc, ResultVT);
	  }

	  case ISD::FADD: {
	    // FIXME: determine better conditions for this xform.
	    assert(Options.UnsafeFPMath);

	    SDValue A = Op.getOperand(0);
	    SDValue B = Op.getOperand(1);

	    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
	    if (isNegatibleForFree(A, LegalOperations, DAG.getTargetLoweringInfo(),
	                           &Options, NextDepth)) {
	      SDValue NegA = GetNegatedExpression(A, DAG, LegalOperations, NextDepth);
	      return DAG.getNode(ISD::FSUB, Loc, ResultVT, NegA, B, Flags);
	    }

	    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
	    SDValue NegB = GetNegatedExpression(B, DAG, LegalOperations, NextDepth);
	    return DAG.getNode(ISD::FSUB, Loc, ResultVT, NegB, A, Flags);
	  }

	  case ISD::FSUB: {
	    // fold (fneg (fsub 0, B)) -> B
	    ConstantFPSDNode *LHSConst = dyn_cast<ConstantFPSDNode>(Op.getOperand(0));
	    if (LHSConst && LHSConst->isZero())
	      return Op.getOperand(1);

	    // fold (fneg (fsub A, B)) -> (fsub B, A)
	    return DAG.getNode(ISD::FSUB, Loc, ResultVT, Op.getOperand(1),
	                       Op.getOperand(0), Flags);
	  }

	  case ISD::FMUL:
	  case ISD::FDIV: {
	    assert(!Options.HonorSignDependentRoundingFPMath());

	    SDValue X = Op.getOperand(0);
	    SDValue Y = Op.getOperand(1);

	    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
	    if (isNegatibleForFree(X, LegalOperations, DAG.getTargetLoweringInfo(),
	                           &Options, NextDepth)) {
	      SDValue NegX = GetNegatedExpression(X, DAG, LegalOperations, NextDepth);
	      return DAG.getNode(Opcode, Loc, ResultVT, NegX, Y, Flags);
	    }

	    // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
	    SDValue NegY = GetNegatedExpression(Y, DAG, LegalOperations, NextDepth);
	    return DAG.getNode(Opcode, Loc, ResultVT, X, NegY, Flags);
	  }

	  case ISD::FP_EXTEND:
	  case ISD::FSIN: {
	    // Push the negation through the unary operation.
	    SDValue NegSrc =
	        GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, NextDepth);
	    return DAG.getNode(Opcode, Loc, ResultVT, NegSrc);
	  }

	  case ISD::FP_ROUND: {
	    // Same as above, keeping the node's second operand unchanged.
	    SDValue NegSrc =
	        GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, NextDepth);
	    return DAG.getNode(ISD::FP_ROUND, Loc, ResultVT, NegSrc, Op.getOperand(1));
	  }
	  }
	}

	// Bring two APInts to a common bit width so that operations requiring equal
	// sizes can be applied: both are zero-extended (or left alone) to the wider
	// of the two widths plus Offset. The Offset gives callers headroom so that
	// later arithmetic does not overflow.
	static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
	  const unsigned WiderWidth = std::max(LHS.getBitWidth(), RHS.getBitWidth());
	  const unsigned TargetWidth = Offset + WiderWidth;
	  LHS = LHS.zextOrSelf(TargetWidth);
	  RHS = RHS.zextOrSelf(TargetWidth);
	}

	// Return true if this node is a setcc, or is a select_cc
	// that selects between the target values used for true and false, making it
	// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
	// the appropriate nodes based on the type of node we are checking. This
	// simplifies life a bit for the callers.
	bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
	                                    SDValue &CC) const {
	  // Plain setcc: operands 0/1 are compared, operand 2 is the condition.
	  if (N.getOpcode() == ISD::SETCC) {
	    LHS = N.getOperand(0);
	    RHS = N.getOperand(1);
	    CC = N.getOperand(2);
	    return true;
	  }

	  // Otherwise only a select_cc whose operands 2 and 3 are the target's
	  // constant true and false values qualifies.
	  if (N.getOpcode() != ISD::SELECT_CC ||
	      !TLI.isConstTrueVal(N.getOperand(2).getNode()) ||
	      !TLI.isConstFalseVal(N.getOperand(3).getNode()))
	    return false;

	  // Reject targets whose boolean contents are undefined for this type.
	  if (TLI.getBooleanContents(N.getValueType()) ==
	      TargetLowering::UndefinedBooleanContent)
	    return false;

	  // select_cc: operands 0/1 are compared, operand 4 is the condition.
	  LHS = N.getOperand(0);
	  RHS = N.getOperand(1);
	  CC = N.getOperand(4);
	  return true;
	}

	/// True when \p N is a SetCC-equivalent operation (see isSetCCEquivalent)
	/// whose node has exactly one use, which lets callers invert the operation
	/// for free when that is profitable.
	bool DAGCombiner::isOneUseSetCC(SDValue N) const {
	  // The extracted operands are not needed here; only the match result is.
	  SDValue CmpLHS, CmpRHS, CondCode;
	  return isSetCCEquivalent(N, CmpLHS, CmpRHS, CondCode) &&
	         N.getNode()->hasOneUse();
	}

	// \brief Returns the node behind N when N is either a constant float or a
	// build vector of constant floats; returns nullptr otherwise.
	static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
	  const bool IsFPConstant =
	      isa<ConstantFPSDNode>(N) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(N.getNode());
	  return IsFPConstant ? N.getNode() : nullptr;
	}

	// Determines if it is a constant integer or a build vector of constant
	// integers (and undefs).
	// Do not permit build vector implicit truncation.
	// When NoOpaques is set, opaque constants are rejected as well.
	static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
	  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
	    return !(Const->isOpaque() && NoOpaques);
	  if (N.getOpcode() != ISD::BUILD_VECTOR)
	    return false;
	  unsigned BitWidth = N.getScalarValueSizeInBits();
	  for (const SDValue &Op : N->op_values()) {
	    // Undef elements are allowed and skipped.
	    if (Op.isUndef())
	      continue;
	    ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
	    // Each defined element must be a constant of exactly the scalar width
	    // (no implicit truncation) and, if requested, not opaque.
	    if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
	        (Const->isOpaque() && NoOpaques))
	      return false;
	  }
	  return true;
	}

	// True when N is a constant null integer, or a vector splat of a constant
	// null integer (with no undefs).
	// Implicit build-vector truncation is harmless for a null value, so the
	// element width is not checked.
	static bool isNullConstantOrNullSplatConstant(SDValue N) {
	  ConstantSDNode *SplatValue = isConstOrConstSplat(N);
	  return SplatValue && SplatValue->isNullValue();
	}

	// True when N is the constant integer one, or a vector splat of the constant
	// integer one (with no undefs).
	// The constant must have exactly the scalar width of N, i.e. build vector
	// implicit truncation is not permitted.
	static bool isOneConstantOrOneSplatConstant(SDValue N) {
	  const unsigned ExpectedWidth = N.getScalarValueSizeInBits();
	  ConstantSDNode *SplatValue = isConstOrConstSplat(N);
	  if (!SplatValue)
	    return false;
	  return SplatValue->isOne() &&
	         SplatValue->getAPIntValue().getBitWidth() == ExpectedWidth;
	}

	// True when N is an all-ones constant integer, or a vector splat of an
	// all-ones constant integer (with no undefs).
	// The constant must have exactly the scalar width of N, i.e. build vector
	// implicit truncation is not permitted.
	static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) {
	  const unsigned ExpectedWidth = N.getScalarValueSizeInBits();
	  ConstantSDNode *SplatValue = isConstOrConstSplat(N);
	  if (!SplatValue)
	    return false;
	  return SplatValue->isAllOnesValue() &&
	         SplatValue->getAPIntValue().getBitWidth() == ExpectedWidth;
	}

	// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
	// undef's. Both integer-constant and FP-constant build vectors qualify.
	static bool isAnyConstantBuildVector(const SDNode *N) {
	  return ISD::isBuildVectorOfConstantSDNodes(N) ||
	         ISD::isBuildVectorOfConstantFPSDNodes(N);
	}

	// Attempt to match a unary predicate against a scalar/splat constant or
	// every element of a constant BUILD_VECTOR.
	// Returns true only if Match accepted the scalar constant, or accepted every
	// element of the build vector.
	static bool matchUnaryPredicate(SDValue Op,
	                                std::function<bool(ConstantSDNode *)> Match) {
	  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
	    return Match(Cst);

	  if (ISD::BUILD_VECTOR != Op.getOpcode())
	    return false;

	  EVT SVT = Op.getValueType().getScalarType();
	  for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	    auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
	    // Every element must be a constant of the vector's scalar type (this
	    // also rejects undef elements) that satisfies the predicate.
	    if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
	      return false;
	  }
	  return true;
	}

	// Attempt to match a binary predicate against a pair of scalar/splat constants
	// or every element of a pair of constant BUILD_VECTORs.
	// Returns true only if Match accepted the scalar pair, or accepted every
	// corresponding element pair of the two build vectors.
	static bool matchBinaryPredicate(
	    SDValue LHS, SDValue RHS,
	    std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
	  if (LHS.getValueType() != RHS.getValueType())
	    return false;

	  if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
	    if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
	      return Match(LHSCst, RHSCst);

	  if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
	      ISD::BUILD_VECTOR != RHS.getOpcode())
	    return false;

	  EVT SVT = LHS.getValueType().getScalarType();
	  for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
	    auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
	    auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
	    // Both elements must be constants...
	    if (!LHSCst || !RHSCst)
	      return false;
	    // ...of the vector's scalar type and of the same type as each other...
	    if (LHSCst->getValueType(0) != SVT ||
	        LHSCst->getValueType(0) != RHSCst->getValueType(0))
	      return false;
	    // ...and the pair must satisfy the predicate.
	    if (!Match(LHSCst, RHSCst))
	      return false;
	  }
	  return true;
	}

	/// Try to reassociate (Opc N0, N1) when one operand is itself an Opc node
	/// whose second operand is a constant integer (or constant integer build
	/// vector). Returns the rewritten value, or an empty SDValue when nothing
	/// applies or the constants could not be folded.
	SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
	                                    SDValue N1) {
	  const EVT VT = N0.getValueType();

	  // Case 1: the left operand is (op x, c1).
	  if (N0.getOpcode() == Opc) {
	    SDNode *InnerConst =
	        DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
	    if (InnerConst) {
	      SDNode *OuterConst = DAG.isConstantIntBuildVectorOrConstantInt(N1);
	      if (OuterConst) {
	        // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
	        SDValue Folded =
	            DAG.FoldConstantArithmetic(Opc, DL, VT, InnerConst, OuterConst);
	        if (!Folded.getNode())
	          return SDValue();
	        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), Folded);
	      }
	      if (N0.hasOneUse()) {
	        // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
	        // use
	        SDValue Inner = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
	        if (!Inner.getNode())
	          return SDValue();
	        AddToWorklist(Inner.getNode());
	        return DAG.getNode(Opc, DL, VT, Inner, N0.getOperand(1));
	      }
	    }
	  }

	  // Case 2: the right operand is (op x, c1) / (op y, c1).
	  if (N1.getOpcode() == Opc) {
	    SDNode *InnerConst =
	        DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1));
	    if (InnerConst) {
	      SDNode *OuterConst = DAG.isConstantIntBuildVectorOrConstantInt(N0);
	      if (OuterConst) {
	        // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
	        SDValue Folded =
	            DAG.FoldConstantArithmetic(Opc, DL, VT, InnerConst, OuterConst);
	        if (!Folded.getNode())
	          return SDValue();
	        return DAG.getNode(Opc, DL, VT, N1.getOperand(0), Folded);
	      }
	      if (N1.hasOneUse()) {
	        // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
	        // use
	        SDValue Inner = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
	        if (!Inner.getNode())
	          return SDValue();
	        AddToWorklist(Inner.getNode());
	        return DAG.getNode(Opc, DL, VT, Inner, N1.getOperand(1));
	      }
	    }
	  }

	  // No reassociation opportunity found.
	  return SDValue();
	}

	/// Replace every result of \p N with the corresponding entry of \p To.
	///
	/// \param N     node whose results are being replaced.
	/// \param To    array of \p NumTo replacement values; an entry may have a
	///              null node, otherwise its type must match the result it
	///              replaces (asserted).
	/// \param NumTo number of entries in \p To; must equal N->getNumValues().
	/// \param AddTo when true, each non-null replacement and its users are
	///              pushed onto the worklist.
	/// \returns SDValue(N, 0). If \p N ended up with no uses it has already
	///          been handed to deleteAndRecombine.
	SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
	                               bool AddTo) {
	  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
	  ++NodesCombined;
	  DEBUG(dbgs() << "\nReplacing.1 ";
	        N->dump(&DAG);
	        dbgs() << "\nWith: ";
	        To[0].getNode()->dump(&DAG);
	        dbgs() << " and " << NumTo-1 << " other values\n");
	  for (unsigned i = 0, e = NumTo; i != e; ++i)
	    assert((!To[i].getNode() ||
	            N->getValueType(i) == To[i].getValueType()) &&
	           "Cannot combine value to value of different type!");

	  // Keep the worklist in sync with any nodes deleted during replacement.
	  WorklistRemover DeadNodes(*this);
	  DAG.ReplaceAllUsesWith(N, To);
	  if (AddTo) {
	    // Push the new nodes and any users onto the worklist
	    for (unsigned i = 0, e = NumTo; i != e; ++i) {
	      if (To[i].getNode()) {
	        AddToWorklist(To[i].getNode());
	        AddUsersToWorklist(To[i].getNode());
	      }
	    }
	  }

	  // Finally, if the node is now dead, remove it from the graph. The node
	  // may not be dead if the replacement process recursively simplified to
	  // something else needing this node.
	  if (N->use_empty())
	    deleteAndRecombine(N);
	  return SDValue(N, 0);
	}

	/// Apply the replacement recorded in \p TLO: every use of TLO.Old is rewired
	/// to TLO.New, the new node and its users are queued for combining, and the
	/// old node is deleted if it ended up without uses.
	void DAGCombiner::CommitTargetLoweringOpt(
	    const TargetLowering::TargetLoweringOpt &TLO) {
	  SDNode *const OldNode = TLO.Old.getNode();
	  SDNode *const NewNode = TLO.New.getNode();

	  // Nodes that become isomorphic to others during the replacement are
	  // deleted; the remover keeps them out of our worklist.
	  WorklistRemover DeadNodes(*this);
	  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);

	  // The replacement and its (possibly new) users deserve another look.
	  AddToWorklist(NewNode);
	  AddUsersToWorklist(NewNode);

	  // The old node may still be alive if the replacement recursively
	  // simplified to something that needs it.
	  if (!OldNode->use_empty())
	    return;
	  deleteAndRecombine(OldNode);
	}

	/// Ask the target lowering whether \p Op, or something it uses, can be
	/// simplified given that only the bits in \p Demanded are needed. When it can,
	/// the recorded replacement is committed to the DAG.
	///
	/// \returns true if a simplification was performed.
	bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
	  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
	  KnownBits Known;
	  const bool Simplified = TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO);

	  if (Simplified) {
	    // Op itself needs to be revisited.
	    AddToWorklist(Op.getNode());

	    // Swap the old value for the new one.
	    ++NodesCombined;
	    DEBUG(dbgs() << "\nReplacing.2 ";
	          TLO.Old.getNode()->dump(&DAG);
	          dbgs() << "\nWith: ";
	          TLO.New.getNode()->dump(&DAG);
	          dbgs() << '\n');

	    CommitTargetLoweringOpt(TLO);
	  }
	  return Simplified;
	}

	void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode Load, SDNode ExtLoad) {
	SDLoc DL(Load);
	EVT VT = Load->getValueType(0);
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));

	DEBUG(dbgs() << "\nReplacing.9 ";
	Load->dump(&DAG);
	dbgs() << "\nWith: ";
	Trunc.getNode()->dump(&DAG);
	dbgs() << '\n');
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
	deleteAndRecombine(Load);
	AddToWorklist(Trunc.getNode());
	}

	/// Build a version of \p Op widened to type \p PVT.
	///
	/// \param Op      Operand to promote.
	/// \param PVT     Type to promote to.
	/// \param Replace Output flag. Set to true only when \p Op is an unindexed
	///                load and an extending load was created for it; false
	///                otherwise.
	/// \returns the promoted value, or a null SDValue when the fallback
	///          ANY_EXTEND is not legal for \p PVT.
	SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
	Replace = false;
	SDLoc DL(Op);
	if (ISD::isUNINDEXEDLoad(Op.getNode())) {
	LoadSDNode *LD = cast<LoadSDNode>(Op);
	EVT MemVT = LD->getMemoryVT();
	// A non-extending load becomes ZEXTLOAD when that is legal for (PVT, MemVT)
	// and EXTLOAD otherwise; an already-extending load keeps its extension kind.
	ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
	? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
	: ISD::EXTLOAD)
	: LD->getExtensionType();
	Replace = true;
	return DAG.getExtLoad(ExtType, DL, PVT,
	LD->getChain(), LD->getBasePtr(),
	MemVT, LD->getMemOperand());
	}

	unsigned Opc = Op.getOpcode();
	switch (Opc) {
	default: break;
	case ISD::AssertSext:
	// Re-create the assertion on the promoted operand; fall back to the generic
	// path below if the operand could not be promoted.
	if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
	return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
	break;
	case ISD::AssertZext:
	if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
	return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
	break;
	case ISD::Constant: {
	// Constants are sign-extended when their type is byte sized and
	// zero-extended otherwise.
	unsigned ExtOpc =
	Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	return DAG.getNode(ExtOpc, DL, PVT, Op);
	}
	}

	// Generic fallback: any-extend, provided the target supports it for PVT.
	if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
	return SDValue();
	return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
	}

	/// Promote \p Op to \p PVT and wrap the result in a SIGN_EXTEND_INREG from
	/// the operand's original type.
	///
	/// \returns the sign-extended promoted value, or a null SDValue when
	///          SIGN_EXTEND_INREG is not legal for \p PVT or the operand could not
	///          be promoted.
	SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
	  if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
	    return SDValue();

	  // Capture everything we need from Op up front; its node may be replaced
	  // below.
	  EVT NarrowVT = Op.getValueType();
	  SDLoc DL(Op);

	  bool PromotedALoad = false;
	  SDValue Wide = PromoteOperand(Op, PVT, PromotedALoad);
	  SDNode *WideNode = Wide.getNode();
	  if (WideNode == nullptr)
	    return SDValue();
	  AddToWorklist(WideNode);

	  if (PromotedALoad)
	    ReplaceLoadWithPromotedLoad(Op.getNode(), WideNode);

	  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Wide.getValueType(), Wide,
	                     DAG.getValueType(NarrowVT));
	}

	/// Promote \p Op to \p PVT and zero-extend the result in-register from the
	/// operand's original type.
	///
	/// \returns the zero-extended promoted value, or a null SDValue when the
	///          operand could not be promoted.
	SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
	  // Capture everything we need from Op up front; its node may be replaced
	  // below.
	  EVT NarrowVT = Op.getValueType();
	  SDLoc DL(Op);

	  bool PromotedALoad = false;
	  SDValue Wide = PromoteOperand(Op, PVT, PromotedALoad);
	  SDNode *WideNode = Wide.getNode();
	  if (WideNode == nullptr)
	    return SDValue();
	  AddToWorklist(WideNode);

	  if (PromotedALoad)
	    ReplaceLoadWithPromotedLoad(Op.getNode(), WideNode);

	  return DAG.getZeroExtendInReg(Wide, DL, NarrowVT);
	}

	/// Promote the specified integer binary operation if the target indicates it is
	/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
	/// i32 since i16 instructions are longer.
	///
	/// Only runs once operations are legal, and only for scalar integer types that
	/// the target reports as undesirable for this opcode.
	///
	/// \returns \p Op itself after it has been replaced through CombineTo, or a
	///          null SDValue when no promotion took place.
	SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return SDValue();

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");

	    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

	    bool Replace0 = false;
	    SDValue N0 = Op.getOperand(0);
	    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);

	    bool Replace1 = false;
	    SDValue N1 = Op.getOperand(1);
	    SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
	    SDLoc DL(Op);

	    // Perform the operation in the wide type and truncate back to VT.
	    SDValue RV =
	        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));

	    // We are always replacing N0/N1's use in N and only need
	    // additional replacements if there are additional uses.
	    Replace0 &= !N0->hasOneUse();
	    Replace1 &= (N0 != N1) && !N1->hasOneUse();

	    // Combine Op here so it is preserved past replacements.
	    CombineTo(Op.getNode(), RV);

	    // If operands have a use ordering, make sure we deal with
	    // predecessor first.
	    if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
	      std::swap(N0, N1);
	      std::swap(NN0, NN1);
	    }

	    if (Replace0) {
	      AddToWorklist(NN0.getNode());
	      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
	    }
	    if (Replace1) {
	      AddToWorklist(NN1.getNode());
	      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
	    }
	    return Op;
	  }
	  return SDValue();
	}

	/// Promote the specified integer shift operation if the target indicates it is
	/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
	/// i32 since i16 instructions are longer.
	///
	/// The shifted operand is sign-extended for SRA, zero-extended for SRL and
	/// promoted generically for any other opcode; the shift amount is reused
	/// unchanged.
	///
	/// \returns a TRUNCATE of the promoted shift, or a null SDValue when no
	///          promotion took place or \p Op was deleted along the way.
	SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return SDValue();

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");

	    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

	    bool Replace = false;
	    SDValue N0 = Op.getOperand(0);
	    SDValue N1 = Op.getOperand(1);
	    if (Opc == ISD::SRA)
	      N0 = SExtPromoteOperand(N0, PVT);
	    else if (Opc == ISD::SRL)
	      N0 = ZExtPromoteOperand(N0, PVT);
	    else
	      N0 = PromoteOperand(N0, PVT, Replace);

	    if (!N0.getNode())
	      return SDValue();

	    SDLoc DL(Op);
	    SDValue RV =
	        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));

	    AddToWorklist(N0.getNode());
	    if (Replace)
	      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());

	    // Deal with Op being deleted.
	    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
	      return RV;
	  }
	  return SDValue();
	}

	/// Re-create an extension node when the target considers its scalar integer
	/// result type undesirable and asks for promotion.
	///
	/// The node is rebuilt through DAG.getNode with the same opcode, result type
	/// and operand as \p Op.
	///
	/// \returns the rebuilt node, or a null SDValue when the preconditions for
	///          promotion are not met.
	SDValue DAGCombiner::PromoteExtend(SDValue Op) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return SDValue();

	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  EVT PVT = VT;
	  if (!TLI.IsDesirableToPromoteOp(Op, PVT))
	    return SDValue();

	  assert(PVT != VT && "Don't know what type to promote to!");
	  // fold (aext (aext x)) -> (aext x)
	  // fold (aext (zext x)) -> (zext x)
	  // fold (aext (sext x)) -> (sext x)
	  DEBUG(dbgs() << "\nPromoting ";
	        Op.getNode()->dump(&DAG));
	  return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
	}

	/// Promote an unindexed scalar integer load to a wider extending load when the
	/// target considers the original type undesirable.
	///
	/// The value result is replaced by a TRUNCATE of the new load and result 1 by
	/// the new load's result 1; the old load is then deleted.
	///
	/// \returns true if the load was replaced.
	bool DAGCombiner::PromoteLoad(SDValue Op) {
	  if (!LegalOperations)
	    return false;

	  if (!ISD::isUNINDEXEDLoad(Op.getNode()))
	    return false;

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return false;

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return false;

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");

	    SDLoc DL(Op);
	    SDNode *N = Op.getNode();
	    LoadSDNode *LD = cast<LoadSDNode>(N);
	    EVT MemVT = LD->getMemoryVT();
	    // A non-extending load becomes ZEXTLOAD when legal, EXTLOAD otherwise; an
	    // extending load keeps its extension kind.
	    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
	      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
	                                                       : ISD::EXTLOAD)
	      : LD->getExtensionType();
	    SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
	                                   LD->getChain(), LD->getBasePtr(),
	                                   MemVT, LD->getMemOperand());
	    SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);

	    DEBUG(dbgs() << "\nPromoting ";
	          N->dump(&DAG);
	          dbgs() << "\nTo: ";
	          Result.getNode()->dump(&DAG);
	          dbgs() << '\n');
	    // Keep the worklist consistent while nodes are replaced and deleted.
	    WorklistRemover DeadNodes(*this);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
	    deleteAndRecombine(N);
	    AddToWorklist(Result.getNode());
	    return true;
	  }
	  return false;
	}

	/// Delete \p N if it has no uses, then keep deleting any operands that become
	/// unused as a consequence.
	///
	/// Deleted nodes are also removed from the worklist. Operands that lose a user
	/// but stay alive are added to the worklist, since a reduced use count may
	/// enable other combines.
	///
	/// \returns false if \p N still has uses (nothing is touched), true otherwise.
	bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
	  if (!N->use_empty())
	    return false;

	  SmallSetVector<SDNode *, 16> Pending;
	  Pending.insert(N);
	  while (!Pending.empty()) {
	    SDNode *Cur = Pending.pop_back_val();
	    if (!Cur)
	      continue;

	    // Still referenced: keep it, but give it another chance to combine.
	    if (!Cur->use_empty()) {
	      AddToWorklist(Cur);
	      continue;
	    }

	    // Dead: remember the operands before the node goes away.
	    for (const SDValue &Operand : Cur->op_values())
	      Pending.insert(Operand.getNode());

	    removeFromWorklist(Cur);
	    DAG.DeleteNode(Cur);
	  }
	  return true;
	}

	//===----------------------------------------------------------------------===//
	// Main DAG Combiner implementation
	//===----------------------------------------------------------------------===//

	/// Run the combiner over the whole DAG at combine level \p AtLevel.
	///
	/// Every node is placed on the worklist; nodes are then popped one at a time,
	/// deleted if dead, re-legalized when running after DAG legalization, and
	/// handed to combine(). A replacement returned by combine() is substituted for
	/// the node, and the replacement and its users are queued again.
	void DAGCombiner::Run(CombineLevel AtLevel) {
	// set the instance variables, so that the various visit routines may use it.
	Level = AtLevel;
	LegalOperations = Level >= AfterLegalizeVectorOps;
	LegalTypes = Level >= AfterLegalizeTypes;

	// Add all the dag nodes to the worklist.
	for (SDNode &Node : DAG.allnodes())
	AddToWorklist(&Node);

	// Create a dummy node (which is not added to allnodes), that adds a reference
	// to the root node, preventing it from being deleted, and tracking any
	// changes of the root.
	HandleSDNode Dummy(DAG.getRoot());

	// While the worklist isn't empty, find a node and try to combine it.
	while (!WorklistMap.empty()) {
	SDNode *N;
	// The Worklist holds the SDNodes in order, but it may contain null entries.
	do {
	N = Worklist.pop_back_val();
	} while (!N);

	// Every non-null worklist entry must also be tracked in WorklistMap.
	bool GoodWorklistEntry = WorklistMap.erase(N);
	(void)GoodWorklistEntry;
	assert(GoodWorklistEntry &&
	"Found a worklist entry without a corresponding map entry!");

	// If N has no uses, it is dead. Make sure to revisit all N's operands once
	// N is deleted from the DAG, since they too may now be dead or may have a
	// reduced number of uses, allowing other xforms.
	if (recursivelyDeleteUnusedNodes(N))
	continue;

	WorklistRemover DeadNodes(*this);

	// If this combine is running after legalizing the DAG, re-legalize any
	// nodes pulled off the worklist.
	if (Level == AfterLegalizeDAG) {
	SmallSetVector<SDNode *, 16> UpdatedNodes;
	bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);

	for (SDNode *LN : UpdatedNodes) {
	AddToWorklist(LN);
	AddUsersToWorklist(LN);
	}
	// LegalizeOp reported that N is no longer valid; move on to the next entry.
	if (!NIsValid)
	continue;
	}

	DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));

	// Add any operands of the new node which have not yet been combined to the
	// worklist as well. Because the worklist uniques things already, this
	// won't repeatedly process the same operand.
	CombinedNodes.insert(N);
	for (const SDValue &ChildN : N->op_values())
	if (!CombinedNodes.count(ChildN.getNode()))
	AddToWorklist(ChildN.getNode());

	SDValue RV = combine(N);

	// A null result means nothing was combined.
	if (!RV.getNode())
	continue;

	++NodesCombined;

	// If we get back the same node we passed in, rather than a new node or
	// zero, we know that the node must have defined multiple values and
	// CombineTo was used. Since CombineTo takes care of the worklist
	// mechanics for us, we have no work to do in this case.
	if (RV.getNode() == N)
	continue;

	assert(N->getOpcode() != ISD::DELETED_NODE &&
	RV.getOpcode() != ISD::DELETED_NODE &&
	"Node was deleted but visit returned new node!");

	DEBUG(dbgs() << " ... into: ";
	RV.getNode()->dump(&DAG));

	// Same number of results: replace node for node. Otherwise N must have a
	// single result, which is replaced by the value RV.
	if (N->getNumValues() == RV.getNode()->getNumValues())
	DAG.ReplaceAllUsesWith(N, RV.getNode());
	else {
	assert(N->getValueType(0) == RV.getValueType() &&
	N->getNumValues() == 1 && "Type mismatch");
	DAG.ReplaceAllUsesWith(N, &RV);
	}

	// Push the new node and any users onto the worklist
	AddToWorklist(RV.getNode());
	AddUsersToWorklist(RV.getNode());

	// Finally, if the node is now dead, remove it from the graph. The node
	// may not be dead if the replacement process recursively simplified to
	// something else needing this node. This will also take care of adding any
	// operands which have lost a user to the worklist.
	recursivelyDeleteUnusedNodes(N);
	}

	// If the root changed (e.g. it was a dead load), update the root.
	DAG.setRoot(Dummy.getValue());
	DAG.RemoveDeadNodes();
	}

	/// Dispatch \p N to the visit routine that matches its opcode.
	///
	/// \returns whatever the selected visit routine returns, or a null SDValue for
	///          opcodes that have no dedicated routine.
	SDValue DAGCombiner::visit(SDNode *N) {
	switch (N->getOpcode()) {
	default: break;
	case ISD::TokenFactor: return visitTokenFactor(N);
	case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
	case ISD::ADD: return visitADD(N);
	case ISD::SUB: return visitSUB(N);
	case ISD::ADDC: return visitADDC(N);
	case ISD::UADDO: return visitUADDO(N);
	case ISD::SUBC: return visitSUBC(N);
	case ISD::USUBO: return visitUSUBO(N);
	case ISD::ADDE: return visitADDE(N);
	case ISD::ADDCARRY: return visitADDCARRY(N);
	case ISD::SUBE: return visitSUBE(N);
	case ISD::SUBCARRY: return visitSUBCARRY(N);
	case ISD::MUL: return visitMUL(N);
	case ISD::SDIV: return visitSDIV(N);
	case ISD::UDIV: return visitUDIV(N);
	// Signed and unsigned remainder share one routine.
	case ISD::SREM:
	case ISD::UREM: return visitREM(N);
	case ISD::MULHU: return visitMULHU(N);
	case ISD::MULHS: return visitMULHS(N);
	case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
	case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
	case ISD::SMULO: return visitSMULO(N);
	case ISD::UMULO: return visitUMULO(N);
	// All integer min/max flavours share one routine.
	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX: return visitIMINMAX(N);
	case ISD::AND: return visitAND(N);
	case ISD::OR: return visitOR(N);
	case ISD::XOR: return visitXOR(N);
	case ISD::SHL: return visitSHL(N);
	case ISD::SRA: return visitSRA(N);
	case ISD::SRL: return visitSRL(N);
	// Both rotate directions share one routine.
	case ISD::ROTR:
	case ISD::ROTL: return visitRotate(N);
	case ISD::ABS: return visitABS(N);
	case ISD::BSWAP: return visitBSWAP(N);
	case ISD::BITREVERSE: return visitBITREVERSE(N);
	case ISD::CTLZ: return visitCTLZ(N);
	case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
	case ISD::CTTZ: return visitCTTZ(N);
	case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
	case ISD::CTPOP: return visitCTPOP(N);
	case ISD::SELECT: return visitSELECT(N);
	case ISD::VSELECT: return visitVSELECT(N);
	case ISD::SELECT_CC: return visitSELECT_CC(N);
	case ISD::SETCC: return visitSETCC(N);
	case ISD::SETCCE: return visitSETCCE(N);
	case ISD::SETCCCARRY: return visitSETCCCARRY(N);
	case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
	case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
	case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
	// Both assertion kinds share one routine.
	case ISD::AssertSext:
	case ISD::AssertZext: return visitAssertExt(N);
	case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
	case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
	case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
	case ISD::TRUNCATE: return visitTRUNCATE(N);
	case ISD::BITCAST: return visitBITCAST(N);
	case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
	case ISD::FADD: return visitFADD(N);
	case ISD::FSUB: return visitFSUB(N);
	case ISD::FMUL: return visitFMUL(N);
	case ISD::FMA: return visitFMA(N);
	case ISD::FDIV: return visitFDIV(N);
	case ISD::FREM: return visitFREM(N);
	case ISD::FSQRT: return visitFSQRT(N);
	case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
	case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
	case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
	case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
	case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
	case ISD::FP_ROUND: return visitFP_ROUND(N);
	case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N);
	case ISD::FP_EXTEND: return visitFP_EXTEND(N);
	case ISD::FNEG: return visitFNEG(N);
	case ISD::FABS: return visitFABS(N);
	case ISD::FFLOOR: return visitFFLOOR(N);
	case ISD::FMINNUM: return visitFMINNUM(N);
	case ISD::FMAXNUM: return visitFMAXNUM(N);
	case ISD::FCEIL: return visitFCEIL(N);
	case ISD::FTRUNC: return visitFTRUNC(N);
	case ISD::BRCOND: return visitBRCOND(N);
	case ISD::BR_CC: return visitBR_CC(N);
	case ISD::LOAD: return visitLOAD(N);
	case ISD::STORE: return visitSTORE(N);
	case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
	case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
	case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
	case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
	case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
	case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
	case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
	case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
	case ISD::MGATHER: return visitMGATHER(N);
	case ISD::MLOAD: return visitMLOAD(N);
	case ISD::MSCATTER: return visitMSCATTER(N);
	case ISD::MSTORE: return visitMSTORE(N);
	case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
	case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
	}
	return SDValue();
	}

	/// Try every available way of simplifying \p N, in order:
	///   1. the generic visit routine for its opcode,
	///   2. the target's PerformDAGCombine hook (for target opcodes, or opcodes the
	///      target registered a combine for),
	///   3. promotion of the operation to a wider type,
	///   4. reuse of an already existing commuted twin of a commutative binary
	///      node.
	///
	/// \returns the replacement value, \p N itself when a step already replaced it
	///          in place, or a null SDValue when nothing applied.
	SDValue DAGCombiner::combine(SDNode *N) {
	  SDValue RV = visit(N);

	  // If nothing happened, try a target-specific DAG combine.
	  if (!RV.getNode()) {
	    assert(N->getOpcode() != ISD::DELETED_NODE &&
	           "Node was deleted but visit returned NULL!");

	    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
	        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {

	      // Expose the DAG combiner to the target combiner impls.
	      TargetLowering::DAGCombinerInfo
	        DagCombineInfo(DAG, Level, false, this);

	      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
	    }
	  }

	  // If nothing happened still, try promoting the operation.
	  if (!RV.getNode()) {
	    switch (N->getOpcode()) {
	    default: break;
	    case ISD::ADD:
	    case ISD::SUB:
	    case ISD::MUL:
	    case ISD::AND:
	    case ISD::OR:
	    case ISD::XOR:
	      RV = PromoteIntBinOp(SDValue(N, 0));
	      break;
	    case ISD::SHL:
	    case ISD::SRA:
	    case ISD::SRL:
	      RV = PromoteIntShiftOp(SDValue(N, 0));
	      break;
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	      RV = PromoteExtend(SDValue(N, 0));
	      break;
	    case ISD::LOAD:
	      // PromoteLoad performs the replacement itself; report N as handled.
	      if (PromoteLoad(SDValue(N, 0)))
	        RV = SDValue(N, 0);
	      break;
	    }
	  }

	  // If N is a commutative binary node, try to eliminate it if the commuted
	  // version is already present in the DAG.
	  if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
	      N->getNumValues() == 1) {
	    SDValue N0 = N->getOperand(0);
	    SDValue N1 = N->getOperand(1);

	    // Constant operands are canonicalized to RHS.
	    if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
	      SDValue Ops[] = {N1, N0};
	      SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
	                                            N->getFlags());
	      if (CSENode)
	        return SDValue(CSENode, 0);
	    }
	  }

	  return RV;
	}

	/// Return the first operand of \p N whose type is MVT::Other, probing operand
	/// 0 first, then the last operand, then the remaining ones in order. Returns a
	/// null SDValue if \p N has no such operand.
	static SDValue getInputChainForNode(SDNode *N) {
	  const unsigned NumOps = N->getNumOperands();
	  if (NumOps == 0)
	    return SDValue();

	  auto IsChain = [N](unsigned Idx) {
	    return N->getOperand(Idx).getValueType() == MVT::Other;
	  };

	  const unsigned Last = NumOps - 1;
	  if (IsChain(0))
	    return N->getOperand(0);
	  if (IsChain(Last))
	    return N->getOperand(Last);
	  for (unsigned Idx = 1; Idx < Last; ++Idx)
	    if (IsChain(Idx))
	      return N->getOperand(Idx);
	  return SDValue();
	}

	/// Simplify the TokenFactor \p N.
	///
	/// Nested single-use token factors are flattened into \p N, EntryToken and
	/// duplicate operands are dropped, and operands that are reached while walking
	/// up the chain of another operand are pruned.
	///
	/// \returns the simplified chain value, or a null SDValue if \p N is left
	///          unchanged.
	SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
	// If N has two operands, where one has an input chain equal to the other,
	// the 'other' chain is redundant.
	if (N->getNumOperands() == 2) {
	if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
	return N->getOperand(0);
	if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
	return N->getOperand(1);
	}

	SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
	SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
	SmallPtrSet<SDNode*, 16> SeenOps;
	bool Changed = false; // If we should replace this token factor.

	// Start out with this token factor.
	TFs.push_back(N);

	// Iterate through token factors. The TFs grows when new token factors are
	// encountered.
	for (unsigned i = 0; i < TFs.size(); ++i) {
	SDNode *TF = TFs[i];

	// Check each of the operands.
	for (const SDValue &Op : TF->op_values()) {
	switch (Op.getOpcode()) {
	case ISD::EntryToken:
	// Entry tokens don't need to be added to the list. They are
	// redundant.
	Changed = true;
	break;

	case ISD::TokenFactor:
	if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
	// Queue up for processing.
	TFs.push_back(Op.getNode());
	// Clean up in case the token factor is removed.
	AddToWorklist(Op.getNode());
	Changed = true;
	break;
	}
	// A token factor that has other users, or is already queued, is kept as
	// an ordinary operand.
	LLVM_FALLTHROUGH;

	default:
	// Only add if it isn't already in the list.
	if (SeenOps.insert(Op.getNode()).second)
	Ops.push_back(Op);
	else
	Changed = true;
	break;
	}
	}
	}

	// Remove Nodes that are chained to another node in the list. Do so
	// by walking up chains breadth-first, stopping when we've seen
	// another operand. In general we must climb to the EntryNode, but we can exit
	// early if we find all remaining work is associated with just one operand as
	// no further pruning is possible.

	// List of nodes to search through and original Ops from which they originate.
	SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
	SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
	SmallPtrSet<SDNode *, 16> SeenChains;
	bool DidPruneOps = false;

	// Seed the search with one entry per operand, tagged with the operand's index.
	unsigned NumLeftToConsider = 0;
	for (const SDValue &Op : Ops) {
	Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
	OpWorkCount.push_back(1);
	}

	// Note: this local lambda shadows the member function of the same name for
	// the rest of this function.
	auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
	// If this is an Op, we can remove the op from the list. Remark any
	// search associated with it as from the current OpNumber.
	if (SeenOps.count(Op) != 0) {
	Changed = true;
	DidPruneOps = true;
	unsigned OrigOpNumber = 0;
	while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
	OrigOpNumber++;
	assert((OrigOpNumber != Ops.size()) &&
	"expected to find TokenFactor Operand");
	// Re-mark worklist from OrigOpNumber to OpNumber
	for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
	if (Worklist[i].second == OrigOpNumber) {
	Worklist[i].second = OpNumber;
	}
	}
	// Transfer the pending work of the pruned operand to the current one.
	OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
	OpWorkCount[OrigOpNumber] = 0;
	NumLeftToConsider--;
	}
	// Add if it's a new chain
	if (SeenChains.insert(Op).second) {
	OpWorkCount[OpNumber]++;
	Worklist.push_back(std::make_pair(Op, OpNumber));
	}
	};

	// The walk is capped at the first 1024 worklist entries.
	for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
	// We need to be considering at least 2 Ops to prune.
	if (NumLeftToConsider <= 1)
	break;
	auto CurNode = Worklist[i].first;
	auto CurOpNumber = Worklist[i].second;
	assert((OpWorkCount[CurOpNumber] > 0) &&
	"Node should not appear in worklist");
	switch (CurNode->getOpcode()) {
	case ISD::EntryToken:
	// Hitting EntryToken is the only way for the search to terminate without
	// hitting another operand's search. Prevent us from marking this operand
	// considered.
	NumLeftToConsider++;
	break;
	case ISD::TokenFactor:
	for (const SDValue &Op : CurNode->op_values())
	AddToWorklist(i, Op.getNode(), CurOpNumber);
	break;
	case ISD::CopyFromReg:
	case ISD::CopyToReg:
	AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
	break;
	default:
	// Only memory nodes are followed further up; the walk stops at any other
	// kind of node.
	if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
	AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
	break;
	}
	// This entry is done; an operand with no work left is fully considered.
	OpWorkCount[CurOpNumber]--;
	if (OpWorkCount[CurOpNumber] == 0)
	NumLeftToConsider--;
	}

	// If we've changed things around then replace token factor.
	if (Changed) {
	SDValue Result;
	if (Ops.empty()) {
	// The entry token is the only possible outcome.
	Result = DAG.getEntryNode();
	} else {
	if (DidPruneOps) {
	SmallVector<SDValue, 8> PrunedOps;
	// Keep only the operands that were not reached from another operand's
	// chain.
	for (const SDValue &Op : Ops) {
	if (SeenChains.count(Op.getNode()) == 0)
	PrunedOps.push_back(Op);
	}
	Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
	} else {
	Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
	}
	}
	return Result;
	}
	return SDValue();
	}

	/// MERGE_VALUES can always be eliminated.
	///
	/// Each result i of \p N is replaced by operand i of \p N, after which the
	/// node is deleted.
	///
	/// \returns SDValue(N, 0) so that the caller does not process \p N again.
	SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
	WorklistRemover DeadNodes(*this);
	// Replacing results may cause a different MERGE_VALUES to suddenly
	// be CSE'd with N, and carry its uses with it. Iterate until no
	// uses remain, to ensure that the node can be safely deleted.
	// First add the users of this node to the work list so that they
	// can be tried again once they have new operands.
	AddUsersToWorklist(N);
	do {
	for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
	DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i));
	} while (!N->use_empty());
	deleteAndRecombine(N);
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	/// Return \p N as a ConstantSDNode pointer if it is a ConstantSDNode whose
	/// isOpaque() is false; return nullptr in every other case.
	static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
	  if (auto *C = dyn_cast<ConstantSDNode>(N))
	    if (!C->isOpaque())
	      return C;
	  return nullptr;
	}

	/// Fold a binary operator with a constant right-hand side into a single-use
	/// select of constants:
	///   binop (select Cond, CT, CF), C1 --> select Cond, (CT binop C1), (CF binop C1)
	///
	/// \param BO Binary operator node; its opcode must be one of the integer or
	///           floating-point operators listed in the assertion below.
	/// \returns the new select, or a null SDValue when the pattern does not match
	///          or one of the new arms is neither undef nor a constant.
	SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
	  auto BinOpcode = BO->getOpcode();
	  assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB ||
	          BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV ||
	          BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM ||
	          BinOpcode == ISD::UREM || BinOpcode == ISD::AND ||
	          BinOpcode == ISD::OR || BinOpcode == ISD::XOR ||
	          BinOpcode == ISD::SHL || BinOpcode == ISD::SRL ||
	          BinOpcode == ISD::SRA || BinOpcode == ISD::FADD ||
	          BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL ||
	          BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
	         "Unexpected binary operator");

	  // Accepts non-opaque integer constants / constant vectors and FP constants /
	  // constant FP build vectors.
	  auto IsFoldableConstant = [](SDValue V) {
	    return isConstantOrConstantVector(V, true) ||
	           isConstantFPBuildVectorOrConstantFP(V);
	  };

	  // Bail out if any constants are opaque because we can't constant fold those.
	  SDValue C1 = BO->getOperand(1);
	  if (!IsFoldableConstant(C1))
	    return SDValue();

	  // Don't do this unless the old select is going away. We want to eliminate the
	  // binary operator, not replace a binop with a select.
	  // TODO: Handle ISD::SELECT_CC.
	  SDValue Sel = BO->getOperand(0);
	  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
	    return SDValue();

	  SDValue CT = Sel.getOperand(1);
	  if (!IsFoldableConstant(CT))
	    return SDValue();

	  SDValue CF = Sel.getOperand(2);
	  if (!IsFoldableConstant(CF))
	    return SDValue();

	  // We have a select-of-constants followed by a binary operator with a
	  // constant. Eliminate the binop by pulling the constant math into the select.
	  // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
	  EVT VT = Sel.getValueType();
	  SDLoc DL(Sel);
	  SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1);
	  if (!NewCT.isUndef() && !IsFoldableConstant(NewCT))
	    return SDValue();

	  SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1);
	  if (!NewCF.isUndef() && !IsFoldableConstant(NewCF))
	    return SDValue();

	  return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
	}

	SDValue DAGCombiner::visitADD(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N0.getValueType();
	SDLoc DL(N);

	// fold vector ops
	if (VT.isVector()) {
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// fold (add x, 0) -> x, vector edition
	if (ISD::isBuildVectorAllZeros(N1.getNode()))
	return N0;
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return N1;
	}

	// fold (add x, undef) -> undef
	if (N0.isUndef())
	return N0;

	if (N1.isUndef())
	return N1;

	if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
	// canonicalize constant to RHS
	if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
	return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
	// fold (add c1, c2) -> c1+c2
	return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(),
	N1.getNode());
	}

	// fold (add x, 0) -> x
	if (isNullConstant(N1))
	return N0;

	if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
	// fold ((c1-A)+c2) -> (c1+c2)-A
	if (N0.getOpcode() == ISD::SUB &&
	isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
	// FIXME: Adding 2 constants should be handled by FoldConstantArithmetic.
	return DAG.getNode(ISD::SUB, DL, VT,
	DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
	N0.getOperand(1));
	}

	// add (sext i1 X), 1 -> zext (not i1 X)
	// We don't transform this pattern:
	// add (zext i1 X), -1 -> sext (not i1 X)
	// because most (?) targets generate better code for the zext form.
	if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
	isOneConstantOrOneSplatConstant(N1)) {
	SDValue X = N0.getOperand(0);
	if ((!LegalOperations \|\|
	(TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
	TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
	X.getScalarValueSizeInBits() == 1) {
	SDValue Not = DAG.getNOT(DL, X, X.getValueType());
	return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
	}
	}

	// Undo the add -> or combine to merge constant offsets from a frame index.
	if (N0.getOpcode() == ISD::OR &&
	isa<FrameIndexSDNode>(N0.getOperand(0)) &&
	isa<ConstantSDNode>(N0.getOperand(1)) &&
	DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
	SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
	return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
	}
	}

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// reassociate add
	if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
	return RADD;

	// fold ((0-A) + B) -> B-A
	if (N0.getOpcode() == ISD::SUB &&
	isNullConstantOrNullSplatConstant(N0.getOperand(0)))
	return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));

	// fold (A + (0-B)) -> A-B
	if (N1.getOpcode() == ISD::SUB &&
	isNullConstantOrNullSplatConstant(N1.getOperand(0)))
	return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));

	// fold (A+(B-A)) -> B
	if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
	return N1.getOperand(0);

	// fold ((B-A)+A) -> B
	if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
	return N0.getOperand(0);

	// fold (A+(B-(A+C))) to (B-C)
	if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
	N0 == N1.getOperand(1).getOperand(0))
	return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
	N1.getOperand(1).getOperand(1));

	// fold (A+(B-(C+A))) to (B-C)
	if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
	N0 == N1.getOperand(1).getOperand(1))
	return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
	N1.getOperand(1).getOperand(0));

	// fold (A+((B-A)+or-C)) to (B+or-C)
	if ((N1.getOpcode() == ISD::SUB \|\| N1.getOpcode() == ISD::ADD) &&
	N1.getOperand(0).getOpcode() == ISD::SUB &&
	N0 == N1.getOperand(0).getOperand(1))
	return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
	N1.getOperand(1));

	// fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
	if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);

	if (isConstantOrConstantVector(N00) \|\| isConstantOrConstantVector(N10))
	return DAG.getNode(ISD::SUB, DL, VT,
	DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
	DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
	}

	if (SimplifyDemandedBits(SDValue(N, 0)))
	return SDValue(N, 0);

	// fold (a+b) -> (a\|b) iff a and b share no bits.
	if ((!LegalOperations \|\| TLI.isOperationLegal(ISD::OR, VT)) &&
	DAG.haveNoCommonBitsSet(N0, N1))
	return DAG.getNode(ISD::OR, DL, VT, N0, N1);

	if (SDValue Combined = visitADDLike(N0, N1, N))
	return Combined;

	if (SDValue Combined = visitADDLike(N1, N0, N))
	return Combined;

	return SDValue();
	}

	/// Strip legalization wrappers from \p V and return the carry value beneath.
	///
	/// TRUNCATE and ZERO_EXTEND nodes are looked through, as are (and X, 1)
	/// masks. The stripped value is returned only when it is result #1 of an
	/// ADDCARRY, SUBCARRY, UADDO or USUBO node and it was either masked with 1 or
	/// the target reports ZeroOrOneBooleanContent for its type. In every other
	/// case an empty SDValue is returned.
	static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
	  // Set once an (and X, 1) wrapper has been removed.
	  bool Masked = false;

	  // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
	  while (true) {
	    if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
	      V = V.getOperand(0);
	      continue;
	    }

	    if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
	      Masked = true;
	      V = V.getOperand(0);
	      continue;
	    }

	    break;
	  }

	  // If this is not a carry, return.
	  if (V.getResNo() != 1)
	    return SDValue();

	  if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
	      V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
	    return SDValue();

	  // If the result is masked, then no matter what kind of bool it is we can
	  // return. If it isn't, then we need to make sure the bool type is either 0 or
	  // 1 and not other values.
	  if (Masked ||
	      TLI.getBooleanContents(V.getValueType()) ==
	          TargetLoweringBase::ZeroOrOneBooleanContent)
	    return V;

	  return SDValue();
	}

	/// Order-sensitive folds for an addition of \p N0 and \p N1.
	///
	/// Every pattern below inspects a specific side of the addition, so callers
	/// that want both orientations must invoke this twice with the operands
	/// swapped. \p LocReference only supplies the debug location for new nodes.
	/// Returns the replacement value, or an empty SDValue when nothing matched.
	SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
	  const EVT VT = N0.getValueType();
	  SDLoc DL(LocReference);
	  const unsigned RHSOpc = N1.getOpcode();

	  // (add x, (shl (sub 0, y), n)) --> (sub x, (shl y, n))
	  if (RHSOpc == ISD::SHL) {
	    SDValue Shifted = N1.getOperand(0);
	    if (Shifted.getOpcode() == ISD::SUB &&
	        isNullConstantOrNullSplatConstant(Shifted.getOperand(0))) {
	      SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, Shifted.getOperand(1),
	                                   N1.getOperand(1));
	      return DAG.getNode(ISD::SUB, DL, VT, N0, NewShl);
	    }
	  }

	  if (RHSOpc == ISD::AND) {
	    SDValue MaskedVal = N1.getOperand(0);
	    const unsigned SignBits = DAG.ComputeNumSignBits(MaskedVal);
	    const unsigned EltBits = VT.getScalarSizeInBits();

	    // When every bit of the masked value is a sign bit it is either 0 or ~0,
	    // so adding (and v, 1) equals subtracting v:
	    //   (add z, (and (sbbl x, x), 1)) --> (sub z, (sbbl x, x))
	    if (SignBits == EltBits &&
	        isOneConstantOrOneSplatConstant(N1->getOperand(1)))
	      return DAG.getNode(ISD::SUB, DL, VT, N0, MaskedVal);
	  }

	  // (add (sext i1 b), x) --> (sub x, (zext i1 b))
	  if (N0.getOpcode() == ISD::SIGN_EXTEND) {
	    SDValue Bit = N0.getOperand(0);
	    if (Bit.getValueType() == MVT::i1 &&
	        !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
	      SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Bit);
	      return DAG.getNode(ISD::SUB, DL, VT, N1, Widened);
	    }
	  }

	  // (add x, (sextinreg y, i1)) --> (sub x, (and y, 1))
	  if (RHSOpc == ISD::SIGN_EXTEND_INREG &&
	      cast<VTSDNode>(N1.getOperand(1))->getVT() == MVT::i1) {
	    SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
	                                 DAG.getConstant(1, DL, VT));
	    return DAG.getNode(ISD::SUB, DL, VT, N0, LowBit);
	  }

	  // (add x, (addcarry y, 0, c)) --> (addcarry x, y, c)
	  // Only applies when N1 is the sum result (result #0) of the ADDCARRY.
	  if (RHSOpc == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
	      N1.getResNo() == 0)
	    return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0,
	                       N1.getOperand(0), N1.getOperand(2));

	  // (add x, carry) --> (addcarry x, 0, carry)
	  if (!TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
	    return SDValue();

	  SDValue Carry = getAsCarry(TLI, N1);
	  if (!Carry)
	    return SDValue();

	  return DAG.getNode(ISD::ADDCARRY, DL,
	                     DAG.getVTList(VT, Carry.getValueType()), N0,
	                     DAG.getConstant(0, DL, VT), Carry);
	}

	/// Combine an ADDC node (addition producing a glue carry result).
	///
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitADDC(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDLoc DL(N);

	  // Builds the "no carry" glue value used by the folds below.
	  auto NoCarry = [&]() {
	    return DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue);
	  };
	  // Replaces N by a plain ADD of its operands paired with "no carry".
	  auto ReplaceWithAdd = [&]() {
	    return CombineTo(N, DAG.getNode(ISD::ADD, DL, LHS.getValueType(), LHS, RHS),
	                     NoCarry());
	  };

	  // A dead carry result means an ordinary ADD is enough.
	  if (!N->hasAnyUseOfValue(1))
	    return ReplaceWithAdd();

	  // Move a lone constant operand to the right-hand side.
	  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS))
	    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), RHS, LHS);

	  // (addc x, 0) --> x, with no carry out.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, NoCarry());

	  // An addition that provably never overflows is an ordinary ADD.
	  if (DAG.computeOverflowKind(LHS, RHS) == SelectionDAG::OFK_Never)
	    return ReplaceWithAdd();

	  return SDValue();
	}

	/// Combine a scalar UADDO node (unsigned add with an overflow result).
	///
	/// Vector-typed nodes are left alone. Returns the replacement value, or an
	/// empty SDValue when no fold applies.
	SDValue DAGCombiner::visitUADDO(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const EVT VT = LHS.getValueType();
	  if (VT.isVector())
	    return SDValue();

	  const EVT CarryVT = N->getValueType(1);
	  SDLoc DL(N);

	  // The overflow bit is unused: emit a plain ADD and leave the flag undef.
	  if (!N->hasAnyUseOfValue(1)) {
	    SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, LHS, RHS);
	    return CombineTo(N, Sum, DAG.getUNDEF(CarryVT));
	  }

	  // Move a lone constant operand to the right-hand side.
	  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS))
	    return DAG.getNode(ISD::UADDO, DL, N->getVTList(), RHS, LHS);

	  // (uaddo x, 0) --> x, with no carry out.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, DAG.getConstant(0, DL, CarryVT));

	  // An addition that provably never overflows is an ordinary ADD with a
	  // constant-zero overflow bit.
	  if (DAG.computeOverflowKind(LHS, RHS) == SelectionDAG::OFK_Never) {
	    SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, LHS, RHS);
	    return CombineTo(N, Sum, DAG.getConstant(0, DL, CarryVT));
	  }

	  // Try the order-sensitive folds in both operand orientations.
	  if (SDValue Folded = visitUADDOLike(LHS, RHS, N))
	    return Folded;

	  if (SDValue Folded = visitUADDOLike(RHS, LHS, N))
	    return Folded;

	  return SDValue();
	}

	/// Order-sensitive folds for the UADDO node \p N with operands \p N0, \p N1.
	///
	/// Only \p N1 is inspected for the carry patterns, so the caller is expected
	/// to try both operand orders. Returns the replacement value, or an empty
	/// SDValue when nothing matched.
	SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
	  SDLoc DL(N);

	  // (uaddo X, (addcarry Y, 0, Carry)) --> (addcarry X, Y, Carry)
	  // provided that Y + 1 cannot overflow.
	  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
	    SDValue Addend = N1.getOperand(0);
	    SDValue One = DAG.getConstant(1, DL, Addend.getValueType());
	    if (DAG.computeOverflowKind(Addend, One) == SelectionDAG::OFK_Never)
	      return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0, Addend,
	                         N1.getOperand(2));
	  }

	  // (uaddo X, Carry) --> (addcarry X, 0, Carry)
	  const EVT VT = N0.getValueType();
	  if (!TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
	    return SDValue();

	  SDValue Carry = getAsCarry(TLI, N1);
	  if (!Carry)
	    return SDValue();

	  return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0,
	                     DAG.getConstant(0, DL, VT), Carry);
	}

	/// Combine an ADDE node (addition with a glue carry input).
	///
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitADDE(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDValue CarryIn = N->getOperand(2);

	  // Move a lone constant operand to the right-hand side.
	  const bool OnlyLHSIsConstant =
	      isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS);
	  if (OnlyLHSIsConstant)
	    return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), RHS, LHS, CarryIn);

	  // (adde x, y, false) --> (addc x, y)
	  if (CarryIn.getOpcode() != ISD::CARRY_FALSE)
	    return SDValue();

	  return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), LHS, RHS);
	}

	/// Combine an ADDCARRY node (addition with a carry input and a carry output).
	///
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDValue CarryIn = N->getOperand(2);
	  SDLoc DL(N);

	  // Move a lone constant operand to the right-hand side.
	  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS))
	    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), RHS, LHS, CarryIn);

	  // (addcarry x, y, false) --> (uaddo x, y)
	  if (isNullConstant(CarryIn))
	    return DAG.getNode(ISD::UADDO, DL, N->getVTList(), LHS, RHS);

	  // (addcarry 0, 0, X) --> (and (ext/trunc X), 1), with no carry out.
	  if (isNullConstant(LHS) && isNullConstant(RHS)) {
	    const EVT VT = LHS.getValueType();
	    const EVT CarryVT = CarryIn.getValueType();

	    SDValue Widened = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
	    AddToWorklist(Widened.getNode());

	    SDValue LowBit =
	        DAG.getNode(ISD::AND, DL, VT, Widened, DAG.getConstant(1, DL, VT));
	    SDValue NoCarry = DAG.getConstant(0, DL, CarryVT);
	    return CombineTo(N, LowBit, NoCarry);
	  }

	  // Try the order-sensitive folds in both operand orientations.
	  if (SDValue Folded = visitADDCARRYLike(LHS, RHS, CarryIn, N))
	    return Folded;

	  if (SDValue Folded = visitADDCARRYLike(RHS, LHS, CarryIn, N))
	    return Folded;

	  return SDValue();
	}

	/// Order-sensitive folds for the ADDCARRY node \p N with operands \p N0,
	/// \p N1 and carry input \p CarryIn.
	///
	/// The patterns look at \p N0 and \p N1 asymmetrically, so the caller is
	/// expected to try both operand orders. Returns the replacement value, or an
	/// empty SDValue when nothing matched.
	SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
	                                       SDNode *N) {
	  // Iff the flag result is dead:
	  // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
	  if ((N0.getOpcode() == ISD::ADD ||
	       (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) &&
	      isNullConstant(N1) && !N->hasAnyUseOfValue(1))
	    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
	                       N0.getOperand(0), N0.getOperand(1), CarryIn);

	  /**
	   * When one of the addcarry argument is itself a carry, we may be facing
	   * a diamond carry propagation. In which case we try to transform the DAG
	   * to ensure linear carry propagation if that is possible.
	   *
	   * We are trying to get:
	   *   (addcarry X, 0, (addcarry A, B, Z):Carry)
	   */
	  if (auto Y = getAsCarry(TLI, N1)) {
	    /**
	     *            (uaddo A, B)
	     *             /       \
	     *          Carry      Sum
	     *            |          \
	     *            | (addcarry *, 0, Z)
	     *            |       /
	     *             \   Carry
	     *              |   /
	     * (addcarry X, *, *)
	     */
	    if (Y.getOpcode() == ISD::UADDO &&
	        CarryIn.getResNo() == 1 &&
	        CarryIn.getOpcode() == ISD::ADDCARRY &&
	        isNullConstant(CarryIn.getOperand(1)) &&
	        CarryIn.getOperand(0) == Y.getValue(0)) {
	      // Fold Z directly into the first addition, then feed its carry
	      // output into a single ADDCARRY on X.
	      auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(),
	                              Y.getOperand(0), Y.getOperand(1),
	                              CarryIn.getOperand(2));
	      AddToWorklist(NewY.getNode());
	      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
	                         DAG.getConstant(0, SDLoc(N), N0.getValueType()),
	                         NewY.getValue(1));
	    }
	  }

	  return SDValue();
	}

	// Since it may not be valid to emit a fold to zero for vector initializers
	// check if we can before folding.
	//
	// Returns a zero constant of type VT. For vector types this is done only
	// when operations need not be legal yet or BUILD_VECTOR is legal for VT;
	// otherwise an empty SDValue is returned. LegalTypes is accepted but not
	// consulted by this function.
	static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
	                             SelectionDAG &DAG, bool LegalOperations,
	                             bool LegalTypes) {
	  if (!VT.isVector())
	    return DAG.getConstant(0, DL, VT);
	  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
	    return DAG.getConstant(0, DL, VT);
	  return SDValue();
	}

	/// Combine a SUB node.
	///
	/// Tries a sequence of algebraic simplifications (constant folding,
	/// cancellation against ADD/SUB operands, negation-of-shift rewrites,
	/// global-address offset folding, ...) and returns the first replacement
	/// that applies, or an empty SDValue when none does.
	SDValue DAGCombiner::visitSUB(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  SDLoc DL(N);

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (sub x, 0) -> x, vector edition
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N0;
	  }

	  // fold (sub x, x) -> 0
	  // FIXME: Refactor this and xor and other similar operations together.
	  if (N0 == N1)
	    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes);
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
	    // fold (sub c1, c2) -> c1-c2
	    return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(),
	                                      N1.getNode());
	  }

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

	  // fold (sub x, c) -> (add x, -c)
	  if (N1C) {
	    return DAG.getNode(ISD::ADD, DL, VT, N0,
	                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
	  }

	  if (isNullConstantOrNullSplatConstant(N0)) {
	    unsigned BitWidth = VT.getScalarSizeInBits();
	    // Right-shifting everything out but the sign bit followed by negation is
	    // the same as flipping arithmetic/logical shift type without the negation:
	    // -(X >>u 31) -> (X >>s 31)
	    // -(X >>s 31) -> (X >>u 31)
	    if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
	      ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
	      if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) {
	        auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
	        if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
	          return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
	      }
	    }

	    // 0 - X --> 0 if the sub is NUW.
	    if (N->getFlags().hasNoUnsignedWrap())
	      return N0;

	    if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
	      // N1 is either 0 or the minimum signed value. If the sub is NSW, then
	      // N1 must be 0 because negating the minimum signed value is undefined.
	      if (N->getFlags().hasNoSignedWrap())
	        return N0;

	      // 0 - X --> X if X is 0 or the minimum signed value.
	      return N1;
	    }
	  }

	  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
	  if (isAllOnesConstantOrAllOnesSplatConstant(N0))
	    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

	  // fold A-(A-B) -> B
	  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
	    return N1.getOperand(1);

	  // fold (A+B)-A -> B
	  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
	    return N0.getOperand(1);

	  // fold (A+B)-B -> A
	  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
	    return N0.getOperand(0);

	  // fold C2-(A+C1) -> (C2-C1)-A
	  if (N1.getOpcode() == ISD::ADD) {
	    SDValue N11 = N1.getOperand(1);
	    if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
	        isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
	      SDValue NewC = DAG.getNode(ISD::SUB, DL, VT, N0, N11);
	      return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
	    }
	  }

	  // fold ((A+(B+or-C))-B) -> A+or-C
	  if (N0.getOpcode() == ISD::ADD &&
	      (N0.getOperand(1).getOpcode() == ISD::SUB ||
	       N0.getOperand(1).getOpcode() == ISD::ADD) &&
	      N0.getOperand(1).getOperand(0) == N1)
	    return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
	                       N0.getOperand(1).getOperand(1));

	  // fold ((A+(C+B))-B) -> A+C
	  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
	      N0.getOperand(1).getOperand(1) == N1)
	    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
	                       N0.getOperand(1).getOperand(0));

	  // fold ((A-(B-C))-C) -> A-B
	  if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
	      N0.getOperand(1).getOperand(1) == N1)
	    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
	                       N0.getOperand(1).getOperand(0));

	  // If either operand of a sub is undef, the result is undef
	  if (N0.isUndef())
	    return N0;
	  if (N1.isUndef())
	    return N1;

	  // If the relocation model supports it, consider symbol offsets.
	  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
	    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
	      // fold (sub Sym, c) -> Sym-c
	      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
	        return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
	                                    GA->getOffset() -
	                                        (uint64_t)N1C->getSExtValue());
	      // fold (sub Sym+c1, Sym+c2) -> c1-c2
	      if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
	        if (GA->getGlobal() == GB->getGlobal())
	          return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
	                                 DL, VT);
	    }

	  // sub X, (sextinreg Y i1) -> add X, (and Y 1)
	  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
	    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
	    if (TN->getVT() == MVT::i1) {
	      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
	                                 DAG.getConstant(1, DL, VT));
	      return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
	    }
	  }

	  return SDValue();
	}

	/// Combine a SUBC node (subtraction producing a glue borrow result).
	///
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitSUBC(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const EVT VT = LHS.getValueType();
	  SDLoc DL(N);

	  // Builds the "no borrow" glue value used by every fold below.
	  auto NoBorrow = [&]() {
	    return DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue);
	  };

	  // A dead flag result means an ordinary SUB is enough.
	  if (!N->hasAnyUseOfValue(1))
	    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, LHS, RHS), NoBorrow());

	  // (subc x, x) --> 0, with no borrow.
	  if (LHS == RHS)
	    return CombineTo(N, DAG.getConstant(0, DL, VT), NoBorrow());

	  // (subc x, 0) --> x, with no borrow.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, NoBorrow());

	  // (subc -1, x) --> ~x, i.e. (xor x, -1), with no borrow.
	  if (isAllOnesConstant(LHS))
	    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, RHS, LHS), NoBorrow());

	  return SDValue();
	}

	/// Combine a scalar USUBO node (unsigned subtract with an overflow result).
	///
	/// Vector-typed nodes are left alone. Returns the replacement value, or an
	/// empty SDValue when no fold applies.
	SDValue DAGCombiner::visitUSUBO(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const EVT VT = LHS.getValueType();
	  if (VT.isVector())
	    return SDValue();

	  const EVT CarryVT = N->getValueType(1);
	  SDLoc DL(N);

	  // Builds the constant-zero "no borrow" flag value.
	  auto NoBorrow = [&]() { return DAG.getConstant(0, DL, CarryVT); };

	  // The overflow bit is unused: emit a plain SUB and leave the flag undef.
	  if (!N->hasAnyUseOfValue(1)) {
	    SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, LHS, RHS);
	    return CombineTo(N, Diff, DAG.getUNDEF(CarryVT));
	  }

	  // (usubo x, x) --> 0, with no borrow.
	  if (LHS == RHS)
	    return CombineTo(N, DAG.getConstant(0, DL, VT), NoBorrow());

	  // (usubo x, 0) --> x, with no borrow.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, NoBorrow());

	  // (usubo -1, x) --> ~x, i.e. (xor x, -1), with no borrow.
	  if (isAllOnesConstant(LHS))
	    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, RHS, LHS), NoBorrow());

	  return SDValue();
	}

	/// Combine a SUBE node (subtraction with a glue borrow input).
	///
	/// The only fold is (sube x, y, false) --> (subc x, y); otherwise an empty
	/// SDValue is returned.
	SDValue DAGCombiner::visitSUBE(SDNode *N) {
	  SDValue BorrowIn = N->getOperand(2);
	  if (BorrowIn.getOpcode() != ISD::CARRY_FALSE)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), LHS, RHS);
	}

	/// Combine a SUBCARRY node (subtraction with a borrow input and output).
	///
	/// The only fold is (subcarry x, y, 0) --> (usubo x, y); otherwise an empty
	/// SDValue is returned.
	SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
	  SDValue BorrowIn = N->getOperand(2);
	  if (!isNullConstant(BorrowIn))
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), LHS, RHS);
	}

	/// Combine a MUL node.
	///
	/// Handles undef operands, constant folding, canonicalization of constants
	/// to the right-hand side, multiplication by 0 / 1 / -1 / (negated) powers
	/// of two, interactions with SHL and ADD operands, and reassociation.
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitMUL(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();

	  // fold (mul x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // For scalars these describe plain ConstantSDNode operands; for vectors
	  // they describe constant splats (the opaque flags stay false for vectors).
	  bool N0IsConst = false;
	  bool N1IsConst = false;
	  bool N1IsOpaqueConst = false;
	  bool N0IsOpaqueConst = false;
	  APInt ConstValue0, ConstValue1;
	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
	    N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
	    assert((!N0IsConst ||
	            ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) &&
	           "Splat APInt should be element width");
	    assert((!N1IsConst ||
	            ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
	           "Splat APInt should be element width");
	  } else {
	    N0IsConst = isa<ConstantSDNode>(N0);
	    if (N0IsConst) {
	      ConstValue0 = cast<ConstantSDNode>(N0)->getAPIntValue();
	      N0IsOpaqueConst = cast<ConstantSDNode>(N0)->isOpaque();
	    }
	    N1IsConst = isa<ConstantSDNode>(N1);
	    if (N1IsConst) {
	      ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
	      N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
	    }
	  }

	  // fold (mul c1, c2) -> c1*c2
	  if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst)
	    return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT,
	                                      N0.getNode(), N1.getNode());

	  // canonicalize constant to RHS (vector doesn't have to splat)
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
	  // fold (mul x, 0) -> 0
	  if (N1IsConst && ConstValue1.isNullValue())
	    return N1;
	  // fold (mul x, 1) -> x
	  if (N1IsConst && ConstValue1.isOneValue())
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (mul x, -1) -> 0-x
	  if (N1IsConst && ConstValue1.isAllOnesValue()) {
	    SDLoc DL(N);
	    return DAG.getNode(ISD::SUB, DL, VT,
	                       DAG.getConstant(0, DL, VT), N0);
	  }
	  // fold (mul x, (1 << c)) -> x << c
	  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
	      DAG.isKnownToBeAPowerOfTwo(N1) &&
	      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
	    SDLoc DL(N);
	    SDValue LogBase2 = BuildLogBase2(N1, DL);
	    AddToWorklist(LogBase2.getNode());

	    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
	    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
	    AddToWorklist(Trunc.getNode());
	    return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
	  }
	  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
	  if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
	    unsigned Log2Val = (-ConstValue1).logBase2();
	    SDLoc DL(N);
	    // FIXME: If the input is something that is easily negated (e.g. a
	    // single-use add), we should put the negate there.
	    return DAG.getNode(ISD::SUB, DL, VT,
	                       DAG.getConstant(0, DL, VT),
	                       DAG.getNode(ISD::SHL, DL, VT, N0,
	                                   DAG.getConstant(Log2Val, DL,
	                                       getShiftAmountTy(N0.getValueType()))));
	  }

	  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
	  if (N0.getOpcode() == ISD::SHL &&
	      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
	      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
	    SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
	    // Only use the shifted constant if it is itself a constant.
	    if (isConstantOrConstantVector(C3))
	      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
	  }

	  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
	  // use.
	  {
	    SDValue Sh(nullptr, 0), Y(nullptr, 0);

	    // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
	    if (N0.getOpcode() == ISD::SHL &&
	        isConstantOrConstantVector(N0.getOperand(1)) &&
	        N0.getNode()->hasOneUse()) {
	      Sh = N0; Y = N1;
	    } else if (N1.getOpcode() == ISD::SHL &&
	               isConstantOrConstantVector(N1.getOperand(1)) &&
	               N1.getNode()->hasOneUse()) {
	      Sh = N1; Y = N0;
	    }

	    if (Sh.getNode()) {
	      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
	      return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
	    }
	  }

	  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
	      N0.getOpcode() == ISD::ADD &&
	      DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
	      isMulAddWithConstProfitable(N, N0, N1))
	    return DAG.getNode(ISD::ADD, SDLoc(N), VT,
	                       DAG.getNode(ISD::MUL, SDLoc(N0), VT,
	                                   N0.getOperand(0), N1),
	                       DAG.getNode(ISD::MUL, SDLoc(N1), VT,
	                                   N0.getOperand(1), N1));

	  // reassociate mul
	  if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1))
	    return RMUL;

	  return SDValue();
	}

	/// Report whether the target provides a combined divide/remainder libcall
	/// for the result type of \p Node.
	///
	/// Only the simple integer types i8, i16, i32, i64 and i128 map to a
	/// libcall; \p isSigned picks the SDIVREM or UDIVREM flavour. Any other
	/// type yields false.
	static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
	                                     const TargetLowering &TLI) {
	  const EVT ResultVT = Node->getValueType(0);
	  if (!ResultVT.isSimple())
	    return false;

	  RTLIB::Libcall Call;
	  switch (ResultVT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	    Call = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
	    break;
	  case MVT::i16:
	    Call = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
	    break;
	  case MVT::i32:
	    Call = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
	    break;
	  case MVT::i64:
	    Call = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
	    break;
	  case MVT::i128:
	    Call = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
	    break;
	  default:
	    return false; // No libcall for vector types.
	  }

	  // The libcall counts as available when the target has a name for it.
	  return TLI.getLibcallName(Call) != nullptr;
	}

	/// Issue divrem if both quotient and remainder are needed.
	///
	/// \p Node is an SDIV/UDIV/SREM/UREM node. The users of its first operand
	/// are scanned for other div/rem/divrem nodes with the same two operands;
	/// matching DIV and REM users are replaced with the results of one shared
	/// [SU]DIVREM node. Returns that node's first result, or an empty SDValue
	/// when no combination was performed.
	SDValue DAGCombiner::useDivRem(SDNode *Node) {
	  if (Node->use_empty())
	    return SDValue(); // This is a dead node, leave it alone.

	  unsigned Opcode = Node->getOpcode();
	  bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
	  unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;

	  // DivMod lib calls can still work on non-legal types if using lib-calls.
	  EVT VT = Node->getValueType(0);
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
	    return SDValue();

	  // If DIVREM is going to get expanded into a libcall,
	  // but there is no libcall available, then don't combine.
	  if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
	      !isDivRemLibcallAvailable(Node, isSigned, TLI))
	    return SDValue();

	  // If div is legal, it's better to do the normal expansion
	  unsigned OtherOpcode = 0;
	  if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
	    OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
	    if (TLI.isOperationLegalOrCustom(Opcode, VT))
	      return SDValue();
	  } else {
	    OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
	    if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
	      return SDValue();
	  }

	  SDValue Op0 = Node->getOperand(0);
	  SDValue Op1 = Node->getOperand(1);
	  SDValue combined;
	  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
	         UE = Op0.getNode()->use_end(); UI != UE;) {
	    // Advance the iterator before the body runs, because CombineTo() below
	    // may rewrite the current user.
	    SDNode *User = *UI++;
	    if (User == Node || User->use_empty())
	      continue;
	    // Convert the other matching node(s), too;
	    // otherwise, the DIVREM may get target-legalized into something
	    // target-specific that we won't be able to recognize.
	    unsigned UserOpc = User->getOpcode();
	    if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
	        User->getOperand(0) == Op0 &&
	        User->getOperand(1) == Op1) {
	      if (!combined) {
	        if (UserOpc == OtherOpcode) {
	          SDVTList VTs = DAG.getVTList(VT, VT);
	          combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
	        } else if (UserOpc == DivRemOpc) {
	          combined = SDValue(User, 0);
	        } else {
	          // Another node of the same kind as Node: nothing to pair with yet.
	          assert(UserOpc == Opcode);
	          continue;
	        }
	      }
	      // Result #0 of the DIVREM is the quotient, result #1 the remainder.
	      if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
	        CombineTo(User, combined);
	      else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
	        CombineTo(User, combined.getValue(1));
	    }
	  }
	  return combined;
	}

	/// Simplifications shared by the integer division and remainder visitors.
	///
	/// Yields UNDEF when DAG.isUndef() says the operation on these operands is
	/// undefined, the constant 0 when the dividend is undef, and an empty
	/// SDValue otherwise.
	static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
	  SDValue Dividend = N->getOperand(0);
	  SDValue Divisor = N->getOperand(1);
	  const EVT ResultVT = N->getValueType(0);
	  SDLoc DL(N);

	  if (DAG.isUndef(N->getOpcode(), {Dividend, Divisor}))
	    return DAG.getUNDEF(ResultVT);

	  if (!Dividend.isUndef())
	    return SDValue();

	  // undef / X -> 0
	  // undef % X -> 0
	  return DAG.getConstant(0, DL, ResultVT);
	}

	/// Combine an SDIV node.
	///
	/// Handles constant folding, division by 1 / -1, strength reduction to UDIV
	/// when both sign bits are known zero, division by (negated) powers of two
	/// via shifts, the BuildSDIV expansion for constant divisors, and merging
	/// with a matching remainder into SDIVREM. Returns the replacement value, or
	/// an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitSDIV(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  SDLoc DL(N);

	  // fold (sdiv c1, c2) -> c1/c2
	  ConstantSDNode *N0C = isConstOrConstSplat(N0);
	  ConstantSDNode *N1C = isConstOrConstSplat(N1);
	  if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C);
	  // fold (sdiv X, 1) -> X
	  if (N1C && N1C->isOne())
	    return N0;
	  // fold (sdiv X, -1) -> 0-X
	  if (N1C && N1C->isAllOnesValue())
	    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);

	  if (SDValue V = simplifyDivRem(N, DAG))
	    return V;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // If we know the sign bits of both operands are zero, strength reduce to a
	  // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
	  if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
	    return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);

	  // fold (sdiv X, pow2) -> simple ops after legalize
	  // FIXME: We check for the exact bit here because the generic lowering gives
	  // better results in that case. The target-specific lowering should learn how
	  // to handle exact sdivs efficiently.
	  if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
	      !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() ||
	                                    (-N1C->getAPIntValue()).isPowerOf2())) {
	    // Target-specific implementation of sdiv x, pow2.
	    if (SDValue Res = BuildSDIVPow2(N))
	      return Res;

	    // The trailing-zero count is the same for a power of two and its
	    // negation, so this works for both signs of the divisor.
	    unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();

	    // Splat the sign bit into the register
	    SDValue SGN =
	        DAG.getNode(ISD::SRA, DL, VT, N0,
	                    DAG.getConstant(VT.getScalarSizeInBits() - 1, DL,
	                                    getShiftAmountTy(N0.getValueType())));
	    AddToWorklist(SGN.getNode());

	    // Add (N0 < 0) ? abs2 - 1 : 0;
	    SDValue SRL =
	        DAG.getNode(ISD::SRL, DL, VT, SGN,
	                    DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
	                                    getShiftAmountTy(SGN.getValueType())));
	    SDValue ADD = DAG.getNode(ISD::ADD, DL, VT, N0, SRL);
	    AddToWorklist(SRL.getNode());
	    AddToWorklist(ADD.getNode()); // Divide by pow2
	    SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD,
	                              DAG.getConstant(lg2, DL,
	                                  getShiftAmountTy(ADD.getValueType())));

	    // If we're dividing by a positive value, we're done. Otherwise, we must
	    // negate the result.
	    if (N1C->getAPIntValue().isNonNegative())
	      return SRA;

	    AddToWorklist(SRA.getNode());
	    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
	  }

	  // If integer divide is expensive and we satisfy the requirements, emit an
	  // alternate sequence. Targets may check function attributes for size/speed
	  // trade-offs.
	  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
	  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
	    if (SDValue Op = BuildSDIV(N))
	      return Op;

	  // sdiv, srem -> sdivrem
	  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
	  // true. Otherwise, we break the simplification logic in visitREM().
	  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
	    if (SDValue DivRem = useDivRem(N))
	      return DivRem;

	  return SDValue();
	}

	/// Combine a UDIV node.
	///
	/// Handles constant folding, division by a power of two (constant or of the
	/// form (shl pow2, y)) via logical right shifts, the BuildUDIV expansion for
	/// constant divisors, and merging with a matching remainder into UDIVREM.
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitUDIV(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  SDLoc DL(N);

	  // fold (udiv c1, c2) -> c1/c2
	  ConstantSDNode *N0C = isConstOrConstSplat(N0);
	  ConstantSDNode *N1C = isConstOrConstSplat(N1);
	  if (N0C && N1C)
	    if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT,
	                                                    N0C, N1C))
	      return Folded;

	  if (SDValue V = simplifyDivRem(N, DAG))
	    return V;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (udiv x, (1 << c)) -> x >>u c
	  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
	      DAG.isKnownToBeAPowerOfTwo(N1)) {
	    SDValue LogBase2 = BuildLogBase2(N1, DL);
	    AddToWorklist(LogBase2.getNode());

	    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
	    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
	    AddToWorklist(Trunc.getNode());
	    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
	  }

	  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
	  if (N1.getOpcode() == ISD::SHL) {
	    SDValue N10 = N1.getOperand(0);
	    if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
	        DAG.isKnownToBeAPowerOfTwo(N10)) {
	      SDValue LogBase2 = BuildLogBase2(N10, DL);
	      AddToWorklist(LogBase2.getNode());

	      // The sum is formed in the type of the existing shift amount.
	      EVT ADDVT = N1.getOperand(1).getValueType();
	      SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
	      AddToWorklist(Trunc.getNode());
	      SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
	      AddToWorklist(Add.getNode());
	      return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
	    }
	  }

	  // fold (udiv x, c) -> alternate
	  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
	  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
	    if (SDValue Op = BuildUDIV(N))
	      return Op;

	  // udiv, urem -> udivrem
	  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
	  // true. Otherwise, we break the simplification logic in visitREM().
	  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
	    if (SDValue DivRem = useDivRem(N))
	      return DivRem;

	  return SDValue();
	}

	/// Combine a remainder node; handles both ISD::SREM and ISD::UREM.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	SDValue DAGCombiner::visitREM(SDNode *N) {
	  const unsigned RemOpc = N->getOpcode();
	  const bool IsSignedRem = (RemOpc == ISD::SREM);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // fold (rem c1, c2) -> c1%c2
	  ConstantSDNode *N0C = isConstOrConstSplat(N0);
	  ConstantSDNode *N1C = isConstOrConstSplat(N1);
	  if (N0C && N1C) {
	    SDValue Folded = DAG.FoldConstantArithmetic(RemOpc, DL, VT, N0C, N1C);
	    if (Folded)
	      return Folded;
	  }

	  if (SDValue V = simplifyDivRem(N, DAG))
	    return V;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  if (IsSignedRem) {
	    // With both sign bits known zero the signed remainder equals the
	    // unsigned one. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
	    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
	      return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
	  } else {
	    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
	    // fold (urem x, pow2)          -> (and x, pow2-1)
	    // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
	    if (DAG.isKnownToBeAPowerOfTwo(N1) ||
	        (N1.getOpcode() == ISD::SHL &&
	         DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0)))) {
	      SDValue LowMask = DAG.getNode(ISD::ADD, DL, VT, N1, AllOnes);
	      AddToWorklist(LowMask.getNode());
	      return DAG.getNode(ISD::AND, DL, VT, N0, LowMask);
	    }
	  }

	  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

	  // When the division-by-constant logic can simplify X/C, rewrite X%C as
	  // X - (X/C)*C.
	  // The speculative DIV built below is run through combine(); that call must
	  // not turn it into a DIVREM or the nodes would get mangled. Skipping this
	  // path when isIntDivCheap() holds guarantees that, because combine only
	  // produces a DIVREM for a constant divisor when division is cheap. The
	  // cheapness check is also sensible on its own since the expansion yields
	  // fatter code.
	  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) {
	    const unsigned DivOpc = IsSignedRem ? ISD::SDIV : ISD::UDIV;
	    SDValue Div = DAG.getNode(DivOpc, DL, VT, N0, N1);
	    SDNode *DivNode = Div.getNode();
	    AddToWorklist(DivNode);
	    SDValue OptimizedDiv = combine(DivNode);
	    if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != DivNode) {
	      assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) &&
	             (OptimizedDiv.getOpcode() != ISD::SDIVREM));
	      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
	      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
	      AddToWorklist(Mul.getNode());
	      return Sub;
	    }
	  }

	  // sdiv, srem -> sdivrem
	  // The remainder is result #1 of the combined node.
	  if (SDValue DivRem = useDivRem(N))
	    return DivRem.getValue(1);

	  return SDValue();
	}

	/// Combine an ISD::MULHS node.
	///
	/// Folds multiplications by zero, one and undef, and otherwise rewrites the
	/// node as a double-width signed multiply followed by a shift and truncate
	/// when a MUL on the double-width integer type is legal.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	SDValue DAGCombiner::visitMULHS(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (VT.isVector()) {
	    // fold (mulhs x, 0) -> 0
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N0;
	  }

	  // fold (mulhs x, 0) -> 0
	  if (isNullConstant(N1))
	    return N1;
	  // fold (mulhs x, 1) -> (sra x, size(x)-1)
	  if (isOneConstant(N1))
	    return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
	                       DAG.getConstant(N0.getValueSizeInBits() - 1, DL,
	                                       getShiftAmountTy(N0.getValueType())));

	  // fold (mulhs x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, DL, VT);

	  // If the type twice as wide is legal, transform the mulhs to a wider multiply
	  // plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
	      N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
	      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
	      // Shift the upper half of the wide product down, then truncate it.
	      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(N1.getValueType())));
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    }
	  }

	  return SDValue();
	}

	/// Combine an ISD::MULHU node.
	///
	/// Folds multiplications by zero, one and undef to zero, and otherwise
	/// rewrites the node as a double-width unsigned multiply followed by a shift
	/// and truncate when a MUL on the double-width integer type is legal.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	SDValue DAGCombiner::visitMULHU(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (VT.isVector()) {
	    // fold (mulhu x, 0) -> 0
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N0;
	  }

	  // fold (mulhu x, 0) -> 0
	  if (isNullConstant(N1))
	    return N1;
	  // fold (mulhu x, 1) -> 0
	  if (isOneConstant(N1))
	    return DAG.getConstant(0, DL, N0.getValueType());
	  // fold (mulhu x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, DL, VT);

	  // If the type twice as wide is legal, transform the mulhu to a wider multiply
	  // plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
	      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
	      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
	      // Shift the upper half of the wide product down, then truncate it.
	      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(N1.getValueType())));
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    }
	  }

	  return SDValue();
	}

	/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
	/// give the opcodes for the two computations that are being performed.
	///
	/// \param N    the two-result node; result 0 is the low half, result 1 the
	///             high half.
	/// \param LoOp opcode of the single-result node computing the low half.
	/// \param HiOp opcode of the single-result node computing the high half.
	/// \returns the value produced by CombineTo if a simplification was made,
	///          otherwise an empty SDValue.
	SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
	                                                unsigned HiOp) {
	  // If the high half is not needed, just compute the low half.
	  bool HiExists = N->hasAnyUseOfValue(1);
	  if (!HiExists &&
	      (!LegalOperations ||
	       TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
	    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
	    return CombineTo(N, Res, Res);
	  }

	  // If the low half is not needed, just compute the high half.
	  bool LoExists = N->hasAnyUseOfValue(0);
	  if (!LoExists &&
	      (!LegalOperations ||
	       TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
	    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
	    return CombineTo(N, Res, Res);
	  }

	  // If both halves are used, return as it is.
	  if (LoExists && HiExists)
	    return SDValue();

	  // If the two computed results can be simplified separately, separate them.
	  if (LoExists) {
	    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
	    AddToWorklist(Lo.getNode());
	    SDValue LoOpt = combine(Lo.getNode());
	    // Only accept a result that differs from the speculative node.
	    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
	        (!LegalOperations ||
	         TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
	      return CombineTo(N, LoOpt, LoOpt);
	  }

	  if (HiExists) {
	    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
	    AddToWorklist(Hi.getNode());
	    SDValue HiOpt = combine(Hi.getNode());
	    // Only accept a result that differs from the speculative node.
	    if (HiOpt.getNode() && HiOpt != Hi &&
	        (!LegalOperations ||
	         TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
	      return CombineTo(N, HiOpt, HiOpt);
	  }

	  return SDValue();
	}

	/// Combine an ISD::SMUL_LOHI node.
	///
	/// First tries SimplifyNodeWithTwoResults with MUL / MULHS. Otherwise, when a
	/// MUL on the double-width integer type is legal, computes both halves from
	/// one sign-extended wide multiply.
	///
	/// \returns the value produced by the simplification or by CombineTo, or an
	///          empty SDValue if no fold applied.
	SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
	  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
	    return Res;

	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // If the type that is twice as wide is legal, transform the smul_lohi to a
	  // wider multiply plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
	      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
	      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
	      // Compute the high part: upper half of the wide product.
	      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(Lo.getValueType())));
	      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
	      // Compute the low part: lower half of the wide product.
	      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
	      return CombineTo(N, Lo, Hi);
	    }
	  }

	  return SDValue();
	}

	/// Combine an ISD::UMUL_LOHI node.
	///
	/// First tries SimplifyNodeWithTwoResults with MUL / MULHU. Otherwise, when a
	/// MUL on the double-width integer type is legal, computes both halves from
	/// one zero-extended wide multiply.
	///
	/// \returns the value produced by the simplification or by CombineTo, or an
	///          empty SDValue if no fold applied.
	SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
	  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
	    return Res;

	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // If the type that is twice as wide is legal, transform the umul_lohi to a
	  // wider multiply plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
	      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
	      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
	      // Compute the high part: upper half of the wide product.
	      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(Lo.getValueType())));
	      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
	      // Compute the low part: lower half of the wide product.
	      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
	      return CombineTo(N, Lo, Hi);
	    }
	  }

	  return SDValue();
	}

	/// Combine an ISD::SMULO node: (smulo x, 2) -> (saddo x, x).
	///
	/// \returns the SADDO replacement, or an empty SDValue when the second
	///          operand is not the constant 2.
	SDValue DAGCombiner::visitSMULO(SDNode *N) {
	  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!RHSC || RHSC->getAPIntValue() != 2)
	    return SDValue();

	  SDValue X = N->getOperand(0);
	  return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(), X, X);
	}

	/// Combine an ISD::UMULO node: (umulo x, 2) -> (uaddo x, x).
	///
	/// \returns the UADDO replacement, or an empty SDValue when the second
	///          operand is not the constant 2.
	SDValue DAGCombiner::visitUMULO(SDNode *N) {
	  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!RHSC || RHSC->getAPIntValue() != 2)
	    return SDValue();

	  SDValue X = N->getOperand(0);
	  return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), X, X);
	}

	/// Combine an integer min/max node: simplify vector forms, constant-fold when
	/// both operands are non-opaque constants, and move a constant left operand to
	/// the right-hand side.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
	  const unsigned Opc = N->getOpcode();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;
	  }

	  // Both operands constant: the result of the fold is returned as-is, even
	  // when it is empty.
	  ConstantSDNode *LHSC = getAsNonOpaqueConstant(LHS);
	  ConstantSDNode *RHSC = getAsNonOpaqueConstant(RHS);
	  if (LHSC && RHSC)
	    return DAG.FoldConstantArithmetic(Opc, SDLoc(N), VT, LHSC, RHSC);

	  // Nothing to canonicalize unless only the left operand is constant.
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(LHS) ||
	      DAG.isConstantIntBuildVectorOrConstantInt(RHS))
	    return SDValue();

	  // canonicalize constant to RHS
	  return DAG.getNode(Opc, SDLoc(N), VT, RHS, LHS);
	}

	/// If this is a binary operator with two operands of the same opcode, try to
	/// simplify it.
	///
	/// The caller must guarantee N0.getOpcode() == N1.getOpcode() (asserted).
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
	  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");

	  // Bail early if none of these transforms apply.
	  if (N0.getNumOperands() == 0) return SDValue();

	  // For each of OP in AND/OR/XOR:
	  // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
	  // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
	  // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
	  // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
	  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
	  //
	  // do not sink logical op inside of a vector extend, since it may combine
	  // into a vsetcc.
	  EVT Op0VT = N0.getOperand(0).getValueType();
	  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
	       N0.getOpcode() == ISD::SIGN_EXTEND ||
	       N0.getOpcode() == ISD::BSWAP ||
	       // Avoid infinite looping with PromoteIntBinOp.
	       (N0.getOpcode() == ISD::ANY_EXTEND &&
	        (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
	       (N0.getOpcode() == ISD::TRUNCATE &&
	        (!TLI.isZExtFree(VT, Op0VT) ||
	         !TLI.isTruncateFree(Op0VT, VT)) &&
	        TLI.isTypeLegal(Op0VT))) &&
	      !VT.isVector() &&
	      Op0VT == N1.getOperand(0).getValueType() &&
	      (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
	    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
	                                 N0.getOperand(0).getValueType(),
	                                 N0.getOperand(0), N1.getOperand(0));
	    AddToWorklist(ORNode.getNode());
	    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
	  }

	  // For each of OP in SHL/SRL/SRA/AND...
	  // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
	  // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z)
	  // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
	  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
	       N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
	      N0.getOperand(1) == N1.getOperand(1)) {
	    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
	                                 N0.getOperand(0).getValueType(),
	                                 N0.getOperand(0), N1.getOperand(0));
	    AddToWorklist(ORNode.getNode());
	    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
	                       ORNode, N0.getOperand(1));
	  }

	  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
	  // Only perform this optimization up until type legalization, before
	  // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
	  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
	  // we don't want to undo this promotion.
	  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
	  // on scalars.
	  if ((N0.getOpcode() == ISD::BITCAST ||
	       N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
	      Level <= AfterLegalizeTypes) {
	    SDValue In0 = N0.getOperand(0);
	    SDValue In1 = N1.getOperand(0);
	    EVT In0Ty = In0.getValueType();
	    EVT In1Ty = In1.getValueType();
	    SDLoc DL(N);
	    // If both incoming values are integers, and the original types are the
	    // same.
	    if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
	      SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
	      SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
	      AddToWorklist(Op.getNode());
	      return BC;
	    }
	  }

	  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
	  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
	  // If both shuffles use the same mask, and both shuffle within a single
	  // vector, then it is worthwhile to move the swizzle after the operation.
	  // The type-legalizer generates this pattern when loading illegal
	  // vector types from memory. In many cases this allows additional shuffle
	  // optimizations.
	  // There are other cases where moving the shuffle after the xor/and/or
	  // is profitable even if shuffles don't perform a swizzle.
	  // If both shuffles use the same mask, and both shuffles have the same first
	  // or second operand, then it might still be profitable to move the shuffle
	  // after the xor/and/or operation.
	  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
	    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
	    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);

	    assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
	           "Inputs to shuffles are not the same type");

	    // Check that both shuffles use the same mask. The masks are known to be of
	    // the same length because the result vector type is the same.
	    // Check also that shuffles have only one use to avoid introducing extra
	    // instructions.
	    if (SVN0->hasOneUse() && SVN1->hasOneUse() &&
	        SVN0->getMask().equals(SVN1->getMask())) {
	      SDValue ShOp = N0->getOperand(1);

	      // Don't try to fold this node if it requires introducing a
	      // build vector of all zeros that might be illegal at this stage.
	      if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
	        if (!LegalTypes)
	          ShOp = DAG.getConstant(0, SDLoc(N), VT);
	        else
	          ShOp = SDValue();
	      }

	      // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
	      // (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C)
	      // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
	      if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
	        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
	                                      N0->getOperand(0), N1->getOperand(0));
	        AddToWorklist(NewNode.getNode());
	        return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
	                                    SVN0->getMask());
	      }

	      // Don't try to fold this node if it requires introducing a
	      // build vector of all zeros that might be illegal at this stage.
	      ShOp = N0->getOperand(0);
	      if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
	        if (!LegalTypes)
	          ShOp = DAG.getConstant(0, SDLoc(N), VT);
	        else
	          ShOp = SDValue();
	      }

	      // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
	      // (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B))
	      // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
	      if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
	        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
	                                      N0->getOperand(1), N1->getOperand(1));
	        AddToWorklist(NewNode.getNode());
	        return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
	                                    SVN0->getMask());
	      }
	    }
	  }

	  return SDValue();
	}

	/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
	///
	/// \param IsAnd true when the logic op combining the compares is an AND,
	///              false when it is an OR.
	/// \param N0    left operand of the logic op.
	/// \param N1    right operand of the logic op.
	/// \param DL    debug location used for the newly created nodes.
	/// \returns the replacement value, or an empty SDValue when either operand
	///          is not setcc-equivalent or no fold applied.
	SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
	                                       const SDLoc &DL) {
	  SDValue LL, LR, RL, RR, N0CC, N1CC;
	  if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
	      !isSetCCEquivalent(N1, RL, RR, N1CC))
	    return SDValue();

	  assert(N0.getValueType() == N1.getValueType() &&
	         "Unexpected operand types for bitwise logic op");
	  assert(LL.getValueType() == LR.getValueType() &&
	         RL.getValueType() == RR.getValueType() &&
	         "Unexpected operand types for setcc");

	  // If we're here post-legalization or the logic op type is not i1, the logic
	  // op type must match a setcc result type. Also, all folds require new
	  // operations on the left and right operands, so those types must match.
	  EVT VT = N0.getValueType();
	  EVT OpVT = LL.getValueType();
	  if (LegalOperations || VT != MVT::i1)
	    if (VT != getSetCCResultType(OpVT))
	      return SDValue();
	  if (OpVT != RL.getValueType())
	    return SDValue();

	  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
	  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
	  bool IsInteger = OpVT.isInteger();
	  if (LR == RR && CC0 == CC1 && IsInteger) {
	    bool IsZero = isNullConstantOrNullSplatConstant(LR);
	    bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);

	    // All bits clear?
	    bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
	    // All sign bits clear?
	    bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
	    // Any bits set?
	    bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
	    // Any sign bits set?
	    bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;

	    // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
	    // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
	    // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
	    // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
	    if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
	      SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
	      AddToWorklist(Or.getNode());
	      return DAG.getSetCC(DL, VT, Or, LR, CC1);
	    }

	    // All bits set?
	    bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
	    // All sign bits set?
	    bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
	    // Any bits clear?
	    bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
	    // Any sign bits clear?
	    bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;

	    // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
	    // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
	    // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
	    // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
	    if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
	      SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
	      AddToWorklist(And.getNode());
	      return DAG.getSetCC(DL, VT, And, LR, CC1);
	    }
	  }

	  // TODO: What is the 'or' equivalent of this fold?
	  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
	  if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
	      IsInteger && CC0 == ISD::SETNE &&
	      ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
	       (isAllOnesConstant(LR) && isNullConstant(RR)))) {
	    SDValue One = DAG.getConstant(1, DL, OpVT);
	    SDValue Two = DAG.getConstant(2, DL, OpVT);
	    SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
	    AddToWorklist(Add.getNode());
	    return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
	  }

	  // Try more general transforms if the predicates match and the only user of
	  // the compares is the 'and' or 'or'.
	  if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
	      N0.hasOneUse() && N1.hasOneUse()) {
	    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
	    // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
	    if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
	      SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
	      SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
	      SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
	      SDValue Zero = DAG.getConstant(0, DL, OpVT);
	      return DAG.getSetCC(DL, VT, Or, Zero, CC1);
	    }
	  }

	  // Canonicalize equivalent operands to LL == RL.
	  if (LL == RR && LR == RL) {
	    CC1 = ISD::getSetCCSwappedOperands(CC1);
	    std::swap(RL, RR);
	  }

	  // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
	  // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
	  if (LL == RL && LR == RR) {
	    ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger)
	                                : ISD::getSetCCOrOperation(CC0, CC1, IsInteger);
	    if (NewCC != ISD::SETCC_INVALID &&
	        (!LegalOperations ||
	         (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
	          TLI.isOperationLegal(ISD::SETCC, OpVT))))
	      return DAG.getSetCC(DL, VT, LL, LR, NewCC);
	  }

	  return SDValue();
	}

	/// This contains all DAGCombine rules which reduce two values combined by
	/// an And operation to a single value. This makes them reusable in the context
	/// of visitSELECT(). Rules involving constants are not included as
	/// visitSELECT() already handles those cases.
	///
	/// \param N0 first value being and-ed.
	/// \param N1 second value being and-ed.
	/// \param N  node supplying the debug location; it is also the value
	///           returned (as result 0) when the add-immediate rewrite replaces
	///           N0 in place.
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
	  EVT VT = N1.getValueType();
	  SDLoc DL(N);

	  // fold (and x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, DL, VT);

	  if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
	    return V;

	  if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
	      VT.getSizeInBits() <= 64) {
	    if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	      if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
	        // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
	        // immediate for an add, but it is legal if its top c2 bits are set,
	        // transform the ADD so the immediate doesn't need to be materialized
	        // in a register.
	        APInt ADDC = ADDI->getAPIntValue();
	        APInt SRLC = SRLI->getAPIntValue();
	        if (ADDC.getMinSignedBits() <= 64 &&
	            SRLC.ult(VT.getSizeInBits()) &&
	            !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
	          APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
	                                             SRLC.getZExtValue());
	          if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
	            ADDC |= Mask;
	            if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
	              SDLoc DL0(N0);
	              SDValue NewAdd =
	                DAG.getNode(ISD::ADD, DL0, VT,
	                            N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
	              CombineTo(N0.getNode(), NewAdd);
	              // Return N so it doesn't get rechecked!
	              return SDValue(N, 0);
	            }
	          }
	        }
	      }
	    }
	  }

	  // Reduce bit extract of low half of an integer to the narrower type.
	  // (and (srl i64:x, K), KMask) ->
	  // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
	  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
	    if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
	      if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	        unsigned Size = VT.getSizeInBits();
	        const APInt &AndMask = CAnd->getAPIntValue();
	        unsigned ShiftBits = CShift->getZExtValue();

	        // Bail out, this node will probably disappear anyway.
	        if (ShiftBits == 0)
	          return SDValue();

	        unsigned MaskBits = AndMask.countTrailingOnes();
	        EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);

	        if (AndMask.isMask() &&
	            // Required bits must not span the two halves of the integer and
	            // must fit in the half size type.
	            (ShiftBits + MaskBits <= Size / 2) &&
	            TLI.isNarrowingProfitable(VT, HalfVT) &&
	            TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
	            TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
	            TLI.isTruncateFree(VT, HalfVT) &&
	            TLI.isZExtFree(HalfVT, VT)) {
	          // The isNarrowingProfitable is to avoid regressions on PPC and
	          // AArch64 which match a few 64-bit bit insert / bit extract patterns
	          // on downstream users of this. Those patterns could probably be
	          // extended to handle extensions mixed in.

	          // SL is only ever used as a debug location, so declare it as one.
	          SDLoc SL(N0);
	          assert(MaskBits <= Size);

	          // Extracting the highest bit of the low half.
	          EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
	          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
	                                      N0.getOperand(0));

	          SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
	          SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
	          SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
	          SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
	          return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
	        }
	      }
	    }
	  }

	  return SDValue();
	}

	bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode AndC, LoadSDNode LoadN,
	EVT LoadResultTy, EVT &ExtVT) {
	if (!AndC->getAPIntValue().isMask())
	return false;

	unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();

	ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
	EVT LoadedVT = LoadN->getMemoryVT();

	if (ExtVT == LoadedVT &&
	(!LegalOperations \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
	// ZEXTLOAD will match without needing to change the size of the value being
	// loaded.
	return true;
	}

	// Do not change the width of a volatile load.
	if (LoadN->isVolatile())
	return false;

	// Do not generate loads of non-round integer types since these can
	// be expensive (and would be wrong if the type is not byte sized).
	if (!LoadedVT.bitsGT(ExtVT) \|\| !ExtVT.isRound())
	return false;

	if (LegalOperations &&
	!TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
	return false;

	if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
	return false;

	return true;
	}

	/// Check whether LoadN may be replaced by a narrower load of type ExtVT.
	///
	/// \param LoadN   the load to be narrowed.
	/// \param ExtType extension kind the narrowed load would use.
	/// \param ExtVT   memory type of the narrowed load; only read here.
	/// \param ShAmt   added to the width of ExtVT when checking that an existing
	///                extending load still covers the requested bits.
	/// \returns true if none of the legality checks below reject the narrowing.
	bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
	                                    EVT &ExtVT, unsigned ShAmt) {
	  // Don't transform one with multiple uses, this would require adding a new
	  // load.
	  if (!SDValue(LoadN, 0).hasOneUse())
	    return false;

	  if (LegalOperations &&
	      !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT))
	    return false;

	  // Do not generate loads of non-round integer types since these can
	  // be expensive (and would be wrong if the type is not byte sized).
	  if (!ExtVT.isRound())
	    return false;

	  // Don't change the width of a volatile load.
	  if (LoadN->isVolatile())
	    return false;

	  // Verify that we are actually reducing a load width here.
	  if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits())
	    return false;

	  // For the transform to be legal, the load must produce only two values
	  // (the value loaded and the chain). Don't transform a pre-increment
	  // load, for example, which produces an extra value. Otherwise the
	  // transformation is not equivalent, and the downstream logic to replace
	  // uses gets things wrong.
	  if (LoadN->getNumValues() > 2)
	    return false;

	  // If the load that we're shrinking is an extload and we're not just
	  // discarding the extension we can't simply shrink the load. Bail.
	  // TODO: It would be possible to merge the extensions in some cases.
	  if (LoadN->getExtensionType() != ISD::NON_EXTLOAD &&
	      LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
	    return false;

	  if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT))
	    return false;

	  // It's not possible to generate a constant of extended or untyped type.
	  EVT PtrType = LoadN->getOperand(1).getValueType();
	  if (PtrType == MVT::Untyped || PtrType.isExtended())
	    return false;

	  return true;
	}

	// Walk the operands of N, recursing through AND/OR/XOR operands, to decide
	// whether Mask can be pushed back onto the loads that feed N.
	//  - Loads: receives the loads accepted by isAndLoadExtLoad and
	//    isLegalNarrowLoad that pass the width checks below.
	//  - NodesWithConsts: receives OR/XOR nodes that have a constant operand
	//    with bits set outside Mask.
	//  - NodeToMask: receives the single operand (at most one across the whole
	//    search) that none of the cases handles.
	// Returns false on a vector operand, a non-constant operand with more than
	// one use, a load that fails the checks, or a second unhandled operand.
	bool DAGCombiner::SearchForAndLoads(SDNode *N,
	SmallPtrSetImpl<LoadSDNode*> &Loads,
	SmallPtrSetImpl<SDNode*> &NodesWithConsts,
	ConstantSDNode *Mask,
	SDNode *&NodeToMask) {
	// Recursively search for the operands, looking for loads which can be
	// narrowed.
	for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
	SDValue Op = N->getOperand(i);

	if (Op.getValueType().isVector())
	return false;

	// Some constants may need fixing up later if they are too large.
	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	if ((N->getOpcode() == ISD::OR \|\| N->getOpcode() == ISD::XOR) &&
	(Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
	NodesWithConsts.insert(N);
	continue;
	}

	if (!Op.hasOneUse())
	return false;

	switch(Op.getOpcode()) {
	case ISD::LOAD: {
	auto *Load = cast<LoadSDNode>(Op);
	EVT ExtVT;
	if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
	isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
	- // Only add this load if we can make it more narrow.
	- if (ExtVT.bitsLT(Load->getMemoryVT()))
	+
	+ // ZEXTLOAD is already small enough.
	+ if (Load->getExtensionType() == ISD::ZEXTLOAD &&
	+ ExtVT.bitsGE(Load->getMemoryVT()))
	+ continue;
	+
	+ // Use LE to convert equal sized loads to zext.
	+ if (ExtVT.bitsLE(Load->getMemoryVT()))
	Loads.insert(Load);
	+
	continue;
	}
	return false;
	}
	case ISD::ZERO_EXTEND:
	case ISD::AssertZext: {
	unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
	EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
	EVT VT = Op.getOpcode() == ISD::AssertZext ?
	cast<VTSDNode>(Op.getOperand(1))->getVT() :
	Op.getOperand(0).getValueType();

	// We can accept extending nodes if the mask is wider or an equal
	// width to the original type.
	if (ExtVT.bitsGE(VT))
	continue;
	// Otherwise fall out of the switch so Op is treated as the node to mask.
	break;
	}
	case ISD::OR:
	case ISD::XOR:
	case ISD::AND:
	if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
	NodeToMask))
	return false;
	continue;
	}

	// Allow one node which will be masked along with any loads found.
	if (NodeToMask)
	return false;
	NodeToMask = Op.getNode();
	}
	return true;
	}

	// Try to push the constant mask of the AND node N back onto the loads found
	// by SearchForAndLoads, so that each of them can be narrowed through
	// ReduceLoadWidth.
	// Returns true after the uses of N have been replaced with its first operand;
	// returns false, without modifying the DAG, when operand 1 is not a constant
	// low-bit mask, when operand 0 is itself a load, or when the search fails or
	// finds no loads.
	bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
	auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!Mask)
	return false;

	if (!Mask->getAPIntValue().isMask())
	return false;

	// No need to do anything if the and directly uses a load.
	if (isa<LoadSDNode>(N->getOperand(0)))
	return false;

	SmallPtrSet<LoadSDNode*, 8> Loads;
	SmallPtrSet<SDNode*, 2> NodesWithConsts;
	SDNode *FixupNode = nullptr;
	if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
	if (Loads.size() == 0)
	return false;

	+ DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
	SDValue MaskOp = N->getOperand(1);

	// If it exists, fixup the single node we allow in the tree that needs
	// masking.
	if (FixupNode) {
	+ DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
	SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
	FixupNode->getValueType(0),
	SDValue(FixupNode, 0), MaskOp);
	DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
	// The replacement above also rewrote the new AND's own operand, so point
	// it back at FixupNode.
	DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0),
	MaskOp);
	}

	// Narrow any constants that need it.
	for (auto *LogicN : NodesWithConsts) {
	- auto *C = cast<ConstantSDNode>(LogicN->getOperand(1));
	- SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0),
	- SDValue(C, 0), MaskOp);
	- DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And);
	+ SDValue Op0 = LogicN->getOperand(0);
	+ SDValue Op1 = LogicN->getOperand(1);
	+
	+ if (isa<ConstantSDNode>(Op0))
	+ std::swap(Op0, Op1);
	+
	+ SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
	+ Op1, MaskOp);
	+
	+ DAG.UpdateNodeOperands(LogicN, Op0, And);
	}

	// Create narrow loads.
	for (auto *Load : Loads) {
	+ DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
	SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
	SDValue(Load, 0), MaskOp);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
	// As with the fixup node, restore the new AND's operand to the load.
	DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp);
	SDValue NewLoad = ReduceLoadWidth(And.getNode());
	assert(NewLoad &&
	"Shouldn't be masking the load if it can't be narrowed");
	CombineTo(Load, NewLoad, NewLoad.getValue(1));
	}
	// The mask now lives on the leaves, so N is replaced by its first operand.
	DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
	return true;
	}
	return false;
	}

	/// Combine an ISD::AND node.
	///
	/// The folds below are tried in order and the first one that applies wins.
	/// The return value is either a replacement value for \p N, SDValue(N, 0)
	/// when \p N has already been handled in place (e.g. via CombineTo), or an
	/// empty SDValue when nothing applies.
	SDValue DAGCombiner::visitAND(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N1.getValueType();

	  // x & x --> x
	  if (N0 == N1)
	    return N0;

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (and x, 0) -> 0, vector edition
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      // do not return N0, because undef node may exist in N0
	      return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
	                             SDLoc(N), N0.getValueType());
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      // do not return N1, because undef node may exist in N1
	      return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
	                             SDLoc(N), N1.getValueType());

	    // fold (and x, -1) -> x, vector edition
	    if (ISD::isBuildVectorAllOnes(N0.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllOnes(N1.getNode()))
	      return N0;
	  }

	  // fold (and c1, c2) -> c1&c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  ConstantSDNode *N1C = isConstOrConstSplat(N1);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
	  // canonicalize constant to RHS
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
	  // fold (and x, -1) -> x
	  if (isAllOnesConstant(N1))
	    return N0;
	  // if (and x, c) is known to be zero, return 0
	  unsigned BitWidth = VT.getScalarSizeInBits();
	  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
	                                   APInt::getAllOnesValue(BitWidth)))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // reassociate and
	  if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
	    return RAND;

	  // Try to convert a constant mask AND into a shuffle clear mask.
	  if (VT.isVector())
	    if (SDValue Shuffle = XformToShuffleWithZero(N))
	      return Shuffle;

	  // fold (and (or x, C), D) -> D if (C & D) == D
	  // The predicate receives the constants by pointer; it is dereferenced with
	  // '->' below.
	  auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
	    return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
	  };
	  if (N0.getOpcode() == ISD::OR &&
	      matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
	    return N1;
	  // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
	  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
	    SDValue N0Op0 = N0.getOperand(0);
	    APInt Mask = ~N1C->getAPIntValue();
	    Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
	    if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
	      SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
	                                 N0.getValueType(), N0Op0);

	      // Replace uses of the AND with uses of the Zero extend node.
	      CombineTo(N, Zext);

	      // We actually want to replace all uses of the any_extend with the
	      // zero_extend, to avoid duplicating things. This will later cause this
	      // AND to be folded.
	      CombineTo(N0.getNode(), Zext);
	      return SDValue(N, 0); // Return N so it doesn't get rechecked!
	    }
	  }
	  // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
	  // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
	  // already be zero by virtue of the width of the base type of the load.
	  //
	  // the 'X' node here can either be nothing or an extract_vector_elt to catch
	  // more cases.
	  if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	       N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
	       N0.getOperand(0).getOpcode() == ISD::LOAD &&
	       N0.getOperand(0).getResNo() == 0) ||
	      (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
	    LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
	                                         N0 : N0.getOperand(0) );

	    // Get the constant (if applicable) the zero'th operand is being ANDed with.
	    // This can be a pure constant or a vector splat, in which case we treat the
	    // vector as a scalar and use the splat value.
	    APInt Constant = APInt::getNullValue(1);
	    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
	      Constant = C->getAPIntValue();
	    } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
	      APInt SplatValue, SplatUndef;
	      unsigned SplatBitSize;
	      bool HasAnyUndefs;
	      bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
	                                             SplatBitSize, HasAnyUndefs);
	      if (IsSplat) {
	        // Undef bits can contribute to a possible optimisation if set, so
	        // set them.
	        SplatValue |= SplatUndef;

	        // The splat value may be something like "0x00FFFFFF", which means 0 for
	        // the first vector value and FF for the rest, repeating. We need a mask
	        // that will apply equally to all members of the vector, so AND all the
	        // lanes of the constant together.
	        // (Named LaneBits rather than BitWidth so the function-level BitWidth
	        // and VT are not shadowed.)
	        unsigned LaneBits = Vector->getValueType(0).getScalarSizeInBits();

	        // If the splat value has been compressed to a bitlength lower
	        // than the size of the vector lane, we need to re-expand it to
	        // the lane size.
	        if (LaneBits > SplatBitSize)
	          for (SplatValue = SplatValue.zextOrTrunc(LaneBits);
	               SplatBitSize < LaneBits;
	               SplatBitSize = SplatBitSize * 2)
	            SplatValue |= SplatValue.shl(SplatBitSize);

	        // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
	        // multiple of 'LaneBits'. Otherwise, we could propagate a wrong value.
	        if (SplatBitSize % LaneBits == 0) {
	          Constant = APInt::getAllOnesValue(LaneBits);
	          for (unsigned i = 0, n = SplatBitSize/LaneBits; i < n; ++i)
	            Constant &= SplatValue.lshr(i*LaneBits).zextOrTrunc(LaneBits);
	        }
	      }
	    }

	    // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
	    // actually legal and isn't going to get expanded, else this is a false
	    // optimisation.
	    bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
	                                                    Load->getValueType(0),
	                                                    Load->getMemoryVT());

	    // Resize the constant to the same size as the original memory access before
	    // extension. If it is still the AllOnesValue then this AND is completely
	    // unneeded.
	    Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());

	    // B: whether the load's extension kind permits dropping the AND.
	    bool B;
	    switch (Load->getExtensionType()) {
	    default: B = false; break;
	    case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
	    case ISD::ZEXTLOAD:
	    case ISD::NON_EXTLOAD: B = true; break;
	    }

	    if (B && Constant.isAllOnesValue()) {
	      // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
	      // preserve semantics once we get rid of the AND.
	      SDValue NewLoad(Load, 0);

	      // Fold the AND away. NewLoad may get replaced immediately.
	      CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);

	      if (Load->getExtensionType() == ISD::EXTLOAD) {
	        NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
	                              Load->getValueType(0), SDLoc(Load),
	                              Load->getChain(), Load->getBasePtr(),
	                              Load->getOffset(), Load->getMemoryVT(),
	                              Load->getMemOperand());
	        // Replace uses of the EXTLOAD with the new ZEXTLOAD.
	        if (Load->getNumValues() == 3) {
	          // PRE/POST_INC loads have 3 values.
	          SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
	                           NewLoad.getValue(2) };
	          CombineTo(Load, To, 3, true);
	        } else {
	          CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
	        }
	      }

	      return SDValue(N, 0); // Return N so it doesn't get rechecked!
	    }
	  }

	  // fold (and (load x), 255) -> (zextload x, i8)
	  // fold (and (extload x, i16), 255) -> (zextload x, i8)
	  // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
	  if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
	                                (N0.getOpcode() == ISD::ANY_EXTEND &&
	                                 N0.getOperand(0).getOpcode() == ISD::LOAD))) {
	    if (SDValue Res = ReduceLoadWidth(N)) {
	      LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
	        ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);

	      AddToWorklist(N);
	      CombineTo(LN0, Res, Res.getValue(1));
	      return SDValue(N, 0);
	    }
	  }

	  if (Level >= AfterLegalizeTypes) {
	    // Attempt to propagate the AND back up to the leaves which, if they're
	    // loads, can be combined to narrow loads and the AND node can be removed.
	    // Perform after legalization so that extend nodes will already be
	    // combined into the loads.
	    if (BackwardsPropagateMask(N, DAG)) {
	      return SDValue(N, 0);
	    }
	  }

	  if (SDValue Combined = visitANDLike(N0, N1, N))
	    return Combined;

	  // Simplify: (and (op x...), (op y...)) -> (op (and x, y))
	  if (N0.getOpcode() == N1.getOpcode())
	    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
	      return Tmp;

	  // Masking the negated extension of a boolean is just the zero-extended
	  // boolean:
	  // and (sub 0, zext(bool X)), 1 --> zext(bool X)
	  // and (sub 0, sext(bool X)), 1 --> zext(bool X)
	  //
	  // Note: the SimplifyDemandedBits fold below can make an information-losing
	  // transform, and then we have no way to find this better fold.
	  if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
	    if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) {
	      SDValue SubRHS = N0.getOperand(1);
	      if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
	          SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
	        return SubRHS;
	      if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
	          SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
	        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
	    }
	  }

	  // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
	  // fold (and (sra)) -> (and (srl)) when possible.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // fold (zext_inreg (extload x)) -> (zextload x)
	  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    EVT MemVT = LN0->getMemoryVT();
	    // If we zero all the possible extended bits, then we can turn this into
	    // a zextload if we are running before legalize or the operation is legal.
	    unsigned BitWidth = N1.getScalarValueSizeInBits();
	    if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
	                                BitWidth - MemVT.getScalarSizeInBits())) &&
	        ((!LegalOperations && !LN0->isVolatile()) ||
	         TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
	      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
	                                       LN0->getChain(), LN0->getBasePtr(),
	                                       MemVT, LN0->getMemOperand());
	      AddToWorklist(N);
	      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	      return SDValue(N, 0); // Return N so it doesn't get rechecked!
	    }
	  }
	  // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
	  if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	      N0.hasOneUse()) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    EVT MemVT = LN0->getMemoryVT();
	    // If we zero all the possible extended bits, then we can turn this into
	    // a zextload if we are running before legalize or the operation is legal.
	    unsigned BitWidth = N1.getScalarValueSizeInBits();
	    if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
	                                BitWidth - MemVT.getScalarSizeInBits())) &&
	        ((!LegalOperations && !LN0->isVolatile()) ||
	         TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
	      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
	                                       LN0->getChain(), LN0->getBasePtr(),
	                                       MemVT, LN0->getMemOperand());
	      AddToWorklist(N);
	      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	      return SDValue(N, 0); // Return N so it doesn't get rechecked!
	    }
	  }
	  // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
	  if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
	    if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
	                                           N0.getOperand(1), false))
	      return BSwap;
	  }

	  return SDValue();
	}

	/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
	///
	/// \p N supplies the result type and debug location; \p N0 and \p N1 are the
	/// two OR operands to inspect (in either order, each optionally wrapped in or
	/// fed by a 0xFF / 0xFF00 mask). When \p DemandHighBits is set and the type is
	/// wider than 16 bits, the match additionally requires that everything above
	/// the low halfword is known to end up zero.
	///
	/// Returns the BSWAP (followed by an SRL for types wider than 16 bits), or an
	/// empty SDValue when the pattern does not match, when operations have not
	/// been legalized yet, or when BSWAP is not legal/custom for the type.
	SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
	                                        bool DemandHighBits) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
	    return SDValue();
	  if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
	    return SDValue();

	  // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
	  bool LookPassAnd0 = false;
	  bool LookPassAnd1 = false;
	  // Canonicalize so that the SHL side ends up in N0 and the SRL side in N1.
	  if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
	    std::swap(N0, N1);
	  if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
	    std::swap(N0, N1);
	  if (N0.getOpcode() == ISD::AND) {
	    if (!N0.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	    if (!N01C || N01C->getZExtValue() != 0xFF00)
	      return SDValue();
	    N0 = N0.getOperand(0);
	    LookPassAnd0 = true;
	  }

	  if (N1.getOpcode() == ISD::AND) {
	    if (!N1.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
	    if (!N11C || N11C->getZExtValue() != 0xFF)
	      return SDValue();
	    N1 = N1.getOperand(0);
	    LookPassAnd1 = true;
	  }

	  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	    std::swap(N0, N1);
	  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
	    return SDValue();
	  if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
	    return SDValue();

	  // Both shifts must be by the constant 8.
	  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
	  if (!N01C || !N11C)
	    return SDValue();
	  if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
	    return SDValue();

	  // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
	  SDValue N00 = N0->getOperand(0);
	  if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
	    if (!N00.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
	    if (!N001C || N001C->getZExtValue() != 0xFF)
	      return SDValue();
	    N00 = N00.getOperand(0);
	    LookPassAnd0 = true;
	  }

	  SDValue N10 = N1->getOperand(0);
	  if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
	    if (!N10.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
	    if (!N101C || N101C->getZExtValue() != 0xFF00)
	      return SDValue();
	    N10 = N10.getOperand(0);
	    LookPassAnd1 = true;
	  }

	  // Both shifts must operate on the same source value.
	  if (N00 != N10)
	    return SDValue();

	  // Make sure everything beyond the low halfword gets set to zero since the SRL
	  // 16 will clear the top bits.
	  unsigned OpSizeInBits = VT.getSizeInBits();
	  if (DemandHighBits && OpSizeInBits > 16) {
	    // If the left-shift isn't masked out then the only way this is a bswap is
	    // if all bits beyond the low 8 are 0. In that case the entire pattern
	    // reduces to a left shift anyway: leave it for other parts of the combiner.
	    if (!LookPassAnd0)
	      return SDValue();

	    // However, if the right shift isn't masked out then it might be because
	    // it's not needed. See if we can spot that too.
	    if (!LookPassAnd1 &&
	        !DAG.MaskedValueIsZero(
	            N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
	      return SDValue();
	  }

	  SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
	  if (OpSizeInBits > 16) {
	    SDLoc DL(N);
	    Res = DAG.getNode(ISD::SRL, DL, VT, Res,
	                      DAG.getConstant(OpSizeInBits - 16, DL,
	                                      getShiftAmountTy(VT)));
	  }
	  return Res;
	}

	/// Return true if the specified node is an element that makes up a 32-bit
	/// packed halfword byteswap.
	/// ((x & 0x000000ff) << 8) |
	/// ((x & 0x0000ff00) >> 8) |
	/// ((x & 0x00ff0000) << 8) |
	/// ((x & 0xff000000) >> 8)
	///
	/// On success the node that provides 'x' is stored in \p Parts at the index of
	/// the byte selected by the mask (0 for 0xFF up to 3 for 0xFF000000). A slot
	/// that is already occupied makes the match fail.
	static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
	  if (!N.getNode()->hasOneUse())
	    return false;

	  unsigned Opc = N.getOpcode();
	  if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
	    return false;

	  SDValue N0 = N.getOperand(0);
	  unsigned Opc0 = N0.getOpcode();
	  if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
	    return false;

	  ConstantSDNode *N1C = nullptr;
	  // SHL or SRL: look upstream for AND mask operand
	  if (Opc == ISD::AND)
	    N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
	  else if (Opc0 == ISD::AND)
	    N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  if (!N1C)
	    return false;

	  unsigned MaskByteOffset;
	  switch (N1C->getZExtValue()) {
	  default:
	    return false;
	  case 0xFF:       MaskByteOffset = 0; break;
	  case 0xFF00:     MaskByteOffset = 1; break;
	  case 0xFF0000:   MaskByteOffset = 2; break;
	  case 0xFF000000: MaskByteOffset = 3; break;
	  }

	  // True if the second operand of Shift is the constant 8.
	  auto ShiftsBy8 = [](SDValue Shift) {
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	    return C && C->getZExtValue() == 8;
	  };

	  // Bytes 0 and 2 are the low bytes of their halfword, bytes 1 and 3 the high
	  // ones.
	  bool IsLowByte = MaskByteOffset == 0 || MaskByteOffset == 2;

	  // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
	  if (Opc == ISD::AND) {
	    // The mask is applied after the shift here:
	    //   (x >> 8) & 0xff        (x >> 8) & 0xff0000
	    //   (x << 8) & 0xff00      (x << 8) & 0xff000000
	    unsigned WantedShift = IsLowByte ? ISD::SRL : ISD::SHL;
	    if (Opc0 != WantedShift)
	      return false;
	    if (!ShiftsBy8(N0))
	      return false;
	  } else if (Opc == ISD::SHL) {
	    // (x & 0xff) << 8
	    // (x & 0xff0000) << 8
	    if (!IsLowByte)
	      return false;
	    if (!ShiftsBy8(N))
	      return false;
	  } else { // Opc == ISD::SRL
	    // (x & 0xff00) >> 8
	    // (x & 0xff000000) >> 8
	    if (IsLowByte)
	      return false;
	    if (!ShiftsBy8(N))
	      return false;
	  }

	  if (Parts[MaskByteOffset])
	    return false;

	  Parts[MaskByteOffset] = N0.getOperand(0).getNode();
	  return true;
	}

	/// Match a 32-bit packed halfword bswap. That is
	/// ((x & 0x000000ff) << 8) |
	/// ((x & 0x0000ff00) >> 8) |
	/// ((x & 0x00ff0000) << 8) |
	/// ((x & 0xff000000) >> 8)
	/// => (rotl (bswap x), 16)
	///
	/// \p N supplies the result type and debug location; \p N0 and \p N1 are the
	/// operands of the outermost OR. Returns an empty SDValue when the pattern
	/// does not match, when operations have not been legalized yet, when the type
	/// is not i32, or when BSWAP is not legal/custom for it.
	SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32)
	    return SDValue();
	  if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
	    return SDValue();

	  // Look for either
	  // (or (or (and), (and)), (or (and), (and)))
	  // (or (or (or (and), (and)), (and)), (and))
	  if (N0.getOpcode() != ISD::OR)
	    return SDValue();
	  SDValue N00 = N0.getOperand(0);
	  SDValue N01 = N0.getOperand(1);
	  // One slot per byte of x; filled in by isBSwapHWordElement.
	  SDNode *Parts[4] = {};

	  if (N1.getOpcode() == ISD::OR &&
	      N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
	    // (or (or (and), (and)), (or (and), (and)))
	    if (!isBSwapHWordElement(N00, Parts))
	      return SDValue();

	    if (!isBSwapHWordElement(N01, Parts))
	      return SDValue();
	    SDValue N10 = N1.getOperand(0);
	    if (!isBSwapHWordElement(N10, Parts))
	      return SDValue();
	    SDValue N11 = N1.getOperand(1);
	    if (!isBSwapHWordElement(N11, Parts))
	      return SDValue();
	  } else {
	    // (or (or (or (and), (and)), (and)), (and))
	    if (!isBSwapHWordElement(N1, Parts))
	      return SDValue();
	    if (!isBSwapHWordElement(N01, Parts))
	      return SDValue();
	    if (N00.getOpcode() != ISD::OR)
	      return SDValue();
	    SDValue N000 = N00.getOperand(0);
	    if (!isBSwapHWordElement(N000, Parts))
	      return SDValue();
	    SDValue N001 = N00.getOperand(1);
	    if (!isBSwapHWordElement(N001, Parts))
	      return SDValue();
	  }

	  // Make sure the parts are all coming from the same node.
	  if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
	    return SDValue();

	  SDLoc DL(N);
	  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
	                              SDValue(Parts[0], 0));

	  // Result of the bswap should be rotated by 16. If it's not legal, then
	  // do (x << 16) | (x >> 16).
	  SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
	  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
	    return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
	  if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
	    return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
	  return DAG.getNode(ISD::OR, DL, VT,
	                     DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
	                     DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
	}

	/// This contains all DAGCombine rules which reduce two values combined by
	/// an Or operation to a single value \see visitANDLike().
	///
	/// \p N0 and \p N1 are the two values being OR'ed; \p N supplies the debug
	/// location. Returns the combined value, or an empty SDValue when no rule
	/// applies.
	SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
	  EVT VT = N1.getValueType();
	  SDLoc DL(N);

	  // fold (or x, undef) -> -1
	  if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
	    return DAG.getAllOnesConstant(DL, VT);

	  if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
	    return V;

	  // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
	  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
	      // Don't increase # computations.
	      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
	    if (const ConstantSDNode *N0O1C =
	        getAsNonOpaqueConstant(N0.getOperand(1))) {
	      if (const ConstantSDNode *N1O1C =
	          getAsNonOpaqueConstant(N1.getOperand(1))) {
	        // We can only do this xform if we know that bits from X that are set in
	        // C2 but not in C1 are already zero. Likewise for Y.
	        const APInt &LHSMask = N0O1C->getAPIntValue();
	        const APInt &RHSMask = N1O1C->getAPIntValue();

	        if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
	            DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
	          SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
	                                  N0.getOperand(0), N1.getOperand(0));
	          return DAG.getNode(ISD::AND, DL, VT, X,
	                             DAG.getConstant(LHSMask | RHSMask, DL, VT));
	        }
	      }
	    }
	  }

	  // (or (and X, M), (and X, N)) -> (and X, (or M, N))
	  if (N0.getOpcode() == ISD::AND &&
	      N1.getOpcode() == ISD::AND &&
	      N0.getOperand(0) == N1.getOperand(0) &&
	      // Don't increase # computations.
	      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
	    SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
	                            N0.getOperand(1), N1.getOperand(1));
	    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
	  }

	  return SDValue();
	}

	/// Combine an ISD::OR node.
	///
	/// The folds below are tried in order and the first one that applies wins.
	/// The return value is either a replacement value for \p N, SDValue(N, 0)
	/// when \p N was simplified in place, or an empty SDValue when nothing
	/// applies.
	SDValue DAGCombiner::visitOR(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N1.getValueType();

	  // x | x --> x
	  if (N0 == N1)
	    return N0;

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (or x, 0) -> x, vector edition
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N0;

	    // fold (or x, -1) -> -1, vector edition
	    if (ISD::isBuildVectorAllOnes(N0.getNode()))
	      // do not return N0, because undef node may exist in N0
	      return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
	    if (ISD::isBuildVectorAllOnes(N1.getNode()))
	      // do not return N1, because undef node may exist in N1
	      return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());

	    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
	    // Do this only if the resulting shuffle is legal.
	    if (isa<ShuffleVectorSDNode>(N0) &&
	        isa<ShuffleVectorSDNode>(N1) &&
	        // Avoid folding a node with illegal type.
	        TLI.isTypeLegal(VT)) {
	      bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
	      bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
	      bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
	      bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
	      // Ensure both shuffles have a zero input.
	      if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
	        assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
	        assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
	        const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
	        const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
	        bool CanFold = true;
	        int NumElts = VT.getVectorNumElements();
	        SmallVector<int, 4> Mask(NumElts);

	        for (int i = 0; i != NumElts; ++i) {
	          int M0 = SV0->getMaskElt(i);
	          int M1 = SV1->getMaskElt(i);

	          // Determine if either index is pointing to a zero vector.
	          bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
	          bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));

	          // If one element is zero and the otherside is undef, keep undef.
	          // This also handles the case that both are undef.
	          if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
	            Mask[i] = -1;
	            continue;
	          }

	          // Make sure only one of the elements is zero.
	          if (M0Zero == M1Zero) {
	            CanFold = false;
	            break;
	          }

	          assert((M0 >= 0 || M1 >= 0) && "Undef index!");

	          // We have a zero and non-zero element. If the non-zero came from
	          // SV0 make the index a LHS index. If it came from SV1, make it
	          // a RHS index. We need to mod by NumElts because we don't care
	          // which operand it came from in the original shuffles.
	          Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
	        }

	        if (CanFold) {
	          SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
	          SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);

	          // If the mask is not legal as built, retry with the operands (and
	          // the mask) commuted.
	          bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
	          if (!LegalMask) {
	            std::swap(NewLHS, NewRHS);
	            ShuffleVectorSDNode::commuteMask(Mask);
	            LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
	          }

	          if (LegalMask)
	            return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask);
	        }
	      }
	    }
	  }

	  // fold (or c1, c2) -> c1|c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C);
	  // canonicalize constant to RHS
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
	  // fold (or x, 0) -> x
	  if (isNullConstant(N1))
	    return N0;
	  // fold (or x, -1) -> -1
	  if (isAllOnesConstant(N1))
	    return N1;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (or x, c) -> c iff (x & ~c) == 0
	  if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
	    return N1;

	  if (SDValue Combined = visitORLike(N0, N1, N))
	    return Combined;

	  // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + srl 16)
	  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
	    return BSwap;
	  if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
	    return BSwap;

	  // reassociate or
	  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
	    return ROR;

	  // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
	  // iff (c1 & c2) != 0.
	  // The predicate receives the constants by pointer; it is dereferenced with
	  // '->' below.
	  auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
	    return LHS->getAPIntValue().intersects(RHS->getAPIntValue());
	  };
	  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
	      matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) {
	    if (SDValue COR = DAG.FoldConstantArithmetic(
	            ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) {
	      SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
	      AddToWorklist(IOR.getNode());
	      return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
	    }
	  }

	  // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
	  if (N0.getOpcode() == N1.getOpcode())
	    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
	      return Tmp;

	  // See if this is some rotate idiom.
	  if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
	    return SDValue(Rot, 0);

	  if (SDValue Load = MatchLoadCombine(N))
	    return Load;

	  // Simplify the operands using demanded-bits information.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  return SDValue();
	}

	/// Match "(X shl/srl V1) & V2" where V2 may not be present.
	///
	/// On success \p Shift receives the SHL/SRL value and the function returns
	/// true. \p Mask is written only when \p Op is an AND with a constant (or
	/// constant build-vector) right-hand side; note that it is written before the
	/// shift check, so it may have been updated even when false is returned.
	bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
	  if (Op.getOpcode() == ISD::AND) {
	    // Only a constant mask may be peeled off the shift.
	    if (!DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1)))
	      return false;
	    Mask = Op.getOperand(1);
	    Op = Op.getOperand(0);
	  }

	  if (Op.getOpcode() != ISD::SRL && Op.getOpcode() != ISD::SHL)
	    return false;

	  Shift = Op;
	  return true;
	}

	// Decide whether two shift amounts are provably complementary, i.e. whether,
	// for every Neg and Pos that both lie in [0, EltSize), we have
	// Neg == (Pos == 0 ? 0 : EltSize - Pos). When that holds, for opposing shifts
	// shift1/shift2 of a value X with OpBits bits,
	//
	// (or (shift1 X, Neg), (shift2 X, Pos))
	//
	// is a rotate in direction shift2 by Pos, or equivalently a rotate in
	// direction shift1 by Neg. Restricting to [0, EltSize) means only shift
	// amounts with defined behavior have to be considered.
	static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
	  // Two proof strategies are used.
	  //
	  // For a power-of-2 EltSize:
	  //
	  // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
	  // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
	  //
	  // Hence, when EltSize is a power of 2 and Neg has the shape
	  // (and Neg', EltSize-1), it suffices to establish the stronger
	  //
	  // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
	  //
	  // for all Neg and Pos. Because Neg & (EltSize - 1) == Neg' & (EltSize - 1),
	  // Neg' can stand in for Neg from here on.
	  //
	  // Otherwise the even stronger
	  //
	  // Neg == EltSize - Pos [B]
	  //
	  // is required for all Neg and Pos. With [B] the (or ...) has undefined
	  // behavior when Pos == 0 (and therefore Neg == EltSize).
	  //
	  // [A] could be applied for every power-of-2 EltSize, but all it would add
	  // are uninteresting matches in which Neg and Pos are never in range
	  // together. For EltSize == 32, [A] would accept a Neg of (sub 64, Pos) in
	  // addition to (sub 32, Pos), yet
	  //
	  // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
	  //
	  // is always undefined for a 32-bit X.
	  //
	  // In what follows, Mask == EltSize - 1 under [A] and all-ones under [B].
	  // LowBitsCompared is nonzero exactly when [A] is in effect.
	  unsigned LowBitsCompared = 0;
	  if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
	    ConstantSDNode *NegMask = isConstOrConstSplat(Neg.getOperand(1));
	    if (NegMask && NegMask->getAPIntValue() == EltSize - 1) {
	      Neg = Neg.getOperand(0);
	      LowBitsCompared = Log2_64(EltSize);
	    }
	  }

	  // Neg has to look like (sub SubLHS, SubRHS) with a constant SubLHS.
	  if (Neg.getOpcode() != ISD::SUB)
	    return false;
	  ConstantSDNode *SubLHS = isConstOrConstSplat(Neg.getOperand(0));
	  if (!SubLHS)
	    return false;
	  SDValue SubRHS = Neg.getOperand(1);

	  // For the right-hand side of [A], a Pos of the form Pos' & (EltSize - 1)
	  // may be replaced by Pos': the truncation does not affect the equality.
	  if (LowBitsCompared && Pos.getOpcode() == ISD::AND) {
	    ConstantSDNode *PosMask = isConstOrConstSplat(Pos.getOperand(1));
	    if (PosMask && PosMask->getAPIntValue() == EltSize - 1)
	      Pos = Pos.getOperand(0);
	  }

	  // What remains to be shown is:
	  //
	  // (SubLHS - SubRHS) & Mask == (EltSize - Pos) & Mask
	  //
	  // "x & Mask" is a truncation and distributes through subtraction, so with
	  // SubRHS == Pos this is
	  //
	  // EltSize & Mask == SubLHS & Mask
	  //
	  // and with Pos == (add SubRHS, PosC) for a constant PosC it is
	  //
	  // (SubLHS - SubRHS) & Mask == (EltSize - (SubRHS + PosC)) & Mask
	  // SubLHS & Mask == (EltSize - PosC) & Mask
	  // EltSize & Mask == (SubLHS + PosC) & Mask
	  //
	  // Total holds the value that must agree with EltSize under Mask.
	  APInt Total;
	  if (Pos == SubRHS) {
	    Total = SubLHS->getAPIntValue();
	  } else {
	    if (Pos.getOpcode() != ISD::ADD || Pos.getOperand(0) != SubRHS)
	      return false;
	    ConstantSDNode *Addend = isConstOrConstSplat(Pos.getOperand(1));
	    if (!Addend)
	      return false;
	    Total = Addend->getAPIntValue() + SubLHS->getAPIntValue();
	  }

	  // Finally compare EltSize & Mask against Total & Mask. Under [A],
	  // EltSize & Mask is 0 because Mask is EltSize - 1.
	  if (!LowBitsCompared)
	    return Total == EltSize;
	  return Total.getLoBits(LowBitsCompared) == 0;
	}

	// Helper for MatchRotate, called once an OR of two opposite shifts of Shifted
	// has been found. When Neg == <operand size> - Pos, the OR is equivalent to
	// both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg); the PosOpcode
	// form is emitted when the target supports it, the NegOpcode form otherwise.
	// InnerPos and InnerNeg are Pos and Neg with any outer conversions removed.
	//
	// Returns the new rotate node, or nullptr if the amounts do not match.
	SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
	                                       SDValue Neg, SDValue InnerPos,
	                                       SDValue InnerNeg, unsigned PosOpcode,
	                                       unsigned NegOpcode, const SDLoc &DL) {
	  // fold (or (shl x, (*ext y)),
	  //          (srl x, (*ext (sub 32, y)))) ->
	  //   (rotl x, y) or (rotr x, (sub 32, y))
	  //
	  // fold (or (shl x, (*ext (sub 32, y))),
	  //          (srl x, (*ext y))) ->
	  //   (rotr x, y) or (rotl x, (sub 32, y))
	  const EVT ShiftedVT = Shifted.getValueType();

	  // Nothing to do unless the stripped amounts satisfy the rotate relation.
	  if (!matchRotateSub(InnerPos, InnerNeg, ShiftedVT.getScalarSizeInBits()))
	    return nullptr;

	  // Default to the "negative" flavour and upgrade to the preferred
	  // "positive" one when it is legal or custom for this type.
	  unsigned RotOpcode = NegOpcode;
	  SDValue RotAmt = Neg;
	  if (TLI.isOperationLegalOrCustom(PosOpcode, ShiftedVT)) {
	    RotOpcode = PosOpcode;
	    RotAmt = Pos;
	  }

	  return DAG.getNode(RotOpcode, DL, ShiftedVT, Shifted, RotAmt).getNode();
	}

	// MatchRotate - Handle an 'or' of two operands.  If this is one of the many
	// idioms for rotate, and if the target supports rotation instructions, generate
	// a rot[lr].
	//
	// LHS and RHS are the two operands of the OR and DL is the location used for
	// the nodes that get created.  Returns the replacement node (a rotate,
	// possibly wrapped in an AND or a TRUNCATE), or nullptr when no rotate idiom
	// was recognised.
	SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
	  // Must be a legal type.  Expanded 'n promoted things won't work with rotates.
	  EVT VT = LHS.getValueType();
	  if (!TLI.isTypeLegal(VT))
	    return nullptr;

	  // The target must have at least one rotate flavor.
	  bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
	  bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
	  if (!HasROTL && !HasROTR)
	    return nullptr;

	  // Check for truncated rotate: match on the wide operands and truncate the
	  // resulting rotate instead.
	  if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
	      LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
	    assert(LHS.getValueType() == RHS.getValueType());
	    if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
	      return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(),
	                         SDValue(Rot, 0)).getNode();
	    }
	  }

	  // Match "(X shl/srl V1) & V2" where V2 may not be present.
	  SDValue LHSShift; // The shift.
	  SDValue LHSMask;  // AND value if any.
	  if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
	    return nullptr; // Not part of a rotate.

	  SDValue RHSShift; // The shift.
	  SDValue RHSMask;  // AND value if any.
	  if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
	    return nullptr; // Not part of a rotate.

	  if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
	    return nullptr; // Not shifting the same value.

	  if (LHSShift.getOpcode() == RHSShift.getOpcode())
	    return nullptr; // Shifts must disagree.

	  // Canonicalize shl to left side in a shl/srl pair.
	  if (RHSShift.getOpcode() == ISD::SHL) {
	    std::swap(LHS, RHS);
	    std::swap(LHSShift, RHSShift);
	    std::swap(LHSMask, RHSMask);
	  }

	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  SDValue LHSShiftArg = LHSShift.getOperand(0);
	  SDValue LHSShiftAmt = LHSShift.getOperand(1);
	  SDValue RHSShiftArg = RHSShift.getOperand(0);
	  SDValue RHSShiftAmt = RHSShift.getOperand(1);

	  // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
	  // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
	  // Both folds require C1 + C2 == element size.
	  auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *ShlAmt,
	                                        ConstantSDNode *SrlAmt) {
	    return (ShlAmt->getAPIntValue() + SrlAmt->getAPIntValue()) ==
	           EltSizeInBits;
	  };
	  if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
	    SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
	                              LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);

	    // If there is an AND of either shifted operand, apply it to the result.
	    if (LHSMask.getNode() || RHSMask.getNode()) {
	      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
	      SDValue Mask = AllOnes;

	      // Each half's mask only constrains the bits that half contributes, so
	      // it is OR'ed with the bits supplied by the opposite shift before
	      // being folded into the combined mask.
	      if (LHSMask.getNode()) {
	        SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
	        Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
	                           DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
	      }
	      if (RHSMask.getNode()) {
	        SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
	        Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
	                           DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
	      }

	      Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
	    }

	    return Rot.getNode();
	  }

	  // If there is a mask here, and we have a variable shift, we can't be sure
	  // that we're masking out the right stuff.
	  if (LHSMask.getNode() || RHSMask.getNode())
	    return nullptr;

	  // If the shift amount is sign/zext/any-extended (or truncated) just peel
	  // it off.  The conversion is only stripped when BOTH amounts carry one.
	  auto IsExtOrTrunc = [](SDValue Amt) {
	    unsigned Opc = Amt.getOpcode();
	    return Opc == ISD::SIGN_EXTEND || Opc == ISD::ZERO_EXTEND ||
	           Opc == ISD::ANY_EXTEND || Opc == ISD::TRUNCATE;
	  };
	  SDValue LExtOp0 = LHSShiftAmt;
	  SDValue RExtOp0 = RHSShiftAmt;
	  if (IsExtOrTrunc(LHSShiftAmt) && IsExtOrTrunc(RHSShiftAmt)) {
	    LExtOp0 = LHSShiftAmt.getOperand(0);
	    RExtOp0 = RHSShiftAmt.getOperand(0);
	  }

	  // Try treating the shl amount as the "positive" amount (rotl) first ...
	  SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
	                                   LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
	  if (TryL)
	    return TryL;

	  // ... then the srl amount (rotr).
	  SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
	                                   RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
	  if (TryR)
	    return TryR;

	  return nullptr;
	}

	namespace {

	/// Describes where one byte of a load-combine pattern comes from: either it
	/// is a constant zero, or it is a specific byte of the value produced by a
	/// load.
	struct ByteProvider {
	  // Load is nullptr for a constant-zero provider; for a memory provider it
	  // is the node that loads the byte.  ByteOffset is the position of the byte
	  // within the value produced by that load.
	  LoadSDNode *Load = nullptr;
	  unsigned ByteOffset = 0;

	  ByteProvider() = default;

	  /// Provider for byte \p ByteOffset of the value loaded by \p Load.
	  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
	    return ByteProvider(Load, ByteOffset);
	  }

	  /// Provider for a byte known to be zero.
	  static ByteProvider getConstantZero() {
	    return ByteProvider(nullptr, 0);
	  }

	  bool isConstantZero() const { return Load == nullptr; }
	  bool isMemory() const { return Load != nullptr; }

	  bool operator==(const ByteProvider &Other) const {
	    return Load == Other.Load && ByteOffset == Other.ByteOffset;
	  }

	private:
	  ByteProvider(LoadSDNode *L, unsigned Offset)
	      : Load(L), ByteOffset(Offset) {}
	};

	} // end anonymous namespace

	/// Recursively traverses the expression calculating the origin of the requested
	/// byte of the given value. Returns None if the provider can't be calculated.
	///
	/// For all the values except the root of the expression verifies that the value
	/// has exactly one use and if it's not true return None. This way if the origin
	/// of the byte is returned it's guaranteed that the values which contribute to
	/// the byte are not used outside of this expression.
	///
	/// Because the parts of the expression are not allowed to have more than one
	/// use this function iterates over trees, not DAGs. So it never visits the same
	/// node more than once.
	///
	/// \param Op    value whose byte is being traced.
	/// \param Index index of the requested byte within \p Op (0 is the least
	///              significant byte, as implied by the SHL handling below).
	/// \param Depth current recursion depth; the search gives up at depth 10.
	/// \param Root  true only for the outermost call, which is exempt from the
	///              one-use requirement.
	static const Optional<ByteProvider>
	calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
	                      bool Root = false) {
	  // Typical i64 by i8 pattern requires recursion up to 8 calls depth
	  if (Depth == 10)
	    return None;

	  if (!Root && !Op.hasOneUse())
	    return None;

	  assert(Op.getValueType().isScalarInteger() && "can't handle other types");
	  unsigned BitWidth = Op.getValueSizeInBits();
	  if (BitWidth % 8 != 0)
	    return None;
	  unsigned ByteWidth = BitWidth / 8;
	  assert(Index < ByteWidth && "invalid index requested");
	  (void) ByteWidth;

	  switch (Op.getOpcode()) {
	  case ISD::OR: {
	    // A byte of an OR is traceable only when one side contributes constant
	    // zero; the byte then comes from the other side.
	    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
	    if (!LHS)
	      return None;
	    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
	    if (!RHS)
	      return None;

	    if (LHS->isConstantZero())
	      return RHS;
	    if (RHS->isConstantZero())
	      return LHS;
	    return None;
	  }
	  case ISD::SHL: {
	    // Only shifts by a constant whole number of bytes are handled.
	    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	    if (!ShiftOp)
	      return None;

	    uint64_t BitShift = ShiftOp->getZExtValue();
	    if (BitShift % 8 != 0)
	      return None;
	    uint64_t ByteShift = BitShift / 8;

	    // Bytes below the shift amount are zero; the rest come from the
	    // correspondingly lower byte of the shifted operand.
	    return Index < ByteShift
	               ? ByteProvider::getConstantZero()
	               : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
	                                       Depth + 1);
	  }
	  case ISD::ANY_EXTEND:
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND: {
	    SDValue NarrowOp = Op->getOperand(0);
	    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
	    if (NarrowBitWidth % 8 != 0)
	      return None;
	    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

	    // Bytes beyond the narrow value are known (zero) only for zero_extend.
	    if (Index >= NarrowByteWidth)
	      return Op.getOpcode() == ISD::ZERO_EXTEND
	                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
	                 : None;
	    return calculateByteProvider(NarrowOp, Index, Depth + 1);
	  }
	  case ISD::BSWAP:
	    // Byte Index of a byte-swapped value is the mirrored byte of the operand.
	    return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
	                                 Depth + 1);
	  case ISD::LOAD: {
	    auto L = cast<LoadSDNode>(Op.getNode());
	    if (L->isVolatile() || L->isIndexed())
	      return None;

	    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
	    if (NarrowBitWidth % 8 != 0)
	      return None;
	    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

	    // Bytes beyond the memory width are known (zero) only for a
	    // zero-extending load.
	    if (Index >= NarrowByteWidth)
	      return L->getExtensionType() == ISD::ZEXTLOAD
	                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
	                 : None;
	    return ByteProvider::getMemory(L, Index);
	  }
	  }

	  return None;
	}

	/// Match a pattern where a wide type scalar value is loaded by several narrow
	/// loads and combined by shifts and ors. Fold it into a single load or a load
	/// and a BSWAP if the targets supports it.
	///
	/// Assuming little endian target:
	/// i8 *a = ...
	/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
	/// =>
	/// i32 val = *((i32)a)
	///
	/// i8 *a = ...
	/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
	/// =>
	/// i32 val = BSWAP(*((i32)a))
	///
	/// \param N the OR node that is the root of the candidate pattern.
	/// \returns the replacement value (the new wide load, optionally wrapped in a
	/// BSWAP), or an empty SDValue when the pattern does not match or the wide
	/// load is not allowed/fast on the target.
	///
	/// TODO: This rule matches complex patterns with OR node roots and doesn't
	/// interact well with the worklist mechanism. When a part of the pattern is
	/// updated (e.g. one of the loads) its direct users are put into the worklist,
	/// but the root node of the pattern which triggers the load combine is not
	/// necessarily a direct user of the changed node. For example, once the address
	/// of t28 load is reassociated load combine won't be triggered:
	/// t25: i32 = add t4, Constant:i32<2>
	/// t26: i64 = sign_extend t25
	/// t27: i64 = add t2, t26
	/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
	/// t29: i32 = zero_extend t28
	/// t32: i32 = shl t29, Constant:i8<8>
	/// t33: i32 = or t23, t32
	/// As a possible fix visitLoad can check if the load can be a part of a load
	/// combine pattern and add corresponding OR roots to the worklist.
	SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
	assert(N->getOpcode() == ISD::OR &&
	"Can only match load combining against OR nodes");

	// Handles simple types only
	EVT VT = N->getValueType(0);
	if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	return SDValue();
	unsigned ByteWidth = VT.getSizeInBits() / 8;

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// Before legalize we can introduce too wide illegal loads which will be later
	// split into legal sized loads. This enables us to combine i64 load by i8
	// patterns to a couple of i32 loads on 32 bit targets.
	if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
	return SDValue();

	// Map the index of a byte within a value of BW bytes to its offset in
	// memory for each endianness.
	std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = [](
	unsigned BW, unsigned i) { return i; };
	std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](
	unsigned BW, unsigned i) { return BW - i - 1; };

	bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
	// Offset, relative to the load's address, of the memory byte described by
	// provider P, taking the target's endianness into account.
	auto MemoryByteOffset = [&] (ByteProvider P) {
	assert(P.isMemory() && "Must be a memory byte provider");
	unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
	assert(LoadBitWidth % 8 == 0 &&
	"can only analyze providers for individual bytes not bit");
	unsigned LoadByteWidth = LoadBitWidth / 8;
	return IsBigEndianTarget
	? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
	: LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
	};

	// Base address and chain shared by all the loads; set by the first byte.
	Optional<BaseIndexOffset> Base;
	SDValue Chain;

	// Distinct loads feeding the pattern, and the provider of the byte with the
	// lowest memory offset seen so far.
	SmallSet<LoadSDNode *, 8> Loads;
	Optional<ByteProvider> FirstByteProvider;
	int64_t FirstOffset = INT64_MAX;

	// Check if all the bytes of the OR we are looking at are loaded from the same
	// base address. Collect bytes offsets from Base address in ByteOffsets.
	SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
	for (unsigned i = 0; i < ByteWidth; i++) {
	// NOTE(review): "/Root=/true" looks like a mangled "/*Root=*/true" argument
	// comment -- confirm against the upstream source.
	auto P = calculateByteProvider(SDValue(N, 0), i, 0, /Root=/true);
	if (!P \|\| !P->isMemory()) // All the bytes must be loaded from memory
	return SDValue();

	LoadSDNode *L = P->Load;
	assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
	"Must be enforced by calculateByteProvider");
	assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");

	// All loads must share the same chain
	SDValue LChain = L->getChain();
	if (!Chain)
	Chain = LChain;
	else if (Chain != LChain)
	return SDValue();

	// Loads must share the same base address
	// NOTE(review): the next two lines are a removed ("-") / added ("+") pair
	// from the enclosing patch: the match() argument changes from the load's
	// base pointer to the load node itself.
	- BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
	+ BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
	int64_t ByteOffsetFromBase = 0;
	if (!Base)
	Base = Ptr;
	else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
	return SDValue();

	// Calculate the offset of the current byte from the base address
	ByteOffsetFromBase += MemoryByteOffset(*P);
	ByteOffsets[i] = ByteOffsetFromBase;

	// Remember the first byte load
	if (ByteOffsetFromBase < FirstOffset) {
	FirstByteProvider = P;
	FirstOffset = ByteOffsetFromBase;
	}

	Loads.insert(L);
	}
	assert(!Loads.empty() && "All the bytes of the value must be loaded from "
	"memory, so there must be at least one load which produces the value");
	assert(Base && "Base address of the accessed memory location must be set");
	assert(FirstOffset != INT64_MAX && "First byte offset must be set");

	// Check if the bytes of the OR we are looking at match with either big or
	// little endian value load
	bool BigEndian = true, LittleEndian = true;
	for (unsigned i = 0; i < ByteWidth; i++) {
	int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
	LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
	BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
	if (!BigEndian && !LittleEndian)
	return SDValue();
	}
	assert((BigEndian != LittleEndian) && "should be either or");
	assert(FirstByteProvider && "must be set");

	// Ensure that the first byte is loaded from zero offset of the first load.
	// So the combined value can be loaded from the first load address.
	if (MemoryByteOffset(*FirstByteProvider) != 0)
	return SDValue();
	LoadSDNode *FirstLoad = FirstByteProvider->Load;

	// The node we are looking at matches with the pattern, check if we can
	// replace it with a single load and bswap if needed.

	// If the load needs byte swap check if the target supports it
	bool NeedsBswap = IsBigEndianTarget != BigEndian;

	// Before legalize we can introduce illegal bswaps which will be later
	// converted to an explicit bswap sequence. This way we end up with a single
	// load and byte shuffling instead of several loads and byte shuffling.
	if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
	return SDValue();

	// Check that a load of the wide type is both allowed and fast on the target
	bool Fast = false;
	bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
	VT, FirstLoad->getAddressSpace(),
	FirstLoad->getAlignment(), &Fast);
	if (!Allowed \|\| !Fast)
	return SDValue();

	// Build the wide load from the address, pointer info and alignment of the
	// load that provides the lowest-addressed byte.
	SDValue NewLoad =
	DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
	FirstLoad->getPointerInfo(), FirstLoad->getAlignment());

	// Transfer chain users from old loads to the new load.
	for (LoadSDNode *L : Loads)
	DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));

	return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
	}

	/// Combine an ISD::XOR node.
	///
	/// Tries, in order, the folds documented inline below.  Returns the
	/// replacement value, SDValue(N, 0) when N was simplified in place by
	/// SimplifyDemandedBits, or an empty SDValue when nothing applied.
	SDValue DAGCombiner::visitXOR(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (xor x, 0) -> x, vector edition
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N0;
	  }

	  // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
	  if (N0.isUndef() && N1.isUndef())
	    return DAG.getConstant(0, SDLoc(N), VT);
	  // fold (xor x, undef) -> undef
	  if (N0.isUndef())
	    return N0;
	  if (N1.isUndef())
	    return N1;
	  // fold (xor c1, c2) -> c1^c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
	  if (N0C && N1C)
	    return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C);
	  // canonicalize constant to RHS
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
	  // fold (xor x, 0) -> x
	  if (isNullConstant(N1))
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // reassociate xor
	  if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
	    return RXOR;

	  // fold !(x cc y) -> (x !cc y)
	  SDValue LHS, RHS, CC;
	  if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
	    bool isInt = LHS.getValueType().isInteger();
	    ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
	                                               isInt);

	    if (!LegalOperations ||
	        TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
	      switch (N0.getOpcode()) {
	      default:
	        llvm_unreachable("Unhandled SetCC Equivalent!");
	      case ISD::SETCC:
	        return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
	      case ISD::SELECT_CC:
	        return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
	                               N0.getOperand(3), NotCC);
	      }
	    }
	  }

	  // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
	  if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND &&
	      N0.getNode()->hasOneUse() &&
	      isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)) {
	    SDValue V = N0.getOperand(0);
	    SDLoc DL(N0);
	    V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V,
	                    DAG.getConstant(1, DL, V.getValueType()));
	    AddToWorklist(V.getNode());
	    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
	  }

	  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
	  // (and likewise (not (and x, y)) -> (or (not x), (not y))).
	  if (isOneConstant(N1) && VT == MVT::i1 &&
	      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
	    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
	    if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
	      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
	      LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
	      RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
	      AddToWorklist(LHS.getNode());
	      AddToWorklist(RHS.getNode());
	      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	    }
	  }
	  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
	  // (and likewise (not (and x, y)) -> (or (not x), (not y))).
	  if (isAllOnesConstant(N1) &&
	      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
	    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
	    if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
	      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
	      LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
	      RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
	      AddToWorklist(LHS.getNode());
	      AddToWorklist(RHS.getNode());
	      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	    }
	  }
	  // fold (xor (and x, y), y) -> (and (not x), y)
	  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
	      N0->getOperand(1) == N1) {
	    SDValue X = N0->getOperand(0);
	    SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
	    AddToWorklist(NotX.getNode());
	    return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
	  }

	  // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
	  unsigned OpSizeInBits = VT.getScalarSizeInBits();
	  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
	      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) &&
	      TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
	    if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
	      if (C->getAPIntValue() == (OpSizeInBits - 1))
	        return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0));
	  }

	  // fold (xor x, x) -> 0
	  if (N0 == N1)
	    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);

	  // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
	  // Here is a concrete example of this equivalence:
	  // i16   x ==  14
	  // i16 shl ==   1 << 14  == 16384 == 0b0100000000000000
	  // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
	  //
	  // =>
	  //
	  // i16     ~1      == 0b1111111111111110
	  // i16 rol(~1, 14) == 0b1011111111111111
	  //
	  // Some additional tips to help conceptualize this transform:
	  // - Try to see the operation as placing a single zero in a value of all ones.
	  // - There exists no value for x which would allow the result to contain zero.
	  // - Values of x larger than the bitwidth are undefined and do not require a
	  //   consistent result.
	  // - Pushing the zero left requires shifting one bits in from the right.
	  // A rotate left of ~1 is a nice way of achieving the desired result.
	  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) &&
	      N0.getOpcode() == ISD::SHL && isAllOnesConstant(N1) &&
	      isOneConstant(N0.getOperand(0))) {
	    SDLoc DL(N);
	    // Build ~1 at the full operand width.  Passing the int literal ~1 goes
	    // through a uint64_t and would leave the bits above bit 63 clear for
	    // types wider than 64 bits.
	    APInt NotOne = ~APInt(OpSizeInBits, 1);
	    return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(NotOne, DL, VT),
	                       N0.getOperand(1));
	  }

	  // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
	  if (N0.getOpcode() == N1.getOpcode())
	    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
	      return Tmp;

	  // Simplify the expression using non-local knowledge.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  return SDValue();
	}

	/// Handle transforms common to the three shifts, when the shift amount is a
	/// constant.
	///
	/// \param N   the shift node (shl, srl or sra).
	/// \param Amt the constant shift amount; not read by this function, which
	///            takes the amount from N's second operand instead.
	/// \returns the new binop of the shifted operands, or an empty SDValue when
	/// the transform does not apply.
	SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
	  SDNode *LHS = N->getOperand(0).getNode();
	  if (!LHS->hasOneUse())
	    return SDValue();

	  // We want to pull some binops through shifts, so that we have (and (shift))
	  // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
	  // thing happens with address calculations, so it's important to canonicalize
	  // it.
	  //
	  // HighBitSet is the value the sign bit of the binop constant must have for
	  // the transform to be applied to an sra (checked further down).
	  bool HighBitSet = false;

	  switch (LHS->getOpcode()) {
	  default:
	    return SDValue();
	  case ISD::OR:
	  case ISD::XOR:
	    HighBitSet = false; // sra: constant's sign bit must be clear.
	    break;
	  case ISD::AND:
	    HighBitSet = true; // sra: constant's sign bit must be set.
	    break;
	  case ISD::ADD:
	    if (N->getOpcode() != ISD::SHL)
	      return SDValue(); // only shl(add) not sr[al](add).
	    HighBitSet = false; // Not consulted: only shl reaches this point.
	    break;
	  }

	  // We require the RHS of the binop to be a constant and not opaque as well.
	  ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1));
	  if (!BinOpCst)
	    return SDValue();

	  // FIXME: disable this unless the input to the binop is a shift by a constant
	  // or is copy/select. Enable this in other cases once we figure out when it
	  // is actually profitable.
	  SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
	  bool isShift = BinOpLHSVal->getOpcode() == ISD::SHL ||
	                 BinOpLHSVal->getOpcode() == ISD::SRA ||
	                 BinOpLHSVal->getOpcode() == ISD::SRL;
	  bool isCopyOrSelect = BinOpLHSVal->getOpcode() == ISD::CopyFromReg ||
	                        BinOpLHSVal->getOpcode() == ISD::SELECT;

	  if ((!isShift || !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1))) &&
	      !isCopyOrSelect)
	    return SDValue();

	  if (isCopyOrSelect && N->hasOneUse())
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // If this is a signed shift right, and the high bit is modified by the
	  // logical operation, do not perform the transformation. The highBitSet
	  // boolean indicates the value of the high bit of the constant which would
	  // cause it to be modified for this operation.
	  if (N->getOpcode() == ISD::SRA) {
	    bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
	    if (BinOpRHSSignSet != HighBitSet)
	      return SDValue();
	  }

	  if (!TLI.isDesirableToCommuteWithShift(LHS))
	    return SDValue();

	  // Fold the constants, shifting the binop RHS by the shift amount.
	  SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)),
	                               N->getValueType(0),
	                               LHS->getOperand(1), N->getOperand(1));
	  assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");

	  // Create the new shift.
	  SDValue NewShift = DAG.getNode(N->getOpcode(),
	                                 SDLoc(LHS->getOperand(0)),
	                                 VT, LHS->getOperand(0), N->getOperand(1));

	  // Create the new binop.
	  return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS);
	}

	/// Push a truncate through its AND operand when the AND's second operand is
	/// a non-opaque constant (or constant vector):
	///   (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
	///
	/// N must be a TRUNCATE whose operand is an AND.  The rewrite is only done
	/// when both N and the AND have a single use; otherwise an empty SDValue is
	/// returned.
	SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
	  assert(N->getOpcode() == ISD::TRUNCATE);
	  assert(N->getOperand(0).getOpcode() == ISD::AND);

	  if (!N->hasOneUse())
	    return SDValue();

	  SDValue AndOp = N->getOperand(0);
	  if (!AndOp.hasOneUse())
	    return SDValue();

	  SDValue MaskOp = AndOp.getOperand(1);
	  if (!isConstantOrConstantVector(MaskOp, /* NoOpaques */ true))
	    return SDValue();

	  SDLoc DL(N);
	  EVT TruncVT = N->getValueType(0);

	  // Truncate both AND operands separately and queue the new nodes so they
	  // get combined in turn.
	  SDValue TruncValue =
	      DAG.getNode(ISD::TRUNCATE, DL, TruncVT, AndOp.getOperand(0));
	  SDValue TruncMask = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, MaskOp);
	  AddToWorklist(TruncValue.getNode());
	  AddToWorklist(TruncMask.getNode());

	  return DAG.getNode(ISD::AND, DL, TruncVT, TruncValue, TruncMask);
	}

	/// Combine an ISD::ROTL / ISD::ROTR node.
	///
	/// Returns the replacement value, or an empty SDValue when none of the folds
	/// below applies.
	SDValue DAGCombiner::visitRotate(SDNode *N) {
	  SDLoc dl(N);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  unsigned Bitsize = VT.getScalarSizeInBits();

	  // fold (rot x, 0) -> x
	  if (isNullConstantOrNullSplatConstant(N1))
	    return N0;

	  // fold (rot x, c) -> (rot x, c % BitSize)
	  if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) {
	    if (Cst->getAPIntValue().uge(Bitsize)) {
	      uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize);
	      return DAG.getNode(N->getOpcode(), dl, VT, N0,
	                         DAG.getConstant(RotAmt, dl, N1.getValueType()));
	    }
	  }

	  // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
	  if (N1.getOpcode() == ISD::TRUNCATE &&
	      N1.getOperand(0).getOpcode() == ISD::AND) {
	    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	      return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
	  }

	  unsigned NextOp = N0.getOpcode();
	  // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
	  // The amounts are added when both rotates go the same way and subtracted
	  // when they go opposite ways.
	  if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
	    SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
	    SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
	    if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
	      EVT ShiftVT = C1->getValueType(0);
	      bool SameSide = (N->getOpcode() == NextOp);
	      unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
	      if (SDValue CombinedShift =
	              DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) {
	        SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
	        // NOTE(review): SREM keeps the sign of a negative difference, so the
	        // normalised amount can be negative in ShiftVT -- confirm that this
	        // is acceptable for every element size this fold can see.
	        SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
	            ISD::SREM, dl, ShiftVT, CombinedShift.getNode(),
	            BitsizeC.getNode());
	        // Only build the rotate when the normalisation actually folded; an
	        // empty SDValue must not be used as a node operand.
	        if (CombinedShiftNorm)
	          return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
	                             CombinedShiftNorm);
	      }
	    }
	  }
	  return SDValue();
	}

	SDValue DAGCombiner::visitSHL(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N0.getValueType();
	unsigned OpSizeInBits = VT.getScalarSizeInBits();

	// fold vector ops
	if (VT.isVector()) {
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
	// If setcc produces all-one true value then:
	// (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
	if (N1CV && N1CV->isConstant()) {
	if (N0.getOpcode() == ISD::AND) {
	SDValue N00 = N0->getOperand(0);
	SDValue N01 = N0->getOperand(1);
	BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);

	if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
	TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent) {
	if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT,
	N01CV, N1CV))
	return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
	}
	}
	}
	}

	ConstantSDNode *N1C = isConstOrConstSplat(N1);

	// fold (shl c1, c2) -> c1<<c2
	ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	if (N0C && N1C && !N1C->isOpaque())
	return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
	// fold (shl 0, x) -> 0
	if (isNullConstantOrNullSplatConstant(N0))
	return N0;
	// fold (shl x, c >= size(x)) -> undef
	// NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
	auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
	return Val->getAPIntValue().uge(OpSizeInBits);
	};
	if (matchUnaryPredicate(N1, MatchShiftTooBig))
	return DAG.getUNDEF(VT);
	// fold (shl x, 0) -> x
	if (N1C && N1C->isNullValue())
	return N0;
	// fold (shl undef, x) -> 0
	if (N0.isUndef())
	return DAG.getConstant(0, SDLoc(N), VT);

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// if (shl x, c) is known to be zero, return 0
	if (DAG.MaskedValueIsZero(SDValue(N, 0),
	APInt::getAllOnesValue(OpSizeInBits)))
	return DAG.getConstant(0, SDLoc(N), VT);
	// fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
	if (N1.getOpcode() == ISD::TRUNCATE &&
	N1.getOperand(0).getOpcode() == ISD::AND) {
	if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
	}

	if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
	return SDValue(N, 0);

	// fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
	if (N0.getOpcode() == ISD::SHL) {
	auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
	ConstantSDNode *RHS) {
	APInt c1 = LHS->getAPIntValue();
	APInt c2 = RHS->getAPIntValue();
	zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	return (c1 + c2).uge(OpSizeInBits);
	};
	if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
	return DAG.getConstant(0, SDLoc(N), VT);

	auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
	ConstantSDNode *RHS) {
	APInt c1 = LHS->getAPIntValue();
	APInt c2 = RHS->getAPIntValue();
	zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	return (c1 + c2).ult(OpSizeInBits);
	};
	if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
	SDLoc DL(N);
	EVT ShiftVT = N1.getValueType();
	SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
	return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
	}
	}

	// fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
	// For this to be valid, the second form must not preserve any of the bits
	// that are shifted out by the inner shift in the first form. This means
	// the outer shift size must be >= the number of bits added by the ext.
	// As a corollary, we don't care what kind of ext it is.
	if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND \|\|
	N0.getOpcode() == ISD::ANY_EXTEND \|\|
	N0.getOpcode() == ISD::SIGN_EXTEND) &&
	N0.getOperand(0).getOpcode() == ISD::SHL) {
	SDValue N0Op0 = N0.getOperand(0);
	if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
	APInt c1 = N0Op0C1->getAPIntValue();
	APInt c2 = N1C->getAPIntValue();
	zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);

	EVT InnerShiftVT = N0Op0.getValueType();
	uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
	if (c2.uge(OpSizeInBits - InnerShiftSize)) {
	SDLoc DL(N0);
	APInt Sum = c1 + c2;
	if (Sum.uge(OpSizeInBits))
	return DAG.getConstant(0, DL, VT);

	return DAG.getNode(
	ISD::SHL, DL, VT,
	DAG.getNode(N0.getOpcode(), DL, VT, N0Op0->getOperand(0)),
	DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
	}
	}
	}

	// fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
	// Only fold this if the inner zext has no other uses to avoid increasing
	// the total number of instructions.
	if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
	N0.getOperand(0).getOpcode() == ISD::SRL) {
	SDValue N0Op0 = N0.getOperand(0);
	if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
	if (N0Op0C1->getAPIntValue().ult(VT.getScalarSizeInBits())) {
	uint64_t c1 = N0Op0C1->getZExtValue();
	uint64_t c2 = N1C->getZExtValue();
	if (c1 == c2) {
	SDValue NewOp0 = N0.getOperand(0);
	EVT CountVT = NewOp0.getOperand(1).getValueType();
	SDLoc DL(N);
	SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(),
	NewOp0,
	DAG.getConstant(c2, DL, CountVT));
	AddToWorklist(NewSHL.getNode());
	return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
	}
	}
	}
	}

	// fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
	// fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2
	if (N1C && (N0.getOpcode() == ISD::SRL \|\| N0.getOpcode() == ISD::SRA) &&
	N0->getFlags().hasExact()) {
	if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
	uint64_t C1 = N0C1->getZExtValue();
	uint64_t C2 = N1C->getZExtValue();
	SDLoc DL(N);
	if (C1 <= C2)
	return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
	DAG.getConstant(C2 - C1, DL, N1.getValueType()));
	return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
	DAG.getConstant(C1 - C2, DL, N1.getValueType()));
	}
	}

	// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
	// (and (srl x, (sub c1, c2), MASK)
	// Only fold this if the inner shift has no other uses -- if it does, folding
	// this will increase the total number of instructions.
	if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
	if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
	uint64_t c1 = N0C1->getZExtValue();
	if (c1 < OpSizeInBits) {
	uint64_t c2 = N1C->getZExtValue();
	APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
	SDValue Shift;
	if (c2 > c1) {
	Mask <<= c2 - c1;
	SDLoc DL(N);
	Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
	DAG.getConstant(c2 - c1, DL, N1.getValueType()));
	} else {
	Mask.lshrInPlace(c1 - c2);
	SDLoc DL(N);
	Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
	DAG.getConstant(c1 - c2, DL, N1.getValueType()));
	}
	SDLoc DL(N0);
	return DAG.getNode(ISD::AND, DL, VT, Shift,
	DAG.getConstant(Mask, DL, VT));
	}
	}
	}

	// fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
	if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
	isConstantOrConstantVector(N1, /* No Opaques */ true)) {
	SDLoc DL(N);
	SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
	SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
	return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
	}

	// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
	// fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
	// Variant of version done on multiply, except mul by a power of 2 is turned
	// into a shift.
	if ((N0.getOpcode() == ISD::ADD \|\| N0.getOpcode() == ISD::OR) &&
	N0.getNode()->hasOneUse() &&
	isConstantOrConstantVector(N1, /* No Opaques */ true) &&
	isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
	SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
	SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
	AddToWorklist(Shl0.getNode());
	AddToWorklist(Shl1.getNode());
	return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
	}

	// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
	if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
	isConstantOrConstantVector(N1, /* No Opaques */ true) &&
	isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
	SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
	if (isConstantOrConstantVector(Shl))
	return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
	}

	if (N1C && !N1C->isOpaque())
	if (SDValue NewSHL = visitShiftByConstant(N, N1C))
	return NewSHL;

	return SDValue();
	}

	/// Combine an ISD::SRA (arithmetic shift right) node.
	///
	/// The folds below are attempted in order; the first one that applies decides
	/// the result.
	///
	/// \param N the SRA node; operand 0 is the shifted value, operand 1 the shift
	///          amount.
	/// \returns the replacement value, SDValue(N, 0) when SimplifyDemandedBits
	///          reported a change, or an empty SDValue when no fold applied.
	SDValue DAGCombiner::visitSRA(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  unsigned OpSizeInBits = VT.getScalarSizeInBits();

	  // Arithmetic shifting an all-sign-bit value is a no-op.
	  // fold (sra 0, x) -> 0
	  // fold (sra -1, x) -> -1
	  if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
	    return N0;

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  ConstantSDNode *N1C = isConstOrConstSplat(N1);

	  // fold (sra c1, c2) -> c1 >>s c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
	  // fold (sra x, c >= size(x)) -> undef
	  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
	  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
	    return Val->getAPIntValue().uge(OpSizeInBits);
	  };
	  if (matchUnaryPredicate(N1, MatchShiftTooBig))
	    return DAG.getUNDEF(VT);
	  // fold (sra x, 0) -> x
	  if (N1C && N1C->isNullValue())
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
	  // sext_inreg.
	  if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
	    // Number of low bits of x that survive the shl/sra pair.
	    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
	    EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
	    if (VT.isVector())
	      ExtVT = EVT::getVectorVT(*DAG.getContext(),
	                               ExtVT, VT.getVectorNumElements());
	    if ((!LegalOperations ||
	         TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
	      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
	                         N0.getOperand(0), DAG.getValueType(ExtVT));
	  }

	  // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
	  if (N0.getOpcode() == ISD::SRA) {
	    SDLoc DL(N);
	    EVT ShiftVT = N1.getValueType();

	    // c1 + c2 >= element width: clamp the combined amount to width - 1.
	    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                          ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).uge(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
	      return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
	                         DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));

	    // c1 + c2 < element width: shift once by the sum.
	    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                       ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).ult(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
	      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
	      return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);
	    }
	  }

	  // fold (sra (shl X, m), (sub result_size, n))
	  // -> (sign_extend (trunc (srl X, (sub (sub result_size, n), m))))
	  // when the residual shift amount (result_size - n) - m is positive.
	  // If truncate is free for the target sext(srl) is likely to result in better
	  // code.
	  if (N0.getOpcode() == ISD::SHL && N1C) {
	    // Get the two shift constants: N01C = m and N1C = result_size - n.
	    const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
	    if (N01C) {
	      LLVMContext &Ctx = *DAG.getContext();
	      // Determine what the truncate's result bitsize and type would be.
	      EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());

	      if (VT.isVector())
	        TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());

	      // Determine the residual right-shift amount.
	      int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();

	      // If the shift is not a no-op (in which case this should be just a sign
	      // extend already), the truncated to type is legal, sign_extend is legal
	      // on that type, and the truncate to that type is both legal and free,
	      // perform the transform.
	      if ((ShiftAmt > 0) &&
	          TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
	          TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
	          TLI.isTruncateFree(VT, TruncVT)) {
	        SDLoc DL(N);
	        SDValue Amt = DAG.getConstant(ShiftAmt, DL,
	            getShiftAmountTy(N0.getOperand(0).getValueType()));
	        SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
	                                    N0.getOperand(0), Amt);
	        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
	                                    Shift);
	        return DAG.getNode(ISD::SIGN_EXTEND, DL,
	                           N->getValueType(0), Trunc);
	      }
	    }
	  }

	  // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
	  if (N1.getOpcode() == ISD::TRUNCATE &&
	      N1.getOperand(0).getOpcode() == ISD::AND) {
	    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	      return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
	  }

	  // fold (sra (trunc (sr[la] x, c1)), c2) -> (trunc (sra x, c1 + c2))
	  // if c1 is equal to the number of bits the trunc removes
	  if (N0.getOpcode() == ISD::TRUNCATE &&
	      (N0.getOperand(0).getOpcode() == ISD::SRL ||
	       N0.getOperand(0).getOpcode() == ISD::SRA) &&
	      N0.getOperand(0).hasOneUse() &&
	      N0.getOperand(0).getOperand(1).hasOneUse() &&
	      N1C) {
	    SDValue N0Op0 = N0.getOperand(0);
	    if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
	      unsigned LargeShiftVal = LargeShift->getZExtValue();
	      EVT LargeVT = N0Op0.getValueType();

	      if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) {
	        SDLoc DL(N);
	        SDValue Amt =
	          DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL,
	                          getShiftAmountTy(N0Op0.getOperand(0).getValueType()));
	        SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT,
	                                  N0Op0.getOperand(0), Amt);
	        return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
	      }
	    }
	  }

	  // Simplify, based on bits shifted out of the LHS.
	  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // If the sign bit is known to be zero, switch this to a SRL.
	  if (DAG.SignBitIsZero(N0))
	    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);

	  if (N1C && !N1C->isOpaque())
	    if (SDValue NewSRA = visitShiftByConstant(N, N1C))
	      return NewSRA;

	  return SDValue();
	}

	/// Combine an ISD::SRL (logical shift right) node.
	///
	/// The folds below are attempted in order; the first one that applies decides
	/// the result.
	///
	/// \param N the SRL node; operand 0 is the shifted value, operand 1 the shift
	///          amount.
	/// \returns the replacement value, SDValue(N, 0) when SimplifyDemandedBits
	///          reported a change, or an empty SDValue when no fold applied.
	SDValue DAGCombiner::visitSRL(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  unsigned OpSizeInBits = VT.getScalarSizeInBits();

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  ConstantSDNode *N1C = isConstOrConstSplat(N1);

	  // fold (srl c1, c2) -> c1 >>u c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
	  // fold (srl 0, x) -> 0
	  if (isNullConstantOrNullSplatConstant(N0))
	    return N0;
	  // fold (srl x, c >= size(x)) -> undef
	  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
	  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
	    return Val->getAPIntValue().uge(OpSizeInBits);
	  };
	  if (matchUnaryPredicate(N1, MatchShiftTooBig))
	    return DAG.getUNDEF(VT);
	  // fold (srl x, 0) -> x
	  if (N1C && N1C->isNullValue())
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // if (srl x, c) is known to be zero, return 0
	  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
	                                   APInt::getAllOnesValue(OpSizeInBits)))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
	  if (N0.getOpcode() == ISD::SRL) {
	    // c1 + c2 >= element width: every bit is shifted out.
	    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                          ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).uge(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
	      return DAG.getConstant(0, SDLoc(N), VT);

	    // c1 + c2 < element width: shift once by the sum.
	    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                       ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).ult(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
	      SDLoc DL(N);
	      EVT ShiftVT = N1.getValueType();
	      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
	      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
	    }
	  }

	  // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2)))
	  if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
	      N0.getOperand(0).getOpcode() == ISD::SRL) {
	    if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) {
	      uint64_t c1 = N001C->getZExtValue();
	      uint64_t c2 = N1C->getZExtValue();
	      EVT InnerShiftVT = N0.getOperand(0).getValueType();
	      EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType();
	      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
	      // This is only valid if the OpSizeInBits + c1 = size of inner shift.
	      if (c1 + OpSizeInBits == InnerShiftSize) {
	        SDLoc DL(N0);
	        if (c1 + c2 >= InnerShiftSize)
	          return DAG.getConstant(0, DL, VT);
	        return DAG.getNode(ISD::TRUNCATE, DL, VT,
	                           DAG.getNode(ISD::SRL, DL, InnerShiftVT,
	                                       N0.getOperand(0).getOperand(0),
	                                       DAG.getConstant(c1 + c2, DL,
	                                                       ShiftCountVT)));
	      }
	    }
	  }

	  // fold (srl (shl x, c), c) -> (and x, cst2)
	  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
	      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
	    SDLoc DL(N);
	    SDValue Mask =
	        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
	    AddToWorklist(Mask.getNode());
	    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
	  }

	  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
	  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
	    // Shifting in all undef bits?
	    EVT SmallVT = N0.getOperand(0).getValueType();
	    unsigned BitSize = SmallVT.getScalarSizeInBits();
	    if (N1C->getZExtValue() >= BitSize)
	      return DAG.getUNDEF(VT);

	    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
	      uint64_t ShiftAmt = N1C->getZExtValue();
	      SDLoc DL0(N0);
	      SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
	                                       N0.getOperand(0),
	                          DAG.getConstant(ShiftAmt, DL0,
	                                          getShiftAmountTy(SmallVT)));
	      AddToWorklist(SmallShift.getNode());
	      APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT,
	                         DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
	                         DAG.getConstant(Mask, DL, VT));
	    }
	  }

	  // fold (srl (sra X, Y), 31) -> (srl X, 31).  This srl only looks at the sign
	  // bit, which is unmodified by sra.
	  if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) {
	    if (N0.getOpcode() == ISD::SRA)
	      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
	  }

	  // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
	  if (N1C && N0.getOpcode() == ISD::CTLZ &&
	      N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
	    KnownBits Known;
	    DAG.computeKnownBits(N0.getOperand(0), Known);

	    // If any of the input bits are KnownOne, then the input couldn't be all
	    // zeros, thus the result of the srl will always be zero.
	    if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);

	    // If all of the bits input the to ctlz node are known to be zero, then
	    // the result of the ctlz is "32" and the result of the shift is one.
	    APInt UnknownBits = ~Known.Zero;
	    if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);

	    // Otherwise, check to see if there is exactly one bit input to the ctlz.
	    if (UnknownBits.isPowerOf2()) {
	      // Okay, we know that only that the single bit specified by UnknownBits
	      // could be set on input to the CTLZ node. If this bit is set, the SRL
	      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
	      // to an SRL/XOR pair, which is likely to simplify more.
	      unsigned ShAmt = UnknownBits.countTrailingZeros();
	      SDValue Op = N0.getOperand(0);

	      if (ShAmt) {
	        SDLoc DL(N0);
	        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
	                  DAG.getConstant(ShAmt, DL,
	                                  getShiftAmountTy(Op.getValueType())));
	        AddToWorklist(Op.getNode());
	      }

	      SDLoc DL(N);
	      return DAG.getNode(ISD::XOR, DL, VT,
	                         Op, DAG.getConstant(1, DL, VT));
	    }
	  }

	  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
	  if (N1.getOpcode() == ISD::TRUNCATE &&
	      N1.getOperand(0).getOpcode() == ISD::AND) {
	    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
	  }

	  // fold operands of srl based on knowledge that the low bits are not
	  // demanded.
	  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  if (N1C && !N1C->isOpaque())
	    if (SDValue NewSRL = visitShiftByConstant(N, N1C))
	      return NewSRL;

	  // Attempt to convert a srl of a load into a narrower zero-extending load.
	  if (SDValue NarrowLoad = ReduceLoadWidth(N))
	    return NarrowLoad;

	  // Here is a common situation. We want to optimize:
	  //
	  //   %a = ...
	  //   %b = and i32 %a, 2
	  //   %c = srl i32 %b, 1
	  //   brcond i32 %c ...
	  //
	  // into
	  //
	  //   %a = ...
	  //   %b = and %a, 2
	  //   %c = setcc eq %b, 0
	  //   brcond %c ...
	  //
	  // However when after the source operand of SRL is optimized into AND, the SRL
	  // itself may not be optimized further. Look for it and add the BRCOND into
	  // the worklist.
	  if (N->hasOneUse()) {
	    // Dereference the use iterator to get the single user node.
	    SDNode *Use = *N->use_begin();
	    if (Use->getOpcode() == ISD::BRCOND)
	      AddToWorklist(Use);
	    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
	      // Also look pass the truncate.
	      Use = *Use->use_begin();
	      if (Use->getOpcode() == ISD::BRCOND)
	        AddToWorklist(Use);
	    }
	  }

	  return SDValue();
	}

	/// Combine an ISD::ABS node; returns an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitABS(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (abs c1) -> c2
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return DAG.getNode(ISD::ABS, SDLoc(N), N->getValueType(0), Src);

	  // fold (abs (abs x)) -> (abs x)
	  // fold (abs x) -> x iff not-negative
	  // Either way the operand itself is the result.
	  if (Src.getOpcode() == ISD::ABS || DAG.SignBitIsZero(Src))
	    return Src;

	  return SDValue();
	}

	/// Combine an ISD::BSWAP node; returns an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitBSWAP(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (bswap c1) -> c2
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return DAG.getNode(ISD::BSWAP, SDLoc(N), N->getValueType(0), Src);

	  // fold (bswap (bswap x)) -> x
	  if (Src.getOpcode() != ISD::BSWAP)
	    return SDValue();
	  return Src.getOperand(0);
	}

	/// Combine an ISD::BITREVERSE node; returns an empty SDValue when no fold
	/// applies.
	SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (bitreverse c1) -> c2
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return DAG.getNode(ISD::BITREVERSE, SDLoc(N), N->getValueType(0), Src);

	  // fold (bitreverse (bitreverse x)) -> x
	  if (Src.getOpcode() != ISD::BITREVERSE)
	    return SDValue();
	  return Src.getOperand(0);
	}

	/// Combine an ISD::CTLZ node; only a constant operand is handled here.
	SDValue DAGCombiner::visitCTLZ(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // Nothing to do unless the operand is a constant (or constant build vector).
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (ctlz c1) -> c2
	  return DAG.getNode(ISD::CTLZ, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTLZ_ZERO_UNDEF node; only a constant operand is handled
	/// here.
	SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // Nothing to do unless the operand is a constant (or constant build vector).
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (ctlz_zero_undef c1) -> c2
	  return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTTZ node; only a constant operand is handled here.
	SDValue DAGCombiner::visitCTTZ(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // Nothing to do unless the operand is a constant (or constant build vector).
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (cttz c1) -> c2
	  return DAG.getNode(ISD::CTTZ, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTTZ_ZERO_UNDEF node; only a constant operand is handled
	/// here.
	SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // Nothing to do unless the operand is a constant (or constant build vector).
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (cttz_zero_undef c1) -> c2
	  return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTPOP node; only a constant operand is handled here.
	SDValue DAGCombiner::visitCTPOP(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // Nothing to do unless the operand is a constant (or constant build vector).
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (ctpop c1) -> c2
	  return DAG.getNode(ISD::CTPOP, SDLoc(N), N->getValueType(0), Src);
	}

	/// \brief Generate Min/Max node
	///
	/// Builds an FMINNUM or FMAXNUM of (LHS, RHS) for a select whose compare
	/// operands are exactly its True/False values (in either order).
	///
	/// \param LHS,RHS    operands of the comparison.
	/// \param True,False values selected when the comparison holds / fails.
	/// \param CC         condition code of the comparison.
	/// \returns the new node, or an empty SDValue when the operands do not line
	///          up, CC is not an ordering comparison handled below, or the chosen
	///          opcode is not legal for VT.
	static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
	                                   SDValue RHS, SDValue True, SDValue False,
	                                   ISD::CondCode CC, const TargetLowering &TLI,
	                                   SelectionDAG &DAG) {
	  const bool SameOrder = LHS == True && RHS == False;
	  const bool Swapped = LHS == False && RHS == True;
	  if (!SameOrder && !Swapped)
	    return SDValue();

	  // Classify the predicate as a "less" or a "greater" style comparison.
	  bool IsLessCompare;
	  switch (CC) {
	  case ISD::SETOLT:
	  case ISD::SETOLE:
	  case ISD::SETLT:
	  case ISD::SETLE:
	  case ISD::SETULT:
	  case ISD::SETULE:
	    IsLessCompare = true;
	    break;
	  case ISD::SETOGT:
	  case ISD::SETOGE:
	  case ISD::SETGT:
	  case ISD::SETGE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    IsLessCompare = false;
	    break;
	  default:
	    return SDValue();
	  }

	  // "less" picking LHS, or "greater" picking RHS, is a minimum; the two
	  // remaining combinations are a maximum.
	  const bool PicksLHS = LHS == True;
	  unsigned Opcode = (IsLessCompare == PicksLHS) ? ISD::FMINNUM : ISD::FMAXNUM;
	  if (!TLI.isOperationLegal(Opcode, VT))
	    return SDValue();
	  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
	}

	/// Try to replace an integer select between two scalar constants with
	/// extend/not/add/xor operations on the condition.
	///
	/// \param N the select node: operand 0 is the condition, operands 1 and 2 the
	///          true and false values.
	/// \returns the replacement value, or an empty SDValue when the result type is
	///          not an integer, either arm is not a ConstantSDNode, or no pattern
	///          below matched.
	SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
	  SDValue Cond = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);
	  EVT VT = N->getValueType(0);
	  EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);

	  if (!VT.isInteger())
	    return SDValue();

	  auto *C1 = dyn_cast<ConstantSDNode>(N1);
	  auto *C2 = dyn_cast<ConstantSDNode>(N2);
	  if (!C1 || !C2)
	    return SDValue();

	  // Only do this before legalization to avoid conflicting with target-specific
	  // transforms in the other direction (create a select from a zext/sext). There
	  // is also a target-independent combine here in DAGCombiner in the other
	  // direction for (select Cond, -1, 0) when the condition is not i1.
	  if (CondVT == MVT::i1 && !LegalOperations) {
	    if (C1->isNullValue() && C2->isOne()) {
	      // select Cond, 0, 1 --> zext (!Cond)
	      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
	      if (VT != MVT::i1)
	        NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
	      return NotCond;
	    }
	    if (C1->isNullValue() && C2->isAllOnesValue()) {
	      // select Cond, 0, -1 --> sext (!Cond)
	      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
	      if (VT != MVT::i1)
	        NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
	      return NotCond;
	    }
	    if (C1->isOne() && C2->isNullValue()) {
	      // select Cond, 1, 0 --> zext (Cond)
	      if (VT != MVT::i1)
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
	      return Cond;
	    }
	    if (C1->isAllOnesValue() && C2->isNullValue()) {
	      // select Cond, -1, 0 --> sext (Cond)
	      if (VT != MVT::i1)
	        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	      return Cond;
	    }

	    // For any constants that differ by 1, we can transform the select into an
	    // extend and add. Use a target hook because some targets may prefer to
	    // transform in the other direction.
	    if (TLI.convertSelectOfConstantsToMath(VT)) {
	      if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
	        // select Cond, C1, C1-1 --> add (zext Cond), C1-1
	        if (VT != MVT::i1)
	          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
	        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
	      }
	      if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) {
	        // select Cond, C1, C1+1 --> add (sext Cond), C1+1
	        if (VT != MVT::i1)
	          Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
	      }
	    }

	    return SDValue();
	  }

	  // fold (select Cond, 0, 1) -> (xor Cond, 1)
	  // We can't do this reliably if integer based booleans have different contents
	  // to floating point based booleans. This is because we can't tell whether we
	  // have an integer-based boolean or a floating-point-based boolean unless we
	  // can find the SETCC that produced it and inspect its operands. This is
	  // fairly easy if C is the SETCC node, but it can potentially be
	  // undiscoverable (or not reasonably discoverable). For example, it could be
	  // in another basic block or it could require searching a complicated
	  // expression.
	  if (CondVT.isInteger() &&
	      TLI.getBooleanContents(false, true) ==
	          TargetLowering::ZeroOrOneBooleanContent &&
	      TLI.getBooleanContents(false, false) ==
	          TargetLowering::ZeroOrOneBooleanContent &&
	      C1->isNullValue() && C2->isOne()) {
	    SDValue NotCond =
	        DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
	    if (VT.bitsEq(CondVT))
	      return NotCond;
	    return DAG.getZExtOrTrunc(NotCond, DL, VT);
	  }

	  return SDValue();
	}

	/// Combine an ISD::SELECT node.
	///
	/// The folds below are attempted in order; the first one that applies decides
	/// the result.
	///
	/// \param N the select node: operand 0 is the condition, operands 1 and 2 the
	///          true and false values.
	/// \returns the replacement value, SDValue(N, 0) when SimplifySelectOps
	///          reported a change, or an empty SDValue when no fold applied.
	SDValue DAGCombiner::visitSELECT(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);
	  EVT VT = N->getValueType(0);
	  EVT VT0 = N0.getValueType();
	  SDLoc DL(N);

	  // fold (select C, X, X) -> X
	  if (N1 == N2)
	    return N1;

	  if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) {
	    // fold (select true, X, Y) -> X
	    // fold (select false, X, Y) -> Y
	    return !N0C->isNullValue() ? N1 : N2;
	  }

	  // fold (select X, X, Y) -> (or X, Y)
	  // fold (select X, 1, Y) -> (or X, Y)
	  if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
	    return DAG.getNode(ISD::OR, DL, VT, N0, N2);

	  if (SDValue V = foldSelectOfConstants(N))
	    return V;

	  // fold (select C, 0, X) -> (and (not C), X)
	  if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
	    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
	    AddToWorklist(NOTNode.getNode());
	    return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
	  }
	  // fold (select C, X, 1) -> (or (not C), X)
	  if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
	    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
	    AddToWorklist(NOTNode.getNode());
	    return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
	  }
	  // fold (select X, Y, X) -> (and X, Y)
	  // fold (select X, Y, 0) -> (and X, Y)
	  if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
	    return DAG.getNode(ISD::AND, DL, VT, N0, N1);

	  // If we can fold this based on the true/false value, do so.
	  if (SimplifySelectOps(N, N1, N2))
	    return SDValue(N, 0);  // Don't revisit N.

	  if (VT0 == MVT::i1) {
	    // The code in this block deals with the following 2 equivalences:
	    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
	    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
	    // The target can specify its preferred form with the
	    // shouldNormalizeToSelectSequence() callback. However we always transform
	    // to the right anyway if we find the inner select exists in the DAG anyway
	    // and we always transform to the left side if we know that we can further
	    // optimize the combination of the conditions.
	    bool normalizeToSequence =
	        TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
	    // select (and Cond0, Cond1), X, Y
	    //   -> select Cond0, (select Cond1, X, Y), Y
	    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
	      SDValue Cond0 = N0->getOperand(0);
	      SDValue Cond1 = N0->getOperand(1);
	      SDValue InnerSelect =
	          DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
	      if (normalizeToSequence || !InnerSelect.use_empty())
	        return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
	                           InnerSelect, N2);
	    }
	    // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
	    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
	      SDValue Cond0 = N0->getOperand(0);
	      SDValue Cond1 = N0->getOperand(1);
	      SDValue InnerSelect =
	          DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
	      if (normalizeToSequence || !InnerSelect.use_empty())
	        return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
	                           InnerSelect);
	    }

	    // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
	    if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
	      SDValue N1_0 = N1->getOperand(0);
	      SDValue N1_1 = N1->getOperand(1);
	      SDValue N1_2 = N1->getOperand(2);
	      if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
	        // Create the actual and node if we can generate good code for it.
	        if (!normalizeToSequence) {
	          SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
	          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2);
	        }
	        // Otherwise see if we can optimize the "and" to a better pattern.
	        if (SDValue Combined = visitANDLike(N0, N1_0, N))
	          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
	                             N2);
	      }
	    }
	    // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
	    if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
	      SDValue N2_0 = N2->getOperand(0);
	      SDValue N2_1 = N2->getOperand(1);
	      SDValue N2_2 = N2->getOperand(2);
	      if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
	        // Create the actual or node if we can generate good code for it.
	        if (!normalizeToSequence) {
	          SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
	          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2);
	        }
	        // Otherwise see if we can optimize to a better pattern.
	        if (SDValue Combined = visitORLike(N0, N2_0, N))
	          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
	                             N2_2);
	      }
	    }
	  }

	  // select (xor Cond, 1), X, Y -> select Cond, Y, X
	  if (VT0 == MVT::i1) {
	    if (N0->getOpcode() == ISD::XOR) {
	      if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) {
	        SDValue Cond0 = N0->getOperand(0);
	        if (C->isOne())
	          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N2, N1);
	      }
	    }
	  }

	  // fold selects based on a setcc into other things, such as min/max/abs
	  if (N0.getOpcode() == ISD::SETCC) {
	    // select (fcmp lt x, y), x, y -> fminnum x, y
	    // select (fcmp gt x, y), x, y -> fmaxnum x, y
	    //
	    // This is OK if we don't care about what happens if either operand is a
	    // NaN.
	    //

	    // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
	    // no signed zeros as well as no nans.
	    const TargetOptions &Options = DAG.getTarget().Options;
	    if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() &&
	        DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
	      ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();

	      if (SDValue FMinMax = combineMinNumMaxNum(
	              DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG))
	        return FMinMax;
	    }

	    if ((!LegalOperations &&
	         TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
	        TLI.isOperationLegal(ISD::SELECT_CC, VT))
	      return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0),
	                         N0.getOperand(1), N1, N2, N0.getOperand(2));
	    return SimplifySelect(DL, N0, N1, N2);
	  }

	  return SDValue();
	}

	static
	std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
	SDLoc DL(N);
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

	// Split the inputs.
	SDValue Lo, Hi, LL, LH, RL, RH;
	std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
	std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);

	Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
	Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));

	return std::make_pair(Lo, Hi);
	}

	// This function assumes all the vselect's arguments are CONCAT_VECTOR
	// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
	//
	// When each half of the condition uses a single constant for all of its
	// non-undef elements, the vselect is rebuilt as a CONCAT_VECTORS that takes
	// each half from the RHS concat (constant is null) or from the LHS concat
	// (otherwise). Returns an empty SDValue if either concat is not binary or a
	// half mixes different constants.
	static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = N->getValueType(0);
	  int NumElems = VT.getVectorNumElements();
	  assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
	         RHS.getOpcode() == ISD::CONCAT_VECTORS &&
	         Cond.getOpcode() == ISD::BUILD_VECTOR);

	  // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
	  // binary ones here.
	  if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
	    return SDValue();

	  // Scan condition elements [Begin, End), skipping UNDEFs. The first non-undef
	  // element is recorded in Uniform; every later non-undef element must be that
	  // same node. Returns false as soon as a different node is seen.
	  auto ScanHalf = [&Cond](int Begin, int End,
	                          ConstantSDNode *&Uniform) -> bool {
	    for (int i = Begin; i < End; ++i) {
	      if (Cond->getOperand(i)->isUndef())
	        continue;

	      if (Uniform == nullptr)
	        Uniform = cast<ConstantSDNode>(Cond.getOperand(i));
	      else if (Cond->getOperand(i).getNode() != Uniform)
	        return false;
	    }
	    return true;
	  };

	  // We're sure we have an even number of elements due to the
	  // concat_vectors we have as arguments to vselect.
	  ConstantSDNode *BottomHalf = nullptr;
	  if (!ScanHalf(0, NumElems / 2, BottomHalf))
	    return SDValue();

	  // Do the same for the second half of the BuildVector
	  ConstantSDNode *TopHalf = nullptr;
	  if (!ScanHalf(NumElems / 2, NumElems, TopHalf))
	    return SDValue();

	  assert(TopHalf && BottomHalf &&
	         "One half of the selector was all UNDEFs and the other was all the "
	         "same value. This should have been addressed before this function.");
	  return DAG.getNode(
	      ISD::CONCAT_VECTORS, DL, VT,
	      BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
	      TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
	}

	/// Combine a masked scatter before type legalization.
	///
	/// If the MSCATTER data type requires splitting and the mask is provided by a
	/// SETCC, then split both nodes and its operands before legalization. This
	/// prevents the type legalizer from unrolling SETCC into scalar comparisons
	/// and enables future optimizations (e.g. min/max pattern matching on X86).
	///
	/// Returns a TokenFactor joining the two half scatters, or an empty SDValue
	/// when nothing was changed.
	SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
	  SDValue Mask = MSC->getMask();
	  SDValue Data = MSC->getValue();
	  SDLoc DL(N);

	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Check if any splitting is required.
	  if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();

	  SDValue MaskLo, MaskHi, Lo, Hi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue Chain = MSC->getChain();

	  EVT MemoryVT = MSC->getMemoryVT();
	  unsigned Alignment = MSC->getOriginalAlignment();

	  // Only the low half of the split memory type is needed: it sizes the
	  // memory operand that both half scatters share below.
	  EVT LoMemVT = DAG.GetSplitDestVTs(MemoryVT).first;

	  SDValue DataLo, DataHi;
	  std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);

	  SDValue BasePtr = MSC->getBasePtr();
	  SDValue IndexLo, IndexHi;
	  std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);

	  MachineMemOperand *MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MSC->getPointerInfo(),
	                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
	                         Alignment, MSC->getAAInfo(), MSC->getRanges());

	  SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo };
	  Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
	                            DL, OpsLo, MMO);

	  SDValue OpsHi[] = { Chain, DataHi, MaskHi, BasePtr, IndexHi };
	  Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
	                            DL, OpsHi, MMO);

	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  // Both halves start from the original chain; join their output chains.
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
	}

	/// Combine a masked store before type legalization.
	///
	/// If the MSTORE data type requires splitting and the mask is provided by a
	/// SETCC, then split both nodes and its operands before legalization. This
	/// prevents the type legalizer from unrolling SETCC into scalar comparisons
	/// and enables future optimizations (e.g. min/max pattern matching on X86).
	///
	/// Returns a TokenFactor joining the two half stores, or an empty SDValue
	/// when nothing was changed.
	SDValue DAGCombiner::visitMSTORE(SDNode *N) {
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  // The result is dereferenced unconditionally, so use cast<> (which asserts
	  // on a type mismatch) rather than dyn_cast<> (which would yield null).
	  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
	  SDValue Mask = MST->getMask();
	  SDValue Data = MST->getValue();
	  EVT VT = Data.getValueType();
	  SDLoc DL(N);

	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Check if any splitting is required.
	  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();

	  SDValue MaskLo, MaskHi, Lo, Hi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue Chain = MST->getChain();
	  SDValue Ptr = MST->getBasePtr();

	  EVT MemoryVT = MST->getMemoryVT();
	  unsigned Alignment = MST->getOriginalAlignment();

	  // if Alignment is equal to the vector size,
	  // take the half of it for the second part
	  unsigned SecondHalfAlignment =
	      (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment;

	  EVT LoMemVT, HiMemVT;
	  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

	  SDValue DataLo, DataHi;
	  std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);

	  // Low half: stored at the original base pointer with the original
	  // alignment.
	  MachineMemOperand *MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MST->getPointerInfo(),
	                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
	                         Alignment, MST->getAAInfo(), MST->getRanges());

	  Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
	                          MST->isTruncatingStore(),
	                          MST->isCompressingStore());

	  // High half: advance the pointer past the low half first.
	  Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
	                                   MST->isCompressingStore());

	  MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MST->getPointerInfo(),
	                         MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
	                         SecondHalfAlignment, MST->getAAInfo(),
	                         MST->getRanges());

	  Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
	                          MST->isTruncatingStore(),
	                          MST->isCompressingStore());

	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
	}

	/// Combine a masked gather before type legalization by splitting it (and the
	/// SETCC that produces its mask) into a low and a high half. Returns the
	/// merged (value, chain) pair, or an empty SDValue when nothing was changed.
	SDValue DAGCombiner::visitMGATHER(SDNode *N) {
	// Only run before types have been legalized.
	if (Level >= AfterLegalizeTypes)
	return SDValue();

	MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
	SDValue Mask = MGT->getMask();
	SDLoc DL(N);

	// If the MGATHER result requires splitting and the mask is provided by a
	// SETCC, then split both nodes and its operands before legalization. This
	// prevents the type legalizer from unrolling SETCC into scalar comparisons
	// and enables future optimizations (e.g. min/max pattern matching on X86).

	if (Mask.getOpcode() != ISD::SETCC)
	return SDValue();

	EVT VT = N->getValueType(0);

	// Check if any splitting is required.
	if (TLI.getTypeAction(*DAG.getContext(), VT) !=
	TargetLowering::TypeSplitVector)
	return SDValue();

	// Split the mask-producing SETCC into two half-width compares.
	SDValue MaskLo, MaskHi, Lo, Hi;
	std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	// Split the value operand returned by getValue() the same way.
	SDValue Src0 = MGT->getValue();
	SDValue Src0Lo, Src0Hi;
	std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);

	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

	SDValue Chain = MGT->getChain();
	EVT MemoryVT = MGT->getMemoryVT();
	unsigned Alignment = MGT->getOriginalAlignment();

	// NOTE(review): HiMemVT is computed but never read in this function.
	EVT LoMemVT, HiMemVT;
	std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

	SDValue BasePtr = MGT->getBasePtr();
	SDValue Index = MGT->getIndex();
	SDValue IndexLo, IndexHi;
	std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);

	// A single memory operand, sized by the low half's memory type, is shared
	// by both half gathers below.
	MachineMemOperand *MMO = DAG.getMachineFunction().
	getMachineMemOperand(MGT->getPointerInfo(),
	MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
	Alignment, MGT->getAAInfo(), MGT->getRanges());

	SDValue OpsLo[] = { Chain, Src0Lo, MaskLo, BasePtr, IndexLo };
	Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo,
	MMO);

	SDValue OpsHi[] = {Chain, Src0Hi, MaskHi, BasePtr, IndexHi};
	Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi,
	MMO);

	AddToWorklist(Lo.getNode());
	AddToWorklist(Hi.getNode());

	// Build a factor node to remember that this load is independent of the
	// other one.
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
	Hi.getValue(1));

	// Legalized the chain result - switch anything that used the old chain to
	// use the new one.
	DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain);

	// Reassemble the full-width result from the two halves.
	SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

	SDValue RetOps[] = { GatherRes, Chain };
	return DAG.getMergeValues(RetOps, DL);
	}

	/// Combine a masked load before type legalization.
	///
	/// If the MLOAD result requires splitting and the mask is provided by a
	/// SETCC, then split both nodes and its operands before legalization. This
	/// prevents the type legalizer from unrolling SETCC into scalar comparisons
	/// and enables future optimizations (e.g. min/max pattern matching on X86).
	///
	/// Returns the merged (value, chain) pair built from the two half loads, or
	/// an empty SDValue when nothing was changed.
	SDValue DAGCombiner::visitMLOAD(SDNode *N) {
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  // The result is dereferenced unconditionally, so use cast<> (which asserts
	  // on a type mismatch) rather than dyn_cast<> (which would yield null).
	  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
	  SDValue Mask = MLD->getMask();
	  SDLoc DL(N);

	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Check if any splitting is required.
	  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();

	  SDValue MaskLo, MaskHi, Lo, Hi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue Src0 = MLD->getSrc0();
	  SDValue Src0Lo, Src0Hi;
	  std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);

	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));

	  SDValue Chain = MLD->getChain();
	  SDValue Ptr = MLD->getBasePtr();
	  EVT MemoryVT = MLD->getMemoryVT();
	  unsigned Alignment = MLD->getOriginalAlignment();

	  // if Alignment is equal to the vector size,
	  // take the half of it for the second part
	  unsigned SecondHalfAlignment =
	      (Alignment == MLD->getValueType(0).getSizeInBits() / 8) ?
	          Alignment / 2 : Alignment;

	  EVT LoMemVT, HiMemVT;
	  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

	  // Low half: loaded from the original base pointer with the original
	  // alignment.
	  MachineMemOperand *MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MLD->getPointerInfo(),
	                         MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
	                         Alignment, MLD->getAAInfo(), MLD->getRanges());

	  Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
	                         ISD::NON_EXTLOAD, MLD->isExpandingLoad());

	  // High half: advance the pointer past the low half first.
	  Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
	                                   MLD->isExpandingLoad());

	  MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MLD->getPointerInfo(),
	                         MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
	                         SecondHalfAlignment, MLD->getAAInfo(),
	                         MLD->getRanges());

	  Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
	                         ISD::NON_EXTLOAD, MLD->isExpandingLoad());

	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  // Build a factor node to remember that this load is independent of the
	  // other one.
	  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
	                      Hi.getValue(1));

	  // Legalized the chain result - switch anything that used the old chain to
	  // use the new one.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);

	  SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

	  SDValue RetOps[] = { LoadRes, Chain };
	  return DAG.getMergeValues(RetOps, DL);
	}

	/// A vector select of 2 constant vectors can be simplified to math/logic to
	/// avoid a variable select instruction and possibly avoid constant loads.
	///
	/// \p N is the VSELECT node. Returns the replacement value, or an empty
	/// SDValue if no fold applies.
	SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
	  SDValue Cond = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);
	  EVT VT = N->getValueType(0);
	  if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
	      !TLI.convertSelectOfConstantsToMath(VT) ||
	      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
	      !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
	    return SDValue();

	  // Check if we can use the condition value to increment/decrement a single
	  // constant value. This simplifies a select to an add and removes a constant
	  // load/materialization from the general case.
	  bool AllAddOne = true;
	  bool AllSubOne = true;
	  unsigned Elts = VT.getVectorNumElements();
	  for (unsigned i = 0; i != Elts; ++i) {
	    SDValue N1Elt = N1.getOperand(i);
	    SDValue N2Elt = N2.getOperand(i);
	    // Lanes with an undef on either side place no constraint on the pattern.
	    if (N1Elt.isUndef() || N2Elt.isUndef())
	      continue;

	    const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
	    const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
	    if (C1 != C2 + 1)
	      AllAddOne = false;
	    if (C1 != C2 - 1)
	      AllSubOne = false;

	    // Neither pattern can match any more; no need to look at more lanes.
	    if (!AllAddOne && !AllSubOne)
	      break;
	  }

	  // Further simplifications for the extra-special cases where the constants
	  // are all 0 or all -1 should be implemented as folds of these patterns.
	  SDLoc DL(N);
	  if (AllAddOne || AllSubOne) {
	    // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
	    // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
	    auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	    SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
	    return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
	  }

	  // The general case for select-of-constants:
	  // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
	  // ...but that only makes sense if a vselect is slower than 2 logic ops, so
	  // leave that to a machine-specific pass.
	  return SDValue();
	}

	/// Combine a VSELECT node (operands: condition, true value, false value).
	/// Returns the replacement value, SDValue(N, 0) if N was updated in place by
	/// SimplifySelectOps, or an empty SDValue if nothing changed.
	SDValue DAGCombiner::visitVSELECT(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);
	  SDLoc DL(N);

	  // fold (vselect C, X, X) -> X
	  if (N1 == N2)
	    return N1;

	  // Canonicalize integer abs.
	  // vselect (setg[te] X, 0), X, -X ->
	  // vselect (setgt X, -1), X, -X ->
	  // vselect (setl[te] X, 0), -X, X ->
	  // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
	  if (N0.getOpcode() == ISD::SETCC) {
	    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
	    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	    bool isAbs = false;
	    bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

	    // In both arms the negated operand must be (sub 0, X); the final
	    // all-zeros check on the SUB's first operand confirms that.
	    if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
	         (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
	        N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
	      isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
	    else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
	             N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
	      isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());

	    if (isAbs) {
	      EVT VT = LHS.getValueType();
	      if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
	        return DAG.getNode(ISD::ABS, DL, VT, LHS);

	      SDValue Shift = DAG.getNode(
	          ISD::SRA, DL, VT, LHS,
	          DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
	      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
	      AddToWorklist(Shift.getNode());
	      AddToWorklist(Add.getNode());
	      return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
	    }
	  }

	  if (SimplifySelectOps(N, N1, N2))
	    return SDValue(N, 0); // Don't revisit N.

	  // Fold (vselect (build_vector all_ones), N1, N2) -> N1
	  if (ISD::isBuildVectorAllOnes(N0.getNode()))
	    return N1;
	  // Fold (vselect (build_vector all_zeros), N1, N2) -> N2
	  if (ISD::isBuildVectorAllZeros(N0.getNode()))
	    return N2;

	  // The ConvertSelectToConcatVector function is assuming both the above
	  // checks for (vselect (build_vector all{ones,zeros}) ...) have been made
	  // and addressed.
	  if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
	      N2.getOpcode() == ISD::CONCAT_VECTORS &&
	      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
	    if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
	      return CV;
	  }

	  if (SDValue V = foldVSelectOfConstants(N))
	    return V;

	  return SDValue();
	}

	/// Combine a SELECT_CC node. Its operands are, in order: the two values
	/// being compared, the value chosen when the comparison holds, the value
	/// chosen otherwise, and the condition code.
	SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
	  SDValue CmpLHS = N->getOperand(0);
	  SDValue CmpRHS = N->getOperand(1);
	  SDValue TrueVal = N->getOperand(2);
	  SDValue FalseVal = N->getOperand(3);
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();

	  // fold select_cc lhs, rhs, x, x, cc -> x
	  if (TrueVal == FalseVal)
	    return TrueVal;

	  // See whether the comparison itself simplifies (possibly to a constant).
	  SDValue SCC = SimplifySetCC(getSetCCResultType(CmpLHS.getValueType()),
	                              CmpLHS, CmpRHS, CC, SDLoc(N), false);
	  if (SCC) {
	    AddToWorklist(SCC.getNode());

	    // Constant condition: non-zero picks the true value, zero the false one.
	    if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode()))
	      return SCCC->isNullValue() ? FalseVal : TrueVal;

	    // When the condition is UNDEF, just return the first operand. This is
	    // coherent with the DAG creation, no setcc node is created in this case.
	    if (SCC->isUndef())
	      return TrueVal;

	    // Fold to a simpler select_cc built from the simplified comparison.
	    if (SCC.getOpcode() == ISD::SETCC)
	      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), TrueVal.getValueType(),
	                         SCC.getOperand(0), SCC.getOperand(1), TrueVal,
	                         FalseVal, SCC.getOperand(2));
	  }

	  // If we can fold this based on the true/false value, do so.
	  if (SimplifySelectOps(N, TrueVal, FalseVal))
	    return SDValue(N, 0); // Don't revisit N.

	  // fold select_cc into other things, such as min/max/abs
	  return SimplifySelectCC(SDLoc(N), CmpLHS, CmpRHS, TrueVal, FalseVal, CC);
	}

	/// Combine a SETCC node by handing its two compared operands and its
	/// condition code to SimplifySetCC.
	SDValue DAGCombiner::visitSETCC(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  return SimplifySetCC(N->getValueType(0), LHS, RHS, CC, SDLoc(N));
	}

	/// Combine a SETCCE node (operands: lhs, rhs, carry, condition code).
	SDValue DAGCombiner::visitSETCCE(SDNode *N) {
	  // Nothing to do unless the carry input is known to be false.
	  if (N->getOperand(2).getOpcode() != ISD::CARRY_FALSE)
	    return SDValue();

	  // With a false carry this is just a regular SETCC of the same operands.
	  SDValue CmpLHS = N->getOperand(0);
	  SDValue CmpRHS = N->getOperand(1);
	  SDValue CondCode = N->getOperand(3);
	  return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), CmpLHS, CmpRHS,
	                     CondCode);
	}

	/// Combine a SETCCCARRY node (operands: lhs, rhs, carry, condition code).
	SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
	  // Nothing to do unless the carry input is the constant zero.
	  if (!isNullConstant(N->getOperand(2)))
	    return SDValue();

	  // With a zero carry this is just a regular SETCC of the same operands.
	  SDValue CmpLHS = N->getOperand(0);
	  SDValue CmpRHS = N->getOperand(1);
	  SDValue CondCode = N->getOperand(3);
	  return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), CmpLHS, CmpRHS,
	                     CondCode);
	}

	/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
	/// a build_vector of constants.
	/// This function is called by the DAGCombiner when visiting sext/zext/aext
	/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
	/// Vector extends are not folded if operations are legal; this is to
	/// avoid introducing illegal build_vector dag nodes.
	///
	/// Returns the folded node, or nullptr if the extend could not be folded.
	static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
	                                         SelectionDAG &DAG, bool LegalTypes,
	                                         bool LegalOperations) {
	  unsigned Opcode = N->getOpcode();
	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
	          Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
	          Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
	         && "Expected EXTEND dag node in input!");

	  // fold (sext c1) -> c1
	  // fold (zext c1) -> c1
	  // fold (aext c1) -> c1
	  if (isa<ConstantSDNode>(N0))
	    return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();

	  // fold (sext (build_vector AllConstants) -> (build_vector AllConstants)
	  // fold (zext (build_vector AllConstants) -> (build_vector AllConstants)
	  // fold (aext (build_vector AllConstants) -> (build_vector AllConstants)
	  EVT SVT = VT.getScalarType();
	  if (!(VT.isVector() &&
	        (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) &&
	        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
	    return nullptr;

	  // We can fold this node into a build_vector.
	  unsigned VTBits = SVT.getSizeInBits();
	  unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
	  SmallVector<SDValue, 8> Elts;
	  unsigned NumElts = VT.getVectorNumElements();
	  SDLoc DL(N);

	  for (unsigned i = 0; i != NumElts; ++i) {
	    SDValue Op = N0->getOperand(i);
	    if (Op->isUndef()) {
	      Elts.push_back(DAG.getUNDEF(SVT));
	      continue;
	    }

	    // Each folded constant takes the location of the operand it came from;
	    // the resulting build_vector below takes the location of N.
	    SDLoc OpDL(Op);
	    // Get the constant value and if needed trunc it to the size of the type.
	    // Nodes like build_vector might have constants wider than the scalar type.
	    APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
	    if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
	      Elts.push_back(DAG.getConstant(C.sext(VTBits), OpDL, SVT));
	    else
	      Elts.push_back(DAG.getConstant(C.zext(VTBits), OpDL, SVT));
	  }

	  return DAG.getBuildVector(VT, DL, Elts).getNode();
	}

	// ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
	// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
	// transformation. Returns true if extension are possible and the above
	// mentioned transformation is profitable.
	//
	// N is the extend node, N0 the value being extended and ExtOpc the extend
	// opcode. SETCC users of N0 that would need to be rewritten to operate on
	// the extended value are appended to ExtendNodes.
	static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
	                                    unsigned ExtOpc,
	                                    SmallVectorImpl<SDNode *> &ExtendNodes,
	                                    const TargetLowering &TLI) {
	  bool HasCopyToRegUses = false;
	  bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
	  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
	                            UE = N0.getNode()->use_end();
	       UI != UE; ++UI) {
	    SDNode *User = *UI;
	    if (User == N)
	      continue;
	    // Ignore uses of other result values of the same node.
	    if (UI.getUse().getResNo() != N0.getResNo())
	      continue;
	    // FIXME: Only extend SETCC N, N and SETCC N, c for now.
	    if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
	      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
	      if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
	        // Sign bits will be lost after a zext.
	        return false;
	      // The SETCC only needs rewriting if it has an operand other than N0,
	      // and that operand must be a constant.
	      bool Add = false;
	      for (unsigned i = 0; i != 2; ++i) {
	        SDValue UseOp = User->getOperand(i);
	        if (UseOp == N0)
	          continue;
	        if (!isa<ConstantSDNode>(UseOp))
	          return false;
	        Add = true;
	      }
	      if (Add)
	        ExtendNodes.push_back(User);
	      continue;
	    }
	    // If truncates aren't free and there are users we can't
	    // extend, it isn't worthwhile.
	    if (!isTruncFree)
	      return false;
	    // Remember if this value is live-out.
	    if (User->getOpcode() == ISD::CopyToReg)
	      HasCopyToRegUses = true;
	  }

	  if (HasCopyToRegUses) {
	    bool BothLiveOut = false;
	    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	         UI != UE; ++UI) {
	      SDUse &Use = UI.getUse();
	      if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
	        BothLiveOut = true;
	        break;
	      }
	    }
	    if (BothLiveOut)
	      // Both unextended and extended values are live out. There had better be
	      // a good reason for the transformation.
	      return !ExtendNodes.empty();
	  }
	  return true;
	}

	void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
	SDValue Trunc, SDValue ExtLoad,
	const SDLoc &DL, ISD::NodeType ExtType) {
	// Extend SetCC uses if necessary.
	for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
	SDNode *SetCC = SetCCs[i];
	SmallVector<SDValue, 4> Ops;

	for (unsigned j = 0; j != 2; ++j) {
	SDValue SOp = SetCC->getOperand(j);
	if (SOp == Trunc)
	Ops.push_back(ExtLoad);
	else
	Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
	}

	Ops.push_back(SetCC->getOperand(2));
	CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
	}
	}

	// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
	/// Fold a sign/zero extend of a vector load into several smaller extending
	/// loads. \p N must be a SIGN_EXTEND or ZERO_EXTEND node. Returns
	/// SDValue(N, 0) after replacing N and the load in place, or an empty
	/// SDValue if the combine does not apply.
	SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  EVT DstVT = N->getValueType(0);
	  EVT SrcVT = N0.getValueType();

	  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
	          N->getOpcode() == ISD::ZERO_EXTEND) &&
	         "Unexpected node type (not an extend)!");

	  // fold (sext (load x)) to multiple smaller sextloads; same for zext.
	  // For example, on a target with legal v4i32, but illegal v8i32, turn:
	  //   (v8i32 (sext (v8i16 (load x))))
	  // into:
	  //   (v8i32 (concat_vectors (v4i32 (sextload x)),
	  //                          (v4i32 (sextload (x + 16)))))
	  // Where uses of the original load, i.e.:
	  //   (v8i16 (load x))
	  // are replaced with:
	  //   (v8i16 (truncate
	  //     (v8i32 (concat_vectors (v4i32 (sextload x)),
	  //                            (v4i32 (sextload (x + 16)))))))
	  //
	  // This combine is only applicable to illegal, but splittable, vectors.
	  // All legal types, and illegal non-vector types, are handled elsewhere.
	  // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
	  //
	  if (N0->getOpcode() != ISD::LOAD)
	    return SDValue();

	  LoadSDNode *LN0 = cast<LoadSDNode>(N0);

	  if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
	      !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
	      !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
	    return SDValue();

	  SmallVector<SDNode *, 4> SetCCs;
	  if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
	    return SDValue();

	  ISD::LoadExtType ExtType =
	      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;

	  // Try to split the vector types to get down to legal types.
	  EVT SplitSrcVT = SrcVT;
	  EVT SplitDstVT = DstVT;
	  while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
	         SplitSrcVT.getVectorNumElements() > 1) {
	    SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
	    SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
	  }

	  // Splitting bottomed out without reaching a legal/custom extending load.
	  if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
	    return SDValue();

	  SDLoc DL(N);
	  const unsigned NumSplits =
	      DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
	  const unsigned Stride = SplitSrcVT.getStoreSize();
	  SmallVector<SDValue, 4> Loads;
	  SmallVector<SDValue, 4> Chains;

	  // Emit one extending load per split, each Stride bytes after the previous
	  // one; every load starts from the original load's chain.
	  SDValue BasePtr = LN0->getBasePtr();
	  for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
	    const unsigned Offset = Idx * Stride;
	    const unsigned Align = MinAlign(LN0->getAlignment(), Offset);

	    SDValue SplitLoad = DAG.getExtLoad(
	        ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
	        LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
	        LN0->getMemOperand()->getFlags(), LN0->getAAInfo());

	    BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
	                          DAG.getConstant(Stride, DL, BasePtr.getValueType()));

	    Loads.push_back(SplitLoad.getValue(0));
	    Chains.push_back(SplitLoad.getValue(1));
	  }

	  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	  SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);

	  // Simplify TF.
	  AddToWorklist(NewChain.getNode());

	  CombineTo(N, NewValue);

	  // Replace uses of the original load (before extension)
	  // with a truncate of the concatenated sextloaded vectors.
	  SDValue Trunc =
	      DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
	  CombineTo(N0.getNode(), Trunc, NewChain);
	  ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
	                  (ISD::NodeType)N->getOpcode());
	  return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	/// If we're narrowing or widening the result of a vector select and the final
	/// size is the same size as a setcc (compare) feeding the select, then try to
	/// apply the cast operation to the select's operands because matching vector
	/// sizes for a select condition and other operands should be more efficient.
	///
	/// \p Cast is the extend/truncate/fp-extend/fp-round node whose operand may
	/// be such a select. Returns the new VSELECT, or an empty SDValue if the
	/// transform does not apply.
	SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
	  unsigned CastOpcode = Cast->getOpcode();
	  assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
	          CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
	          CastOpcode == ISD::FP_ROUND) &&
	         "Unexpected opcode for vector select narrowing/widening");

	  // We only do this transform before legal ops because the pattern may be
	  // obfuscated by target-specific operations after legalization. Do not create
	  // an illegal select op, however, because that may be difficult to lower.
	  EVT VT = Cast->getValueType(0);
	  if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	    return SDValue();

	  // The cast operand must be a single-use vselect whose condition is a setcc.
	  SDValue VSel = Cast->getOperand(0);
	  if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
	      VSel.getOperand(0).getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Does the setcc have the same vector size as the casted select?
	  SDValue SetCC = VSel.getOperand(0);
	  EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
	  if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
	    return SDValue();

	  // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
	  SDValue A = VSel.getOperand(1);
	  SDValue B = VSel.getOperand(2);
	  SDValue CastA, CastB;
	  SDLoc DL(Cast);
	  if (CastOpcode == ISD::FP_ROUND) {
	    // FP_ROUND (fptrunc) has an extra flag operand to pass along.
	    CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
	    CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
	  } else {
	    CastA = DAG.getNode(CastOpcode, DL, VT, A);
	    CastB = DAG.getNode(CastOpcode, DL, VT, B);
	  }
	  return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
	}

	SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);

	if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
	LegalOperations))
	return SDValue(Res, 0);

	// fold (sext (sext x)) -> (sext x)
	// fold (sext (aext x)) -> (sext x)
	if (N0.getOpcode() == ISD::SIGN_EXTEND \|\| N0.getOpcode() == ISD::ANY_EXTEND)
	return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));

	if (N0.getOpcode() == ISD::TRUNCATE) {
	// fold (sext (truncate (load x))) -> (sext (smaller load x))
	// fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
	if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
	SDNode *oye = N0.getOperand(0).getNode();
	if (NarrowLoad.getNode() != N0.getNode()) {
	CombineTo(N0.getNode(), NarrowLoad);
	// CombineTo deleted the truncate, if needed, but not what's under it.
	AddToWorklist(oye);
	}
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	// See if the value being truncated is already sign extended. If so, just
	// eliminate the trunc/sext pair.
	SDValue Op = N0.getOperand(0);
	unsigned OpBits = Op.getScalarValueSizeInBits();
	unsigned MidBits = N0.getScalarValueSizeInBits();
	unsigned DestBits = VT.getScalarSizeInBits();
	unsigned NumSignBits = DAG.ComputeNumSignBits(Op);

	if (OpBits == DestBits) {
	// Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
	// bits, it is already ready.
	if (NumSignBits > DestBits-MidBits)
	return Op;
	} else if (OpBits < DestBits) {
	// Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
	// bits, just sext from i32.
	if (NumSignBits > OpBits-MidBits)
	return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
	} else {
	// Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
	// bits, just truncate to i32.
	if (NumSignBits > OpBits-MidBits)
	return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	// fold (sext (truncate x)) -> (sextinreg x).
	if (!LegalOperations \|\| TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
	N0.getValueType())) {
	if (OpBits < DestBits)
	Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
	else if (OpBits > DestBits)
	Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
	return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
	DAG.getValueType(N0.getValueType()));
	}
	}

	// fold (sext (load x)) -> (sext (truncate (sextload x)))
	// Only generate vector extloads when 1) they're legal, and 2) they are
	// deemed desirable by the target.
	if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	((!LegalOperations && !VT.isVector() &&
	!cast<LoadSDNode>(N0)->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
	if (VT.isVector())
	DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
	if (DoXform) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
	// If the load value is used only by N, replace it via CombineTo N.
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, ExtLoad);
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N, 0);
	}
	}

	// fold (sext (load x)) to multiple smaller sextloads.
	// Only on illegal but splittable vectors.
	if (SDValue ExtLoad = CombineExtLoad(N))
	return ExtLoad;

	// fold (sext (sextload x)) -> (sext (truncate (sextload x)))
	// fold (sext ( extload x)) -> (sext (truncate (sextload x)))
	if ((ISD::isSEXTLoad(N0.getNode()) \|\| ISD::isEXTLoad(N0.getNode())) &&
	ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	EVT MemVT = LN0->getMemoryVT();
	if ((!LegalOperations && !LN0->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
	LN0->getBasePtr(), MemVT,
	LN0->getMemOperand());
	CombineTo(N, ExtLoad);
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (sext (and/or/xor (load x), cst)) ->
	// (and/or/xor (sextload x), (sext cst))
	if ((N0.getOpcode() == ISD::AND \|\| N0.getOpcode() == ISD::OR \|\|
	N0.getOpcode() == ISD::XOR) &&
	isa<LoadSDNode>(N0.getOperand(0)) &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) &&
	(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
	if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
	SetCCs, TLI);
	if (DoXform) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
	LN0->getChain(), LN0->getBasePtr(),
	LN0->getMemoryVT(),
	LN0->getMemOperand());
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.sext(VT.getSizeInBits());
	SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
	ExtLoad, DAG.getConstant(Mask, DL, VT));
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
	SDLoc(N0.getOperand(0)),
	N0.getOperand(0).getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
	bool NoReplaceTruncAnd = !N0.hasOneUse();
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, And);
	// If N0 has multiple uses, change other uses as well.
	if (NoReplaceTruncAnd) {
	SDValue TruncAnd =
	DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
	CombineTo(N0.getNode(), TruncAnd);
	}
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N,0); // Return N so it doesn't get rechecked!
	}
	}
	}

	if (N0.getOpcode() == ISD::SETCC) {
	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	EVT N00VT = N0.getOperand(0).getValueType();

	// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
	// Only do this before legalize for now.
	if (VT.isVector() && !LegalOperations &&
	TLI.getBooleanContents(N00VT) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent) {
	// On some architectures (such as SSE/NEON/etc) the SETCC result type is
	// of the same size as the compared operands. Only optimize sext(setcc())
	// if this is the case.
	EVT SVT = getSetCCResultType(N00VT);

	// We know that the # elements of the results is the same as the
	// # elements of the compare (and the # elements of the compare result
	// for that matter). Check to see that they are the same size. If so,
	// we know that the element size of the sext'd result matches the
	// element size of the compare operands.
	if (VT.getSizeInBits() == SVT.getSizeInBits())
	return DAG.getSetCC(DL, VT, N00, N01, CC);

	// If the desired elements are smaller or larger than the source
	// elements, we can use a matching integer vector type and then
	// truncate/sign extend.
	EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
	if (SVT == MatchingVecType) {
	SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
	return DAG.getSExtOrTrunc(VsetCC, DL, VT);
	}
	}

	// sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
	// Here, T can be 1 or -1, depending on the type of the setcc and
	// getBooleanContents().
	unsigned SetCCWidth = N0.getScalarValueSizeInBits();

	// To determine the "true" side of the select, we need to know the high bit
	// of the value returned by the setcc if it evaluates to true.
	// If the type of the setcc is i1, then the true case of the select is just
	// sext(i1 1), that is, -1.
	// If the type of the setcc is larger (say, i8) then the value of the high
	// bit depends on getBooleanContents(), so ask TLI for a real "true" value
	// of the appropriate width.
	SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT)
	: TLI.getConstTrueVal(DAG, VT, DL);
	SDValue Zero = DAG.getConstant(0, DL, VT);
	if (SDValue SCC =
	SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
	return SCC;

	if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
	EVT SetCCVT = getSetCCResultType(N00VT);
	// Don't do this transform for i1 because there's a select transform
	// that would reverse it.
	// TODO: We should not do this transform at all without a target hook
	// because a sext is likely cheaper than a select?
	if (SetCCVT.getScalarSizeInBits() != 1 &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::SETCC, N00VT))) {
	SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
	return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
	}
	}
	}

	// fold (sext x) -> (zext x) if the sign bit is known zero.
	if ((!LegalOperations \|\| TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
	DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Decide whether \p N can be treated as a truncate of some wider value.
	///
	/// Two shapes are recognised:
	///  - an ISD::TRUNCATE node; its operand is stored in \p Op.
	///  - an i1 (setcc ne X, 0) / (setcc ne 0, X) where every bit of X except
	///    bit 0 is known to be zero; X is stored in \p Op.
	///
	/// \p Known receives the known bits of \p Op so the caller does not need a
	/// second computeKnownBits call. Note that \p Op and \p Known may already
	/// have been written when the function ends up returning false.
	static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
	                         KnownBits &Known) {
	  const unsigned Opcode = N->getOpcode();

	  // Plain truncate: report the operand together with its known bits.
	  if (Opcode == ISD::TRUNCATE) {
	    Op = N->getOperand(0);
	    DAG.computeKnownBits(Op, Known);
	    return true;
	  }

	  // Otherwise only an i1-typed "not equal" comparison qualifies.
	  if (Opcode != ISD::SETCC)
	    return false;
	  if (N->getValueType(0) != MVT::i1)
	    return false;
	  if (cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
	    return false;

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  assert(LHS.getValueType() == RHS.getValueType());

	  // One side has to be the null constant; the other side is the candidate.
	  const bool LHSIsNull = isNullConstant(LHS);
	  if (!LHSIsNull && !isNullConstant(RHS))
	    return false;
	  Op = LHSIsNull ? RHS : LHS;

	  DAG.computeKnownBits(Op, Known);

	  // (X != 0) equals bit 0 of X only if all other bits of X are known zero.
	  return (Known.Zero | 1).isAllOnesValue();
	}

	/// Try to simplify the ZERO_EXTEND node \p N.
	///
	/// The folds below are attempted in order and the first one that applies
	/// wins. The result is one of:
	///  - a value that replaces N,
	///  - SDValue(N, 0) when the uses were already rewritten in place through
	///    CombineTo (returning N keeps it from being rechecked), or
	///  - an empty SDValue when nothing was changed.
	/// Two paths (the vector setcc case and the shl case) return an empty
	/// SDValue early, which also skips every fold that comes after them.
	SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// Constant (or constant build-vector) operands are handled by the shared
	// helper.
	if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
	LegalOperations))
	return SDValue(Res, 0);

	// fold (zext (zext x)) -> (zext x)
	// fold (zext (aext x)) -> (zext x)
	if (N0.getOpcode() == ISD::ZERO_EXTEND \|\| N0.getOpcode() == ISD::ANY_EXTEND)
	return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
	N0.getOperand(0));

	// fold (zext (truncate x)) -> (zext x) or
	// (zext (truncate x)) -> (truncate x)
	// This is valid when the truncated bits of x are already zero.
	// FIXME: We should extend this to work for vectors too.
	SDValue Op;
	KnownBits Known;
	if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) {
	// TruncatedBits covers the bits of Op from N0's width up to
	// min(width of Op, width of VT); it is empty when Op and N0 have the same
	// width.
	APInt TruncatedBits =
	(Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
	APInt(Op.getValueSizeInBits(), 0) :
	APInt::getBitsSet(Op.getValueSizeInBits(),
	N0.getValueSizeInBits(),
	std::min(Op.getValueSizeInBits(),
	VT.getSizeInBits()));
	if (TruncatedBits.isSubsetOf(Known.Zero))
	return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
	}

	// fold (zext (truncate x)) -> (and x, mask)
	if (N0.getOpcode() == ISD::TRUNCATE) {
	// fold (zext (truncate (load x))) -> (zext (smaller load x))
	// fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
	if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
	// Remember the node under the truncate so it can be revisited below.
	SDNode *oye = N0.getOperand(0).getNode();
	if (NarrowLoad.getNode() != N0.getNode()) {
	CombineTo(N0.getNode(), NarrowLoad);
	// CombineTo deleted the truncate, if needed, but not what's under it.
	AddToWorklist(oye);
	}
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	EVT SrcVT = N0.getOperand(0).getValueType();
	EVT MinVT = N0.getValueType();

	// Try to mask before the extension to avoid having to generate a larger mask,
	// possibly over several sub-vectors.
	if (SrcVT.bitsLT(VT)) {
	if (!LegalOperations \|\| (TLI.isOperationLegal(ISD::AND, SrcVT) &&
	TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
	SDValue Op = N0.getOperand(0);
	Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
	AddToWorklist(Op.getNode());
	return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
	}
	}

	// Otherwise bring the source to VT first and mask afterwards.
	if (!LegalOperations \|\| TLI.isOperationLegal(ISD::AND, VT)) {
	SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
	AddToWorklist(Op.getNode());
	SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
	// We may safely transfer the debug info describing the truncate node over
	// to the equivalent and operation.
	DAG.transferDbgValues(N0, And);
	return And;
	}
	}

	// Fold (zext (and (trunc x), cst)) -> (and x, cst),
	// if either of the casts is not free.
	if (N0.getOpcode() == ISD::AND &&
	N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	(!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
	N0.getValueType()) \|\|
	!TLI.isZExtFree(N0.getValueType(), VT))) {
	SDValue X = N0.getOperand(0).getOperand(0);
	X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
	// Widen the mask with zeros to the width of VT.
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.zext(VT.getSizeInBits());
	SDLoc DL(N);
	return DAG.getNode(ISD::AND, DL, VT,
	X, DAG.getConstant(Mask, DL, VT));
	}

	// fold (zext (load x)) -> (zext (truncate (zextload x)))
	// Only generate vector extloads when 1) they're legal, and 2) they are
	// deemed desirable by the target.
	if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	((!LegalOperations && !VT.isVector() &&
	!cast<LoadSDNode>(N0)->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	// With additional users of the load, the helper decides whether the
	// transform is still wanted and collects the setcc users to rewrite.
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
	if (VT.isVector())
	DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
	if (DoXform) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());

	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND);
	// If the load value is used only by N, replace it via CombineTo N.
	// The use count is sampled before CombineTo(N, ...) is called.
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, ExtLoad);
	if (NoReplaceTrunc)
	// Only the chain result of the old load still needs redirecting.
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	// Other users of the loaded value get the truncate of the wide load.
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (zext (load x)) to multiple smaller zextloads.
	// Only on illegal but splittable vectors.
	if (SDValue ExtLoad = CombineExtLoad(N))
	return ExtLoad;

	// fold (zext (and/or/xor (load x), cst)) ->
	// (and/or/xor (zextload x), (zext cst))
	// Unless (and (load x) cst) will match as a zextload already and has
	// additional users.
	if ((N0.getOpcode() == ISD::AND \|\| N0.getOpcode() == ISD::OR \|\|
	N0.getOpcode() == ISD::XOR) &&
	isa<LoadSDNode>(N0.getOperand(0)) &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) &&
	(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
	// A sign-extending load cannot be turned into a zextload here.
	if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	if (!N0.hasOneUse()) {
	if (N0.getOpcode() == ISD::AND) {
	auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
	EVT LoadResultTy = AndC->getValueType(0);
	EVT ExtVT;
	// Leave the AND alone if isAndLoadExtLoad accepts it.
	if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT))
	DoXform = false;
	}
	if (DoXform)
	DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0),
	ISD::ZERO_EXTEND, SetCCs, TLI);
	}
	if (DoXform) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
	LN0->getChain(), LN0->getBasePtr(),
	LN0->getMemoryVT(),
	LN0->getMemOperand());
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.zext(VT.getSizeInBits());
	SDLoc DL(N);
	// Despite its name, And holds whichever of and/or/xor N0 is.
	SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
	ExtLoad, DAG.getConstant(Mask, DL, VT));
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
	SDLoc(N0.getOperand(0)),
	N0.getOperand(0).getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND);
	// Both use counts are sampled before any replacement takes place.
	// NoReplaceTruncAnd is true when N0 has users other than N.
	bool NoReplaceTruncAnd = !N0.hasOneUse();
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, And);
	// If N0 has multiple uses, change other uses as well.
	if (NoReplaceTruncAnd) {
	SDValue TruncAnd =
	DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
	CombineTo(N0.getNode(), TruncAnd);
	}
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N,0); // Return N so it doesn't get rechecked!
	}
	}
	}

	// fold (zext (zextload x)) -> (zext (truncate (zextload x)))
	// fold (zext ( extload x)) -> (zext (truncate (zextload x)))
	if ((ISD::isZEXTLoad(N0.getNode()) \|\| ISD::isEXTLoad(N0.getNode())) &&
	ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	EVT MemVT = LN0->getMemoryVT();
	if ((!LegalOperations && !LN0->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), MemVT,
	LN0->getMemOperand());
	CombineTo(N, ExtLoad);
	// Replace the old load's value with a truncate of the wide load and its
	// chain with the new load's chain.
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
	ExtLoad),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	if (N0.getOpcode() == ISD::SETCC) {
	// Only do this before legalize for now.
	if (!LegalOperations && VT.isVector() &&
	N0.getValueType().getVectorElementType() == MVT::i1) {
	EVT N00VT = N0.getOperand(0).getValueType();
	// The setcc already has the type getSetCCResultType reports for its
	// operands: give up on N entirely (the later folds are not tried).
	if (getSetCCResultType(N00VT) == N0.getValueType())
	return SDValue();

	// We know that the # elements of the results is the same as the #
	// elements of the compare (and the # elements of the compare result for
	// that matter). Check to see that they are the same size. If so, we know
	// that the element size of the zext'd result matches the element size of
	// the compare operands.
	SDLoc DL(N);
	SDValue VecOnes = DAG.getConstant(1, DL, VT);
	if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
	// zext(setcc) -> (and (vsetcc), (1, 1, ...)) for vectors.
	SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
	N0.getOperand(1), N0.getOperand(2));
	return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
	}

	// If the desired elements are smaller or larger than the source
	// elements we can use a matching integer vector type and then
	// truncate/sign extend. The final AND with 1 turns the result into the
	// zero-extended boolean.
	EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
	SDValue VsetCC =
	DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
	N0.getOperand(1), N0.getOperand(2));
	return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
	VecOnes);
	}

	// zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
	SDLoc DL(N);
	if (SDValue SCC = SimplifySelectCC(
	DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
	DAG.getConstant(0, DL, VT),
	cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
	return SCC;
	}

	// (zext (shl/srl (zext x), cst)) -> (shl/srl (zext x), cst)
	if ((N0.getOpcode() == ISD::SHL \|\| N0.getOpcode() == ISD::SRL) &&
	isa<ConstantSDNode>(N0.getOperand(1)) &&
	N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
	N0.hasOneUse()) {
	SDValue ShAmt = N0.getOperand(1);
	unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue();
	if (N0.getOpcode() == ISD::SHL) {
	SDValue InnerZExt = N0.getOperand(0);
	// If the original shl may be shifting out bits, do not perform this
	// transformation.
	// KnownZeroBits is the number of high bits the inner zext added.
	unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
	InnerZExt.getOperand(0).getValueSizeInBits();
	// Note: this abandons the whole visit, not just this fold.
	if (ShAmtVal > KnownZeroBits)
	return SDValue();
	}

	SDLoc DL(N);

	// Ensure that the shift amount is wide enough for the shifted value.
	// NOTE(review): presumably guards against a shift-amount type too narrow
	// to count up to a >= 256-bit width - confirm.
	if (VT.getSizeInBits() >= 256)
	ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);

	return DAG.getNode(N0.getOpcode(), DL, VT,
	DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
	ShAmt);
	}

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Try to simplify the ANY_EXTEND node \p N.
	///
	/// The folds below are attempted in order and the first one that applies
	/// wins. The result is one of:
	///  - a value that replaces N,
	///  - SDValue(N, 0) when the uses were already rewritten in place through
	///    CombineTo (returning N keeps it from being rechecked), or
	///  - an empty SDValue when nothing was changed.
	SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// Constant (or constant build-vector) operands are handled by the shared
	// helper.
	if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
	LegalOperations))
	return SDValue(Res, 0);

	// fold (aext (aext x)) -> (aext x)
	// fold (aext (zext x)) -> (zext x)
	// fold (aext (sext x)) -> (sext x)
	// The inner extension kind is kept and applied directly to VT.
	if (N0.getOpcode() == ISD::ANY_EXTEND \|\|
	N0.getOpcode() == ISD::ZERO_EXTEND \|\|
	N0.getOpcode() == ISD::SIGN_EXTEND)
	return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));

	// fold (aext (truncate (load x))) -> (aext (smaller load x))
	// fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
	if (N0.getOpcode() == ISD::TRUNCATE) {
	if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
	// Remember the node under the truncate so it can be revisited below.
	SDNode *oye = N0.getOperand(0).getNode();
	if (NarrowLoad.getNode() != N0.getNode()) {
	CombineTo(N0.getNode(), NarrowLoad);
	// CombineTo deleted the truncate, if needed, but not what's under it.
	AddToWorklist(oye);
	}
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (aext (truncate x))
	// The truncated value is any-extended or truncated straight to VT.
	if (N0.getOpcode() == ISD::TRUNCATE)
	return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);

	// Fold (aext (and (trunc x), cst)) -> (and x, cst)
	// if the trunc is not free.
	if (N0.getOpcode() == ISD::AND &&
	N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
	N0.getValueType())) {
	SDLoc DL(N);
	SDValue X = N0.getOperand(0).getOperand(0);
	X = DAG.getAnyExtOrTrunc(X, DL, VT);
	// The mask is widened with zeros, so the new AND clears the high bits.
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.zext(VT.getSizeInBits());
	return DAG.getNode(ISD::AND, DL, VT,
	X, DAG.getConstant(Mask, DL, VT));
	}

	// fold (aext (load x)) -> (aext (truncate (extload x)))
	// None of the supported targets knows how to perform load and any_ext
	// on vectors in one instruction. We only perform this transformation on
	// scalars.
	if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
	ISD::isUNINDEXEDLoad(N0.getNode()) &&
	TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	// With additional users of the load, the helper decides whether the
	// transform is still wanted and collects the setcc users to rewrite.
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
	if (DoXform) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
	ISD::ANY_EXTEND);
	// If the load value is used only by N, replace it via CombineTo N.
	// The use count is sampled before CombineTo(N, ...) is called.
	bool NoReplaceTrunc = N0.hasOneUse();
	CombineTo(N, ExtLoad);
	if (NoReplaceTrunc)
	// Only the chain result of the old load still needs redirecting.
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	// Other users of the loaded value get the truncate of the wide load.
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (aext (zextload x)) -> (aext (truncate (zextload x)))
	// fold (aext (sextload x)) -> (aext (truncate (sextload x)))
	// fold (aext ( extload x)) -> (aext (truncate (extload x)))
	if (N0.getOpcode() == ISD::LOAD &&
	!ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	// The existing extension kind of the load is preserved.
	ISD::LoadExtType ExtType = LN0->getExtensionType();
	EVT MemVT = LN0->getMemoryVT();
	if (!LegalOperations \|\| TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
	SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
	VT, LN0->getChain(), LN0->getBasePtr(),
	MemVT, LN0->getMemOperand());
	CombineTo(N, ExtLoad);
	// Replace the old load's value with a truncate of the wide load and its
	// chain with the new load's chain.
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	if (N0.getOpcode() == ISD::SETCC) {
	// For vectors:
	// aext(setcc) -> vsetcc
	// aext(setcc) -> truncate(vsetcc)
	// aext(setcc) -> aext(vsetcc)
	// Only do this before legalize for now.
	if (VT.isVector() && !LegalOperations) {
	EVT N00VT = N0.getOperand(0).getValueType();
	// The setcc already has the type getSetCCResultType reports for its
	// operands: leave N unchanged.
	if (getSetCCResultType(N00VT) == N0.getValueType())
	return SDValue();

	// We know that the # elements of the results is the same as the
	// # elements of the compare (and the # elements of the compare result
	// for that matter). Check to see that they are the same size. If so,
	// we know that the element size of the extended result matches the
	// element size of the compare operands.
	if (VT.getSizeInBits() == N00VT.getSizeInBits())
	return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
	N0.getOperand(1),
	cast<CondCodeSDNode>(N0.getOperand(2))->get());
	// If the desired elements are smaller or larger than the source
	// elements we can use a matching integer vector type and then
	// truncate/any extend
	else {
	EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
	SDValue VsetCC =
	DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
	N0.getOperand(1),
	cast<CondCodeSDNode>(N0.getOperand(2))->get());
	return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
	}
	}

	// aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
	SDLoc DL(N);
	if (SDValue SCC = SimplifySelectCC(
	DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
	DAG.getConstant(0, DL, VT),
	cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
	return SCC;
	}

	return SDValue();
	}

	/// Simplify an assert-extension node \p N (its opcode is reused for every
	/// node built here, so the same code serves each assert?ext flavour).
	///
	/// Handles two patterns:
	///  - a directly nested assert of the same kind and the same asserted type,
	///    where the outer node is redundant and the inner one is returned;
	///  - an assert / single-use truncate / assert sandwich, which becomes one
	///    assert on the wide value (using the narrower of the two asserted
	///    types) followed by a truncate.
	///
	/// Returns an empty SDValue when neither pattern matches.
	SDValue DAGCombiner::visitAssertExt(SDNode *N) {
	  const unsigned AssertOpc = N->getOpcode();
	  SDValue Src = N->getOperand(0);
	  SDValue VTOperand = N->getOperand(1);
	  EVT OuterVT = cast<VTSDNode>(VTOperand)->getVT();

	  // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
	  if (Src.getOpcode() == AssertOpc) {
	    if (OuterVT == cast<VTSDNode>(Src.getOperand(1))->getVT())
	      return Src;
	  }

	  // Everything below needs a single-use truncate whose input is an assert of
	  // the same kind as N.
	  const bool IsSandwich = Src.getOpcode() == ISD::TRUNCATE &&
	                          Src.hasOneUse() &&
	                          Src.getOperand(0).getOpcode() == AssertOpc;
	  if (!IsSandwich)
	    return SDValue();

	  // Merge the two asserts into one stronger assert on the wide source and
	  // truncate afterwards, which removes the later assert:
	  // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
	  // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
	  SDValue Inner = Src.getOperand(0);
	  EVT InnerVT = cast<VTSDNode>(Inner.getOperand(1))->getVT();
	  assert(InnerVT.bitsLE(Src.getValueType()) &&
	         "Asserting zero/sign-extended bits to a type larger than the "
	         "truncated destination does not provide information");

	  SDLoc DL(N);

	  // Pick the narrower asserted type; on a tie the inner one is kept.
	  EVT NarrowestVT = InnerVT;
	  if (OuterVT.bitsLT(InnerVT))
	    NarrowestVT = OuterVT;

	  SDValue NarrowestVTVal = DAG.getValueType(NarrowestVT);
	  SDValue Merged = DAG.getNode(AssertOpc, DL, Inner.getValueType(),
	                               Inner.getOperand(0), NarrowestVTVal);
	  return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Merged);
	}

	/// If the result of a wider load is shifted right by N bits and then
	/// truncated to a narrower type, and N is a multiple of the number of bits of
	/// the narrower type, transform it to a narrower load from address + N / num of
	/// bits of new type. Also narrow the load if the result is masked with an AND
	/// to effectively produce a smaller type. If the result is to be extended, also
	/// fold the extension to form an extending load.
	///
	/// \p N may be a SIGN_EXTEND_INREG, SRL or AND node (handled specially below)
	/// or any other node whose first operand is the value to narrow.
	///
	/// Returns the new, narrower load (wrapped in a SHL, or replaced by the
	/// constant 0, when a left shift was absorbed), or an empty SDValue when the
	/// transform does not apply. On success the users of the old load's chain
	/// result are redirected to the new load's chain.
	SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
	unsigned Opc = N->getOpcode();

	ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	// ExtVT is the type of the narrow memory access; it defaults to VT.
	EVT ExtVT = VT;

	// This transformation isn't valid for vector loads.
	if (VT.isVector())
	return SDValue();

	// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
	// extended to VT.
	if (Opc == ISD::SIGN_EXTEND_INREG) {
	ExtType = ISD::SEXTLOAD;
	ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
	} else if (Opc == ISD::SRL) {
	// Another special-case: SRL is basically zero-extending a narrower value,
	// or it maybe shifting a higher subword, half or byte into the lowest
	// bits.
	ExtType = ISD::ZEXTLOAD;
	// Point N0 at the SRL itself so the generic SRL handling below sees it.
	N0 = SDValue(N, 0);

	auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
	auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	if (!N01 \|\| !LN0)
	return SDValue();

	// The bits that survive the shift determine the narrow type: measured from
	// the memory width when the load is not sign-extending and the shift stays
	// inside it, otherwise from the width of VT.
	// NOTE(review): VT.getSizeInBits() - ShiftAmt is unsigned arithmetic; a
	// shift amount >= the width of VT is presumably never seen here - confirm.
	uint64_t ShiftAmt = N01->getZExtValue();
	uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
	if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
	ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
	else
	ExtVT = EVT::getIntegerVT(*DAG.getContext(),
	VT.getSizeInBits() - ShiftAmt);
	} else if (Opc == ISD::AND) {
	// An AND with a constant mask is the same as a truncate + zero-extend.
	auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!AndC \|\| !AndC->getAPIntValue().isMask())
	return SDValue();

	// The number of trailing ones in the mask is the narrow width.
	unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
	ExtType = ISD::ZEXTLOAD;
	ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
	}

	// ShAmt is the right-shift amount in bits (0 when there is no SRL).
	unsigned ShAmt = 0;
	if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
	if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	ShAmt = N01->getZExtValue();
	unsigned EVTBits = ExtVT.getSizeInBits();
	// Is the shift amount a multiple of size of VT?
	// NOTE(review): the mask test below equals a multiple-of check only when
	// EVTBits is a power of two; other widths are presumably rejected by
	// isLegalNarrowLoad further down - confirm.
	if ((ShAmt & (EVTBits-1)) == 0) {
	N0 = N0.getOperand(0);
	// Is the load width a multiple of size of VT?
	if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
	return SDValue();
	}

	// At this point, we must have a load or else we can't do the transform.
	// (If the shift amount was not a multiple, N0 is still the SRL and this
	// check bails out.)
	if (!isa<LoadSDNode>(N0)) return SDValue();

	// Because a SRL must be assumed to need to zero-extend the high bits
	// (as opposed to anyext the high bits), we can't combine the zextload
	// lowering of SRL and an sextload.
	if (cast<LoadSDNode>(N0)->getExtensionType() == ISD::SEXTLOAD)
	return SDValue();

	// If the shift amount is larger than the input type then we're not
	// accessing any of the loaded bytes. If the load was a zextload/extload
	// then the result of the shift+trunc is zero/undef (handled elsewhere).
	if (ShAmt >= cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits())
	return SDValue();
	}
	}

	// If the load is shifted left (and the result isn't shifted back right),
	// we can fold the truncate through the shift.
	unsigned ShLeftAmt = 0;
	if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
	ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
	if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	ShLeftAmt = N01->getZExtValue();
	N0 = N0.getOperand(0);
	}
	}

	// If we haven't found a load, we can't narrow it.
	if (!isa<LoadSDNode>(N0))
	return SDValue();

	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt))
	return SDValue();

	// For big endian targets, we need to adjust the offset to the pointer to
	// load the correct bytes.
	if (DAG.getDataLayout().isBigEndian()) {
	unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
	unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
	ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
	}

	// N0 is the load here, so operand 1 is its pointer operand.
	EVT PtrType = N0.getOperand(1).getValueType();
	// Convert the bit offset into a byte offset from the original address.
	uint64_t PtrOff = ShAmt / 8;
	unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
	SDLoc DL(LN0);
	// The original load itself didn't wrap, so an offset within it doesn't.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);
	SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
	PtrType, LN0->getBasePtr(),
	DAG.getConstant(PtrOff, DL, PtrType),
	Flags);
	AddToWorklist(NewPtr.getNode());

	// Build a plain load of VT, or an extending load that reads ExtVT from
	// memory and produces VT.
	SDValue Load;
	if (ExtType == ISD::NON_EXTLOAD)
	Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
	LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
	LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
	else
	Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr,
	LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
	NewAlign, LN0->getMemOperand()->getFlags(),
	LN0->getAAInfo());

	// Replace the old load's chain with the new load's chain.
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));

	// Shift the result left, if we've swallowed a left shift.
	SDValue Result = Load;
	if (ShLeftAmt != 0) {
	EVT ShImmTy = getShiftAmountTy(Result.getValueType());
	// Fall back to VT if the shift amount does not fit the shift-amount type.
	if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
	ShImmTy = VT;
	// If the shift amount is as large as the result size (but, presumably,
	// no larger than the source) then the useful bits of the result are
	// zero; we can't simply return the shortened shift, because the result
	// of that operation is undefined.
	// This DL shadows the outer one declared above.
	SDLoc DL(N0);
	if (ShLeftAmt >= VT.getSizeInBits())
	Result = DAG.getConstant(0, DL, VT);
	else
	Result = DAG.getNode(ISD::SHL, DL, VT,
	Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
	}

	// Return the new loaded value.
	return Result;
	}

	/// Try to simplify the SIGN_EXTEND_INREG node \p N.
	///
	/// Operand 0 is the value and operand 1 is a VTSDNode naming the type whose
	/// sign bit is extended through the rest of the value. The folds below are
	/// attempted in order and the first one that applies wins. The result is
	/// one of:
	///  - a value that replaces N,
	///  - SDValue(N, 0) when N was updated or its uses were rewritten in place
	///    (returning N keeps it from being rechecked), or
	///  - an empty SDValue when nothing was changed.
	SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  // Named ExtVT (not EVT) so the local does not hide the EVT type name.
	  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
	  unsigned VTBits = VT.getScalarSizeInBits();
	  unsigned EVTBits = ExtVT.getScalarSizeInBits();

	  if (N0.isUndef())
	    return DAG.getUNDEF(VT);

	  // fold (sext_in_reg c1) -> c1
	  // Re-creating the node lets getNode fold the constant operand.
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);

	  // If the input is already sign extended, just drop the extension.
	  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
	    return N0;

	  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
	  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
	      ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
	                       N0.getOperand(0), N1);

	  // fold (sext_in_reg (sext x)) -> (sext x)
	  // fold (sext_in_reg (aext x)) -> (sext x)
	  // if x is small enough.
	  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
	    SDValue N00 = N0.getOperand(0);
	    // SIGN_EXTEND is a unary node: only the value is passed. The value-type
	    // operand N1 belongs to SIGN_EXTEND_INREG and must not be forwarded.
	    if (N00.getScalarValueSizeInBits() <= EVTBits &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
	      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
	  }

	  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x)
	  if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
	       N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
	       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
	      N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
	    if (!LegalOperations ||
	        TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
	      return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
	  }

	  // fold (sext_in_reg (zext x)) -> (sext x)
	  // iff we are extending the source sign bit.
	  if (N0.getOpcode() == ISD::ZERO_EXTEND) {
	    SDValue N00 = N0.getOperand(0);
	    // As above, SIGN_EXTEND takes the value as its only operand.
	    if (N00.getScalarValueSizeInBits() == EVTBits &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
	      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
	  }

	  // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
	  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))
	    return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT.getScalarType());

	  // fold operands of sext_in_reg based on knowledge that the top bits are not
	  // demanded.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // fold (sext_in_reg (load x)) -> (smaller sextload x)
	  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
	  if (SDValue NarrowLoad = ReduceLoadWidth(N))
	    return NarrowLoad;

	  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
	  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
	  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
	  if (N0.getOpcode() == ISD::SRL) {
	    if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
	      if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
	        // We can turn this into an SRA iff the input to the SRL is already sign
	        // extended enough.
	        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
	        if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
	          return DAG.getNode(ISD::SRA, SDLoc(N), VT,
	                             N0.getOperand(0), N0.getOperand(1));
	      }
	  }

	  // fold (sext_inreg (extload x)) -> (sextload x)
	  // If sextload is not supported by target, we can only do the combine when
	  // load has one use. Doing otherwise can block folding the extload with other
	  // extends that the target does support.
	  if (ISD::isEXTLoad(N0.getNode()) &&
	      ISD::isUNINDEXEDLoad(N0.getNode()) &&
	      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
	      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
	        N0.hasOneUse()) ||
	       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
	                                     LN0->getChain(),
	                                     LN0->getBasePtr(), ExtVT,
	                                     LN0->getMemOperand());
	    CombineTo(N, ExtLoad);
	    // Redirect both results (value and chain) of the old load.
	    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	    AddToWorklist(ExtLoad.getNode());
	    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	  }
	  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
	  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	      N0.hasOneUse() &&
	      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
	      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
	       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
	                                     LN0->getChain(),
	                                     LN0->getBasePtr(), ExtVT,
	                                     LN0->getMemOperand());
	    CombineTo(N, ExtLoad);
	    // Redirect both results (value and chain) of the old load.
	    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	  }

	  // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
	  if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) {
	    if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
	                                           N0.getOperand(1), false))
	      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
	                         BSwap, N1);
	  }

	  return SDValue();
	}

	/// Combine a SIGN_EXTEND_VECTOR_INREG node.
	///
	/// Returns UNDEF of the result type when the operand is undef, otherwise
	/// whatever tryToFoldExtendOfConstant produces (an empty SDValue if it
	/// declines).
	SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
	  const EVT ResultVT = N->getValueType(0);

	  // An undef input extends to an undef result.
	  if (N->getOperand(0).isUndef())
	    return DAG.getUNDEF(ResultVT);

	  // The only remaining fold is delegated to the shared extend helper.
	  SDNode *Folded =
	      tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations);
	  return Folded ? SDValue(Folded, 0) : SDValue();
	}

	/// Combine a ZERO_EXTEND_VECTOR_INREG node.
	///
	/// An undef operand folds to UNDEF of the result type; any other operand is
	/// handed to tryToFoldExtendOfConstant, and an empty SDValue is returned if
	/// that helper does not produce a node.
	SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
	  const EVT ResultVT = N->getValueType(0);
	  const SDValue Src = N->getOperand(0);

	  if (Src.isUndef())
	    return DAG.getUNDEF(ResultVT);

	  SDNode *Folded =
	      tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations);
	  if (!Folded)
	    return SDValue();
	  return SDValue(Folded, 0);
	}

	/// Combine an ISD::TRUNCATE node.
	///
	/// The folds below are tried in order and the first one that applies wins.
	/// Returns the replacement value, SDValue(N, 0) when N was simplified in
	/// place by SimplifyDemandedBits, or an empty SDValue when no fold applied
	/// (including the deliberate bail-out when the only user is an ANY_EXTEND).
	SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  bool isLE = DAG.getDataLayout().isLittleEndian();

	  // noop truncate
	  if (N0.getValueType() == N->getValueType(0))
	    return N0;

	  // fold (truncate (truncate x)) -> (truncate x)
	  if (N0.getOpcode() == ISD::TRUNCATE)
	    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));

	  // fold (truncate c1) -> c1
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
	    SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
	    // getNode may hand back N itself if it could not fold; only report a
	    // change when a different node came back.
	    if (C.getNode() != N)
	      return C;
	  }

	  // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
	  if (N0.getOpcode() == ISD::ZERO_EXTEND ||
	      N0.getOpcode() == ISD::SIGN_EXTEND ||
	      N0.getOpcode() == ISD::ANY_EXTEND) {
	    // if the source is smaller than the dest, we still need an extend.
	    if (N0.getOperand(0).getValueType().bitsLT(VT))
	      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
	    // if the source is larger than the dest, then we just need the truncate.
	    if (N0.getOperand(0).getValueType().bitsGT(VT))
	      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
	    // if the source and dest are the same type, we can drop both the extend
	    // and the truncate.
	    return N0.getOperand(0);
	  }

	  // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
	  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
	    return SDValue();

	  // Fold extract-and-trunc into a narrow extract. For example:
	  //   i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
	  //   i32 y = TRUNCATE(i64 x)
	  //       -- becomes (little-endian; the index is 3 on big-endian) --
	  //   v4i32 b = BITCAST (v2i64 val)
	  //   i32 y = EXTRACT_VECTOR_ELT(v4i32 b, i32 2)
	  //
	  // Note: We only run this optimization after type legalization (which often
	  // creates this pattern) and before operation legalization after which
	  // we need to be more careful about the vector instructions that we generate.
	  if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	      LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
	    EVT VecTy = N0.getOperand(0).getValueType();
	    EVT ExTy = N0.getValueType();
	    EVT TrTy = N->getValueType(0);

	    unsigned NumElem = VecTy.getVectorNumElements();
	    unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();

	    // Same total width as VecTy, but split into TrTy-sized lanes.
	    EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
	    assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");

	    SDValue EltNo = N0->getOperand(1);
	    if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
	      int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
	      EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	      // The low part of wide element Elt is the first narrow lane of its
	      // group on little-endian targets and the last one on big-endian.
	      int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));

	      SDLoc DL(N);
	      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
	                         DAG.getBitcast(NVT, N0.getOperand(0)),
	                         DAG.getConstant(Index, DL, IndexTy));
	    }
	  }

	  // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
	  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
	    EVT SrcVT = N0.getValueType();
	    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
	        TLI.isTruncateFree(SrcVT, VT)) {
	      SDLoc SL(N0);
	      SDValue Cond = N0.getOperand(0);
	      SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
	      SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
	      return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
	    }
	  }

	  // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
	  if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
	      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
	      TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
	    SDValue Amt = N0.getOperand(1);
	    KnownBits Known;
	    DAG.computeKnownBits(Amt, Known);
	    unsigned Size = VT.getScalarSizeInBits();
	    // Only proceed when the known-bits analysis bounds the number of
	    // possibly-set bits in the shift amount by Log2_32(Size).
	    if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
	      SDLoc SL(N);
	      EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());

	      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
	      if (AmtVT != Amt.getValueType()) {
	        Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
	        AddToWorklist(Amt.getNode());
	      }
	      return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
	    }
	  }

	  // Fold a series of buildvector, bitcast, and truncate if possible.
	  // For example fold
	  //   (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
	  //   (2xi32 (buildvector x, y)).
	  if (Level == AfterLegalizeVectorOps && VT.isVector() &&
	      N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
	      N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
	      N0.getOperand(0).hasOneUse()) {
	    SDValue BuildVect = N0.getOperand(0);
	    EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
	    EVT TruncVecEltTy = VT.getVectorElementType();

	    // Check that the element types match.
	    if (BuildVectEltTy == TruncVecEltTy) {
	      // Now we only need to compute the offset of the truncated elements.
	      unsigned BuildVecNumElts =  BuildVect.getNumOperands();
	      unsigned TruncVecNumElts = VT.getVectorNumElements();
	      unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;

	      assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
	             "Invalid number of elements");

	      SmallVector<SDValue, 8> Opnds;
	      for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
	        Opnds.push_back(BuildVect.getOperand(i));

	      return DAG.getBuildVector(VT, SDLoc(N), Opnds);
	    }
	  }

	  // See if we can simplify the input to this truncate through knowledge that
	  // only the low bits are being used.
	  // For example "trunc (or (shl x, 8), y)" // -> trunc y
	  // Currently we only perform this optimization on scalars because vectors
	  // may have different active low bits.
	  if (!VT.isVector()) {
	    APInt Mask =
	        APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
	    if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
	      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
	  }

	  // fold (truncate (load x)) -> (smaller load x)
	  // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
	  if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
	    if (SDValue Reduced = ReduceLoadWidth(N))
	      return Reduced;

	    // Handle the case where the load remains an extending load even
	    // after truncation.
	    if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
	      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	      if (!LN0->isVolatile() &&
	          LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
	        SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
	                                         VT, LN0->getChain(), LN0->getBasePtr(),
	                                         LN0->getMemoryVT(),
	                                         LN0->getMemOperand());
	        // Redirect users of the old load's chain result to the new load.
	        DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
	        return NewLoad;
	      }
	    }
	  }

	  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
	  // where ... are all 'undef'.
	  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
	    SmallVector<EVT, 8> VTs;
	    SDValue V;
	    unsigned Idx = 0;
	    unsigned NumDefs = 0;

	    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
	      SDValue X = N0.getOperand(i);
	      if (!X.isUndef()) {
	        V = X;
	        Idx = i;
	        NumDefs++;
	      }
	      // Stop if more than one members are non-undef.
	      if (NumDefs > 1)
	        break;
	      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
	                                     VT.getVectorElementType(),
	                                     X.getValueType().getVectorNumElements()));
	    }

	    if (NumDefs == 0)
	      return DAG.getUNDEF(VT);

	    if (NumDefs == 1) {
	      assert(V.getNode() && "The single defined operand is empty!");
	      SmallVector<SDValue, 8> Opnds;
	      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
	        if (i != Idx) {
	          Opnds.push_back(DAG.getUNDEF(VTs[i]));
	          continue;
	        }
	        SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
	        AddToWorklist(NV.getNode());
	        Opnds.push_back(NV);
	      }
	      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
	    }
	  }

	  // Fold truncate of a bitcast of a vector to an extract of the low vector
	  // element.
	  //
	  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
	  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
	    SDValue VecSrc = N0.getOperand(0);
	    EVT SrcVT = VecSrc.getValueType();
	    if (SrcVT.isVector() && SrcVT.getScalarType() == VT &&
	        (!LegalOperations ||
	         TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) {
	      SDLoc SL(N);

	      EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
	      // The low bits live in lane 0 on little-endian, the last lane otherwise.
	      unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1;
	      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
	                         VecSrc, DAG.getConstant(Idx, SL, IdxVT));
	    }
	  }

	  // Simplify the operands using demanded-bits information.
	  if (!VT.isVector() &&
	      SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
	  // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
	  // When the adde's carry is not used.
	  if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
	      N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
	      (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) {
	    SDLoc SL(N);
	    auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
	    auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
	    auto VTs = DAG.getVTList(VT, N0->getValueType(1));
	    return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
	  }

	  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	    return NewVSel;

	  return SDValue();
	}

	/// Return the node that provides operand \p i of \p N.
	///
	/// If that operand is a MERGE_VALUES node, look through it and return the
	/// node feeding the result number the operand refers to; otherwise return
	/// the operand's own node.
	static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
	  SDValue Elt = N->getOperand(i);
	  if (Elt.getOpcode() != ISD::MERGE_VALUES)
	    return Elt.getNode();
	  // Result k of a MERGE_VALUES is taken from its operand k.
	  return Elt.getOperand(Elt.getResNo()).getNode();
	}

	/// build_pair (load, load) -> load
	/// if load locations are consecutive.
	///
	/// \p N must be a BUILD_PAIR; \p VT is the type of the single wide load to
	/// create. Returns the new load, or an empty SDValue when the two halves are
	/// not suitable (not both simple one-use loads in the same address space,
	/// not consecutive, insufficient alignment, or an illegal wide load).
	SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
	  assert(N->getOpcode() == ISD::BUILD_PAIR);

	  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
	  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));

	  // A BUILD_PAIR is always having the least significant part in elt 0 and the
	  // most significant part in elt 1. So when combining into one large load, we
	  // need to consider the endianness.
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(LD1, LD2);

	  // From here on LD1 is the load expected at the lower address.
	  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
	      LD1->getAddressSpace() != LD2->getAddressSpace())
	    return SDValue();
	  EVT LD1VT = LD1->getValueType(0);
	  unsigned LD1Bytes = LD1VT.getStoreSize();
	  if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
	      DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
	    unsigned Align = LD1->getAlignment();
	    unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	        VT.getTypeForEVT(*DAG.getContext()));

	    // Only merge when the existing alignment already satisfies the ABI
	    // alignment of the wide type.
	    if (NewAlign <= Align &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
	      return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
	                         LD1->getPointerInfo(), Align);
	  }

	  return SDValue();
	}

	/// Pick the EXTRACT_ELEMENT index used to reach the Hi part of a ppcf128
	/// value that has been bitcast to i128: 0 on little-endian targets, 1 on
	/// big-endian targets.
	static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
	  // The ppcf128 -> i128 bitcast swaps the Hi and Lo parts on little-endian
	  // machines only, so the selector depends on the target's endianness.
	  if (DAG.getDataLayout().isBigEndian())
	    return 1;
	  return 0;
	}

	/// Recognize integer sign-bit logic performed on a bitcasted FP value and
	/// turn it back into the equivalent FP operation.
	///
	/// \p N is a bitcast node. When its result is a scalar FP type for which the
	/// target reports bit-preserving FP logic, and its operand is
	///   (and (bitcast X), ~signmask)  or  (xor (bitcast X), signmask)
	/// with X already of the result type, returns (fabs X) or (fneg X)
	/// respectively. Returns an empty SDValue otherwise.
	static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
	                                    const TargetLowering &TLI) {
	  // If this is not a bitcast to an FP type or if the target doesn't have
	  // IEEE754-compliant FP logic, we're done.
	  EVT VT = N->getValueType(0);
	  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
	    return SDValue();

	  // TODO: Use splat values for the constant-checking below and remove this
	  // restriction.
	  SDValue N0 = N->getOperand(0);
	  EVT SourceVT = N0.getValueType();
	  if (SourceVT.isVector())
	    return SDValue();

	  // Map the integer logic opcode to the FP opcode and the mask constant it
	  // must be paired with.
	  unsigned FPOpcode;
	  APInt SignMask;
	  switch (N0.getOpcode()) {
	  case ISD::AND:
	    FPOpcode = ISD::FABS;
	    SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits());
	    break;
	  case ISD::XOR:
	    FPOpcode = ISD::FNEG;
	    SignMask = APInt::getSignMask(SourceVT.getSizeInBits());
	    break;
	  // TODO: ISD::OR --> ISD::FNABS?
	  default:
	    return SDValue();
	  }

	  // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
	  // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
	  SDValue LogicOp0 = N0.getOperand(0);
	  ConstantSDNode *LogicOp1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
	      LogicOp0.getOpcode() == ISD::BITCAST &&
	      LogicOp0->getOperand(0).getValueType() == VT)
	    return DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0->getOperand(0));

	  return SDValue();
	}

	/// Combine an ISD::BITCAST node.
	///
	/// The folds below are tried in order and the first one that applies wins.
	/// Returns the replacement value, or an empty SDValue when no fold applied.
	/// Note that the shuffle fold at the end returns an empty SDValue as soon as
	/// one shuffle operand cannot be looked through.
	SDValue DAGCombiner::visitBITCAST(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  if (N0.isUndef())
	    return DAG.getUNDEF(VT);

	  // If the input is a BUILD_VECTOR with all constant elements, fold this now.
	  // Only do this before legalize, since afterward the target may be depending
	  // on the bitconvert.
	  // First check to see if this is all constant.
	  if (!LegalTypes &&
	      N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
	      VT.isVector()) {
	    bool isSimple = cast<BuildVectorSDNode>(N0)->isConstant();

	    EVT DestEltVT = N->getValueType(0).getVectorElementType();
	    assert(!DestEltVT.isVector() &&
	           "Element type of vector ValueType must not be vector!");
	    if (isSimple)
	      return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT);
	  }

	  // If the input is a constant, let getNode fold it.
	  if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
	    // If we can't allow illegal operations, we need to check that this is just
	    // a fp -> int or int -> fp conversion and that the resulting operation will
	    // be legal.
	    if (!LegalOperations ||
	        (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
	         TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
	        (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
	         TLI.isOperationLegal(ISD::Constant, VT)))
	      return DAG.getBitcast(VT, N0);
	  }

	  // (conv (conv x, t1), t2) -> (conv x, t2)
	  if (N0.getOpcode() == ISD::BITCAST)
	    return DAG.getBitcast(VT, N0.getOperand(0));

	  // fold (conv (load x)) -> (load (conv*)x)
	  // If the resultant load doesn't need a higher alignment than the original!
	  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
	      // Do not change the width of a volatile load.
	      !cast<LoadSDNode>(N0)->isVolatile() &&
	      // Do not remove the cast if the types differ in endian layout.
	      TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
	          TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
	      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
	      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    unsigned OrigAlign = LN0->getAlignment();

	    bool Fast = false;
	    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	                               LN0->getAddressSpace(), OrigAlign, &Fast) &&
	        Fast) {
	      SDValue Load =
	          DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
	                      LN0->getPointerInfo(), OrigAlign,
	                      LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
	      // Redirect users of the old load's chain result to the new load.
	      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
	      return Load;
	    }
	  }

	  if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
	    return V;

	  // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
	  // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
	  //
	  // For ppc_fp128:
	  // fold (bitcast (fneg x)) ->
	  //     flipbit = signbit
	  //     (xor (bitcast x) (build_pair flipbit, flipbit))
	  //
	  // fold (bitcast (fabs x)) ->
	  //     flipbit = (and (extract_element (bitcast x), 0), signbit)
	  //     (xor (bitcast x) (build_pair flipbit, flipbit))
	  // This often reduces constant pool loads.
	  if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
	       (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
	      N0.getNode()->hasOneUse() && VT.isInteger() &&
	      !VT.isVector() && !N0.getValueType().isVector()) {
	    SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
	    AddToWorklist(NewConv.getNode());

	    SDLoc DL(N);
	    if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
	      assert(VT.getSizeInBits() == 128);
	      SDValue SignBit = DAG.getConstant(
	          APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
	      SDValue FlipBit;
	      if (N0.getOpcode() == ISD::FNEG) {
	        FlipBit = SignBit;
	        AddToWorklist(FlipBit.getNode());
	      } else {
	        assert(N0.getOpcode() == ISD::FABS);
	        SDValue Hi =
	            DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
	                        DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
	                                              SDLoc(NewConv)));
	        AddToWorklist(Hi.getNode());
	        FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
	        AddToWorklist(FlipBit.getNode());
	      }
	      SDValue FlipBits =
	          DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
	      AddToWorklist(FlipBits.getNode());
	      return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
	    }
	    APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
	    if (N0.getOpcode() == ISD::FNEG)
	      return DAG.getNode(ISD::XOR, DL, VT,
	                         NewConv, DAG.getConstant(SignBit, DL, VT));
	    assert(N0.getOpcode() == ISD::FABS);
	    return DAG.getNode(ISD::AND, DL, VT,
	                       NewConv, DAG.getConstant(~SignBit, DL, VT));
	  }

	  // fold (bitconvert (fcopysign cst, x)) ->
	  //         (or (and (bitconvert x), sign), (and cst, (not sign)))
	  // Note that we don't handle (copysign x, cst) because this can always be
	  // folded to an fneg or fabs.
	  //
	  // For ppc_fp128:
	  // fold (bitcast (fcopysign cst, x)) ->
	  //     flipbit = (and (extract_element
	  //                     (xor (bitcast cst), (bitcast x)), 0),
	  //                    signbit)
	  //     (xor (bitcast cst) (build_pair flipbit, flipbit))
	  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
	      isa<ConstantFPSDNode>(N0.getOperand(0)) &&
	      VT.isInteger() && !VT.isVector()) {
	    unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
	    EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
	    if (isTypeLegal(IntXVT)) {
	      SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
	      AddToWorklist(X.getNode());

	      // If X has a different width than the result/lhs, sext it or truncate it.
	      unsigned VTWidth = VT.getSizeInBits();
	      if (OrigXWidth < VTWidth) {
	        X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
	        AddToWorklist(X.getNode());
	      } else if (OrigXWidth > VTWidth) {
	        // To get the sign bit in the right place, we have to shift it right
	        // before truncating.
	        SDLoc DL(X);
	        X = DAG.getNode(ISD::SRL, DL,
	                        X.getValueType(), X,
	                        DAG.getConstant(OrigXWidth-VTWidth, DL,
	                                        X.getValueType()));
	        AddToWorklist(X.getNode());
	        X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
	        AddToWorklist(X.getNode());
	      }

	      if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
	        APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
	        SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
	        AddToWorklist(Cst.getNode());
	        // Note: this X intentionally shadows the outer X inside this branch.
	        SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
	        AddToWorklist(X.getNode());
	        SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
	        AddToWorklist(XorResult.getNode());
	        SDValue XorResult64 = DAG.getNode(
	            ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
	            DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
	                                  SDLoc(XorResult)));
	        AddToWorklist(XorResult64.getNode());
	        SDValue FlipBit =
	            DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
	                        DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
	        AddToWorklist(FlipBit.getNode());
	        SDValue FlipBits =
	            DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
	        AddToWorklist(FlipBits.getNode());
	        return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
	      }
	      APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
	      X = DAG.getNode(ISD::AND, SDLoc(X), VT,
	                      X, DAG.getConstant(SignBit, SDLoc(X), VT));
	      AddToWorklist(X.getNode());

	      SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
	      Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
	                        Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
	      AddToWorklist(Cst.getNode());

	      return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
	    }
	  }

	  // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
	  if (N0.getOpcode() == ISD::BUILD_PAIR)
	    if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
	      return CombineLD;

	  // Remove double bitcasts from shuffles - this is often a legacy of
	  // XformToShuffleWithZero being used to combine bitmaskings (of
	  // float vectors bitcast to integer vectors) into shuffles.
	  // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
	  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
	      N0->getOpcode() == ISD::VECTOR_SHUFFLE &&
	      VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
	      !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
	    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);

	    // If operands are a bitcast, peek through if it casts the original VT.
	    // If operands are a constant, just bitcast back to original VT.
	    auto PeekThroughBitcast = [&](SDValue Op) {
	      if (Op.getOpcode() == ISD::BITCAST &&
	          Op.getOperand(0).getValueType() == VT)
	        return SDValue(Op.getOperand(0));
	      if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
	          ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
	        return DAG.getBitcast(VT, Op);
	      return SDValue();
	    };

	    // FIXME: If either input vector is bitcast, try to convert the shuffle to
	    // the result type of this bitcast. This would eliminate at least one
	    // bitcast. See the transform in InstCombine.
	    SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
	    SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
	    if (!(SV0 && SV1))
	      return SDValue();

	    // Each original mask entry expands to MaskScale consecutive entries of
	    // the finer-grained result type; undef (-1) entries stay undef.
	    int MaskScale =
	        VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
	    SmallVector<int, 8> NewMask;
	    for (int M : SVN->getMask())
	      for (int i = 0; i != MaskScale; ++i)
	        NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);

	    // If the mask is not legal as is, retry with the operands commuted.
	    bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
	    if (!LegalMask) {
	      std::swap(SV0, SV1);
	      ShuffleVectorSDNode::commuteMask(NewMask);
	      LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
	    }

	    if (LegalMask)
	      return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask);
	  }

	  return SDValue();
	}

	/// Combine a BUILD_PAIR node by trying to merge its two halves into a single
	/// load of the node's own result type; returns whatever
	/// CombineConsecutiveLoads returns.
	SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
	  return CombineConsecutiveLoads(N, N->getValueType(0));
	}

	/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
	/// operands. DstEltVT indicates the destination element value type.
	///
	/// Returns a vector with DstEltVT elements holding the same bits as BV. Three
	/// shapes are handled: equal element sizes (per-element bitcast), growing
	/// (several inputs packed into one output), and shrinking (one input split
	/// into several outputs). FP elements of a different size are routed through
	/// an integer vector of matching element width first.
	SDValue DAGCombiner::
	ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
	  EVT SrcEltVT = BV->getValueType(0).getVectorElementType();

	  // If this is already the right type, we're done.
	  if (SrcEltVT == DstEltVT) return SDValue(BV, 0);

	  unsigned SrcBitSize = SrcEltVT.getSizeInBits();
	  unsigned DstBitSize = DstEltVT.getSizeInBits();

	  // If this is a conversion of N elements of one type to N elements of another
	  // type, convert each element.  This handles FP<->INT cases.
	  if (SrcBitSize == DstBitSize) {
	    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
	                              BV->getValueType(0).getVectorNumElements());

	    // Due to the FP element handling below calling this routine recursively,
	    // we can end up with a scalar-to-vector node here.
	    if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
	      return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
	                         DAG.getBitcast(DstEltVT, BV->getOperand(0)));

	    SmallVector<SDValue, 8> Ops;
	    for (SDValue Op : BV->op_values()) {
	      // If the vector element type is not legal, the BUILD_VECTOR operands
	      // are promoted and implicitly truncated.  Make that explicit here.
	      if (Op.getValueType() != SrcEltVT)
	        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
	      Ops.push_back(DAG.getBitcast(DstEltVT, Op));
	      AddToWorklist(Ops.back().getNode());
	    }
	    return DAG.getBuildVector(VT, SDLoc(BV), Ops);
	  }

	  // Otherwise, we're growing or shrinking the elements.  To avoid having to
	  // handle annoying details of growing/shrinking FP values, we convert them to
	  // int first.
	  if (SrcEltVT.isFloatingPoint()) {
	    // Convert the input float vector to a int vector where the elements are the
	    // same sizes.
	    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
	    BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
	    SrcEltVT = IntVT;
	  }

	  // Now we know the input is an integer vector.  If the output is a FP type,
	  // convert to integer first, then to FP of the right size.
	  if (DstEltVT.isFloatingPoint()) {
	    EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
	    SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();

	    // Next, convert to FP elements of the same size.
	    return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
	  }

	  SDLoc DL(BV);

	  // Okay, we know the src/dst types are both integers of differing types.
	  // Handling growing first.
	  assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
	  if (SrcBitSize < DstBitSize) {
	    unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
	    // Endianness is fixed for the whole DAG; query it once, not per element.
	    bool isLE = DAG.getDataLayout().isLittleEndian();

	    SmallVector<SDValue, 8> Ops;
	    for (unsigned i = 0, e = BV->getNumOperands(); i != e;
	         i += NumInputsPerOutput) {
	      APInt NewBits = APInt(DstBitSize, 0);
	      bool EltIsUndef = true;
	      for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
	        // Shift the previously computed bits over.
	        NewBits <<= SrcBitSize;
	        // Visit the inputs from most to least significant, which is reverse
	        // operand order on little-endian targets.
	        SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
	        if (Op.isUndef()) continue;
	        EltIsUndef = false;

	        NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
	                   zextOrTrunc(SrcBitSize).zext(DstBitSize);
	      }

	      // The output element is undef only if every contributing input was.
	      if (EltIsUndef)
	        Ops.push_back(DAG.getUNDEF(DstEltVT));
	      else
	        Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
	    }

	    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
	    return DAG.getBuildVector(VT, DL, Ops);
	  }

	  // Finally, this must be the case where we are shrinking elements: each input
	  // turns into multiple outputs.
	  unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
	  EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
	                            NumOutputsPerInput*BV->getNumOperands());
	  SmallVector<SDValue, 8> Ops;

	  for (const SDValue &Op : BV->op_values()) {
	    if (Op.isUndef()) {
	      Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
	      continue;
	    }

	    APInt OpVal = cast<ConstantSDNode>(Op)->
	                  getAPIntValue().zextOrTrunc(SrcBitSize);

	    // Peel off DstBitSize bits at a time, least significant piece first.
	    for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
	      APInt ThisVal = OpVal.trunc(DstBitSize);
	      Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
	      OpVal.lshrInPlace(DstBitSize);
	    }

	    // For big endian targets, swap the order of the pieces of each element.
	    if (DAG.getDataLayout().isBigEndian())
	      std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
	  }

	  return DAG.getBuildVector(VT, DL, Ops);
	}

	/// Return true if the flags on \p N have either the allow-contract or the
	/// unsafe-algebra flag set.
	static bool isContractable(SDNode *N) {
	  SDNodeFlags F = N->getFlags();
	  return F.hasAllowContract() || F.hasUnsafeAlgebra();
	}

	/// Try to perform FMA combining on a given FADD node.
	SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);
	SDLoc SL(N);

	const TargetOptions &Options = DAG.getTarget().Options;

	// Floating-point multiply-add with intermediate rounding.
	bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));

	// Floating-point multiply-add without intermediate rounding.
	bool HasFMA =
	TLI.isFMAFasterThanFMulAndFAdd(VT) &&
	(!LegalOperations \|\| TLI.isOperationLegalOrCustom(ISD::FMA, VT));

	// No valid opcode, do not combine.
	if (!HasFMAD && !HasFMA)
	return SDValue();

	bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
	Options.UnsafeFPMath \|\| HasFMAD);
	// If the addition is not contractable, do not combine.
	if (!AllowFusionGlobally && !isContractable(N))
	return SDValue();

	const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
	if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
	return SDValue();

	// Always prefer FMAD to FMA for precision.
	unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
	bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

	// Is the node an FMUL and contractable either due to global flags or
	// SDNodeFlags.
	auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
	if (N.getOpcode() != ISD::FMUL)
	return false;
	return AllowFusionGlobally \|\| isContractable(N.getNode());
	};
	// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
	// prefer to fold the multiply with fewer uses.
	if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
	if (N0.getNode()->use_size() > N1.getNode()->use_size())
	std::swap(N0, N1);
	}

	// fold (fadd (fmul x, y), z) -> (fma x, y, z)
	if (isContractableFMUL(N0) && (Aggressive \|\| N0->hasOneUse())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1), N1);
	}

	// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
	// Note: Commutes FADD operands.
	if (isContractableFMUL(N1) && (Aggressive \|\| N1->hasOneUse())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N1.getOperand(0), N1.getOperand(1), N0);
	}

	// Look through FP_EXTEND nodes to do more combining.

	// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (isContractableFMUL(N00) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(1)), N1);
	}
	}

	// fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
	// Note: Commutes FADD operands.
	if (N1.getOpcode() == ISD::FP_EXTEND) {
	SDValue N10 = N1.getOperand(0);
	if (isContractableFMUL(N10) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N10.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N10.getOperand(1)), N0);
	}
	}

	// More folding opportunities when target permits.
	if (Aggressive) {
	// fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
	// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	// are currently only supported on binary nodes.
	if (Options.UnsafeFPMath &&
	N0.getOpcode() == PreferredFusedOpcode &&
	N0.getOperand(2).getOpcode() == ISD::FMUL &&
	N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(2).getOperand(0),
	N0.getOperand(2).getOperand(1),
	N1));
	}

	// fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
	// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	// are currently only supported on binary nodes.
	if (Options.UnsafeFPMath &&
	N1->getOpcode() == PreferredFusedOpcode &&
	N1.getOperand(2).getOpcode() == ISD::FMUL &&
	N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N1.getOperand(0), N1.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	N1.getOperand(2).getOperand(0),
	N1.getOperand(2).getOperand(1),
	N0));
	}


	// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
	// -> (fma x, y, (fma (fpext u), (fpext v), z))
	auto FoldFAddFMAFPExtFMul = [&] (
	SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
	Z));
	};
	if (N0.getOpcode() == PreferredFusedOpcode) {
	SDValue N02 = N0.getOperand(2);
	if (N02.getOpcode() == ISD::FP_EXTEND) {
	SDValue N020 = N02.getOperand(0);
	if (isContractableFMUL(N020) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
	return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
	N020.getOperand(0), N020.getOperand(1),
	N1);
	}
	}
	}

	// fold (fadd (fpext (fma x, y, (fmul u, v))), z)
	// -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
	// FIXME: This turns two single-precision and one double-precision
	// operation into two double-precision operations, which might not be
	// interesting for all targets, especially GPUs.
	auto FoldFAddFPExtFMAFMul = [&] (
	SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
	Z));
	};
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getOpcode() == PreferredFusedOpcode) {
	SDValue N002 = N00.getOperand(2);
	if (isContractableFMUL(N002) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
	N002.getOperand(0), N002.getOperand(1),
	N1);
	}
	}
	}

	// fold (fadd x, (fma y, z, (fpext (fmul u, v)))
	// -> (fma y, z, (fma (fpext u), (fpext v), x))
	if (N1.getOpcode() == PreferredFusedOpcode) {
	SDValue N12 = N1.getOperand(2);
	if (N12.getOpcode() == ISD::FP_EXTEND) {
	SDValue N120 = N12.getOperand(0);
	if (isContractableFMUL(N120) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
	return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
	N120.getOperand(0), N120.getOperand(1),
	N0);
	}
	}
	}

	// fold (fadd x, (fpext (fma y, z, (fmul u, v)))
	// -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
	// FIXME: This turns two single-precision and one double-precision
	// operation into two double-precision operations, which might not be
	// interesting for all targets, especially GPUs.
	if (N1.getOpcode() == ISD::FP_EXTEND) {
	SDValue N10 = N1.getOperand(0);
	if (N10.getOpcode() == PreferredFusedOpcode) {
	SDValue N102 = N10.getOperand(2);
	if (isContractableFMUL(N102) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
	return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
	N102.getOperand(0), N102.getOperand(1),
	N0);
	}
	}
	}
	}

	return SDValue();
	}

	/// Try to perform FMA combining on a given FSUB node.
	///
	/// Looks for a contractable FMUL feeding either operand of the subtraction
	/// (directly, or behind FNEG, FP_EXTEND, or an already-fused node) and rewrites
	/// the expression using the preferred fused opcode (FMAD when legal, FMA
	/// otherwise).
	///
	/// \param N the node whose operands 0 and 1 are inspected.
	/// \returns the newly built fused expression, or an empty SDValue when no
	/// fused opcode is usable, fusion is not permitted for \p N, the subtarget
	/// reports that FMAs are generated in the machine combiner, or no pattern
	/// matches.
	SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc SL(N);

	  const TargetOptions &Options = DAG.getTarget().Options;
	  // Floating-point multiply-add with intermediate rounding.
	  bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));

	  // Floating-point multiply-add without intermediate rounding.
	  bool HasFMA =
	      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
	      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

	  // No valid opcode, do not combine.
	  if (!HasFMAD && !HasFMA)
	    return SDValue();

	  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
	                              Options.UnsafeFPMath || HasFMAD);
	  // If the subtraction is not contractable, do not combine.
	  if (!AllowFusionGlobally && !isContractable(N))
	    return SDValue();

	  // Leave the node alone when the subtarget asks for FMAs to be formed in the
	  // machine combiner instead.
	  const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
	  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
	    return SDValue();

	  // Always prefer FMAD to FMA for precision.
	  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
	  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

	  // Is the node an FMUL and contractable either due to global flags or
	  // SDNodeFlags.
	  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
	    if (N.getOpcode() != ISD::FMUL)
	      return false;
	    return AllowFusionGlobally || isContractable(N.getNode());
	  };

	  // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
	  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
	    return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                       N0.getOperand(0), N0.getOperand(1),
	                       DAG.getNode(ISD::FNEG, SL, VT, N1));
	  }

	  // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
	  // Note: Commutes FSUB operands.
	  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse()))
	    return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                       DAG.getNode(ISD::FNEG, SL, VT,
	                                   N1.getOperand(0)),
	                       N1.getOperand(1), N0);

	  // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
	  if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
	      (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
	    SDValue N00 = N0.getOperand(0).getOperand(0);
	    SDValue N01 = N0.getOperand(0).getOperand(1);
	    return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                       DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
	                       DAG.getNode(ISD::FNEG, SL, VT, N1));
	  }

	  // Look through FP_EXTEND nodes to do more combining.

	  // fold (fsub (fpext (fmul x, y)), z)
	  //   -> (fma (fpext x), (fpext y), (fneg z))
	  if (N0.getOpcode() == ISD::FP_EXTEND) {
	    SDValue N00 = N0.getOperand(0);
	    if (isContractableFMUL(N00) &&
	        TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	      return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                     N00.getOperand(0)),
	                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                     N00.getOperand(1)),
	                         DAG.getNode(ISD::FNEG, SL, VT, N1));
	    }
	  }

	  // fold (fsub x, (fpext (fmul y, z)))
	  //   -> (fma (fneg (fpext y)), (fpext z), x)
	  // Note: Commutes FSUB operands.
	  if (N1.getOpcode() == ISD::FP_EXTEND) {
	    SDValue N10 = N1.getOperand(0);
	    if (isContractableFMUL(N10) &&
	        TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
	      return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                         DAG.getNode(ISD::FNEG, SL, VT,
	                                     DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                 N10.getOperand(0))),
	                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                     N10.getOperand(1)),
	                         N0);
	    }
	  }

	  // fold (fsub (fpext (fneg (fmul x, y))), z)
	  //   -> (fneg (fma (fpext x), (fpext y), z))
	  // Note: This could be removed with appropriate canonicalization of the
	  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
	  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
	  // from implementing the canonicalization in visitFSUB.
	  if (N0.getOpcode() == ISD::FP_EXTEND) {
	    SDValue N00 = N0.getOperand(0);
	    if (N00.getOpcode() == ISD::FNEG) {
	      SDValue N000 = N00.getOperand(0);
	      if (isContractableFMUL(N000) &&
	          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	        return DAG.getNode(ISD::FNEG, SL, VT,
	                           DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N000.getOperand(0)),
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N000.getOperand(1)),
	                                       N1));
	      }
	    }
	  }

	  // fold (fsub (fneg (fpext (fmul x, y))), z)
	  //   -> (fneg (fma (fpext x), (fpext y), z))
	  // Note: This could be removed with appropriate canonicalization of the
	  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
	  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
	  // from implementing the canonicalization in visitFSUB.
	  if (N0.getOpcode() == ISD::FNEG) {
	    SDValue N00 = N0.getOperand(0);
	    if (N00.getOpcode() == ISD::FP_EXTEND) {
	      SDValue N000 = N00.getOperand(0);
	      if (isContractableFMUL(N000) &&
	          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) {
	        return DAG.getNode(ISD::FNEG, SL, VT,
	                           DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N000.getOperand(0)),
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N000.getOperand(1)),
	                                       N1));
	      }
	    }
	  }

	  // More folding opportunities when target permits.
	  if (Aggressive) {
	    // fold (fsub (fma x, y, (fmul u, v)), z)
	    //   -> (fma x, y (fma u, v, (fneg z)))
	    // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	    // are currently only supported on binary nodes.
	    if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode &&
	        isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
	        N0.getOperand(2)->hasOneUse()) {
	      return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                         N0.getOperand(0), N0.getOperand(1),
	                         DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                     N0.getOperand(2).getOperand(0),
	                                     N0.getOperand(2).getOperand(1),
	                                     DAG.getNode(ISD::FNEG, SL, VT,
	                                                 N1)));
	    }

	    // fold (fsub x, (fma y, z, (fmul u, v)))
	    //   -> (fma (fneg y), z, (fma (fneg u), v, x))
	    // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	    // are currently only supported on binary nodes.
	    if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode &&
	        isContractableFMUL(N1.getOperand(2))) {
	      SDValue N20 = N1.getOperand(2).getOperand(0);
	      SDValue N21 = N1.getOperand(2).getOperand(1);
	      return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                         DAG.getNode(ISD::FNEG, SL, VT,
	                                     N1.getOperand(0)),
	                         N1.getOperand(1),
	                         DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                     DAG.getNode(ISD::FNEG, SL, VT, N20),
	                                     N21, N0));
	    }

	    // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
	    //   -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
	    if (N0.getOpcode() == PreferredFusedOpcode) {
	      SDValue N02 = N0.getOperand(2);
	      if (N02.getOpcode() == ISD::FP_EXTEND) {
	        SDValue N020 = N02.getOperand(0);
	        if (isContractableFMUL(N020) &&
	            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
	          return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                             N0.getOperand(0), N0.getOperand(1),
	                             DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                     N020.getOperand(0)),
	                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                     N020.getOperand(1)),
	                                         DAG.getNode(ISD::FNEG, SL, VT,
	                                                     N1)));
	        }
	      }
	    }

	    // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
	    //   -> (fma (fpext x), (fpext y),
	    //           (fma (fpext u), (fpext v), (fneg z)))
	    // FIXME: This turns two single-precision and one double-precision
	    // operation into two double-precision operations, which might not be
	    // interesting for all targets, especially GPUs.
	    if (N0.getOpcode() == ISD::FP_EXTEND) {
	      SDValue N00 = N0.getOperand(0);
	      if (N00.getOpcode() == PreferredFusedOpcode) {
	        SDValue N002 = N00.getOperand(2);
	        if (isContractableFMUL(N002) &&
	            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	          return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                             DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                         N00.getOperand(0)),
	                             DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                         N00.getOperand(1)),
	                             DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                     N002.getOperand(0)),
	                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                     N002.getOperand(1)),
	                                         DAG.getNode(ISD::FNEG, SL, VT,
	                                                     N1)));
	        }
	      }
	    }

	    // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
	    //   -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
	    if (N1.getOpcode() == PreferredFusedOpcode &&
	        N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
	      SDValue N120 = N1.getOperand(2).getOperand(0);
	      if (isContractableFMUL(N120) &&
	          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
	        SDValue N1200 = N120.getOperand(0);
	        SDValue N1201 = N120.getOperand(1);
	        return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                           DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
	                           N1.getOperand(1),
	                           DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                       DAG.getNode(ISD::FNEG, SL, VT,
	                                                   DAG.getNode(ISD::FP_EXTEND, SL,
	                                                               VT, N1200)),
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N1201),
	                                       N0));
	      }
	    }

	    // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
	    //   -> (fma (fneg (fpext y)), (fpext z),
	    //           (fma (fneg (fpext u)), (fpext v), x))
	    // FIXME: This turns two single-precision and one double-precision
	    // operation into two double-precision operations, which might not be
	    // interesting for all targets, especially GPUs.
	    if (N1.getOpcode() == ISD::FP_EXTEND &&
	        N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
	      SDValue CvtSrc = N1.getOperand(0);
	      SDValue N100 = CvtSrc.getOperand(0);
	      SDValue N101 = CvtSrc.getOperand(1);
	      SDValue N102 = CvtSrc.getOperand(2);
	      if (isContractableFMUL(N102) &&
	          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) {
	        SDValue N1020 = N102.getOperand(0);
	        SDValue N1021 = N102.getOperand(1);
	        return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                           DAG.getNode(ISD::FNEG, SL, VT,
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N100)),
	                           DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
	                           DAG.getNode(PreferredFusedOpcode, SL, VT,
	                                       DAG.getNode(ISD::FNEG, SL, VT,
	                                                   DAG.getNode(ISD::FP_EXTEND, SL,
	                                                               VT, N1020)),
	                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
	                                                   N1021),
	                                       N0));
	      }
	    }
	  }

	  return SDValue();
	}

	/// Try to perform FMA combining on a given FMUL node based on the distributive
	/// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
	/// subtraction instead of addition).
	///
	/// \param N the node to combine; asserted to be an ISD::FMUL.
	/// \returns the fused replacement, or an empty SDValue when NoInfsFPMath is
	/// not set, no fused opcode is usable, or neither operand is an FADD/FSUB
	/// with a +1.0 / -1.0 constant (or constant splat) operand.
	SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc SL(N);

	  assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");

	  const TargetOptions &Options = DAG.getTarget().Options;

	  // The transforms below are incorrect when x == 0 and y == inf, because the
	  // intermediate multiplication produces a nan.
	  if (!Options.NoInfsFPMath)
	    return SDValue();

	  // Floating-point multiply-add without intermediate rounding.
	  bool HasFMA =
	      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
	      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
	      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

	  // Floating-point multiply-add with intermediate rounding. This can result
	  // in a less precise result due to the changed rounding order.
	  bool HasFMAD = Options.UnsafeFPMath &&
	                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));

	  // No valid opcode, do not combine.
	  if (!HasFMAD && !HasFMA)
	    return SDValue();

	  // Always prefer FMAD to FMA for precision.
	  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
	  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

	  // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
	  // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
	  // X is the candidate FADD, Y is the other multiplicand.
	  auto FuseFADD = [&](SDValue X, SDValue Y) {
	    if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
	      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
	      if (XC1 && XC1->isExactlyValue(+1.0))
	        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
	      if (XC1 && XC1->isExactlyValue(-1.0))
	        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
	                           DAG.getNode(ISD::FNEG, SL, VT, Y));
	    }
	    return SDValue();
	  };

	  // Try both operand orders, since either side of the FMUL may be the FADD.
	  if (SDValue FMA = FuseFADD(N0, N1))
	    return FMA;
	  if (SDValue FMA = FuseFADD(N1, N0))
	    return FMA;

	  // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
	  // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
	  // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
	  // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
	  // X is the candidate FSUB, Y is the other multiplicand.
	  auto FuseFSUB = [&](SDValue X, SDValue Y) {
	    if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
	      auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
	      if (XC0 && XC0->isExactlyValue(+1.0))
	        return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
	                           Y);
	      if (XC0 && XC0->isExactlyValue(-1.0))
	        return DAG.getNode(PreferredFusedOpcode, SL, VT,
	                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
	                           DAG.getNode(ISD::FNEG, SL, VT, Y));

	      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
	      if (XC1 && XC1->isExactlyValue(+1.0))
	        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
	                           DAG.getNode(ISD::FNEG, SL, VT, Y));
	      if (XC1 && XC1->isExactlyValue(-1.0))
	        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
	    }
	    return SDValue();
	  };

	  // Try both operand orders, since either side of the FMUL may be the FSUB.
	  if (SDValue FMA = FuseFSUB(N0, N1))
	    return FMA;
	  if (SDValue FMA = FuseFSUB(N1, N0))
	    return FMA;

	  return SDValue();
	}

	/// Return true if \p N is an FMUL whose second operand is the constant -2.0
	/// (or a constant splat of -2.0, as recognised by isConstOrConstSplatFP).
	///
	/// The value is taken by copy: it is only inspected, so a mutable reference is
	/// not needed and would reject const values and temporaries.
	static bool isFMulNegTwo(SDValue N) {
	  if (N.getOpcode() != ISD::FMUL)
	    return false;
	  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1));
	  return CFP && CFP->isExactlyValue(-2.0);
	}

	/// Combine an FADD node.
	///
	/// Applies, in order: vector simplification, constant folding and
	/// canonicalization of a constant to the RHS, folding into a select,
	/// conversion to FSUB when an operand is cheaper negated, the
	/// (fmul B, -2.0) rewrite, the no-signed-zeros "+ 0" fold, the
	/// unsafe-fp-math reassociation folds, and finally FADD -> FMA fusion.
	///
	/// \param N the node whose operands 0 and 1 are combined.
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	SDValue DAGCombiner::visitFADD(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
	  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const SDNodeFlags Flags = N->getFlags();

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  // fold (fadd c1, c2) -> c1 + c2
	  if (N0CFP && N1CFP)
	    return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);

	  // canonicalize constant to RHS
	  if (N0CFP && !N1CFP)
	    return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (fadd A, (fneg B)) -> (fsub A, B)
	  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
	      isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
	    return DAG.getNode(ISD::FSUB, DL, VT, N0,
	                       GetNegatedExpression(N1, DAG, LegalOperations), Flags);

	  // fold (fadd (fneg A), B) -> (fsub B, A)
	  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
	      isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2)
	    return DAG.getNode(ISD::FSUB, DL, VT, N1,
	                       GetNegatedExpression(N0, DAG, LegalOperations), Flags);

	  // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B))
	  // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B))
	  // Only rewrite a multiply that has a single use. The operand is selected
	  // with the same one-use test that enables the fold, so a multi-use
	  // (fmul B, -2.0) on one side is never picked merely because the other side
	  // qualified. N1 keeps priority when both sides qualify.
	  bool FoldN1 = isFMulNegTwo(N1) && N1.hasOneUse();
	  bool FoldN0 = !FoldN1 && isFMulNegTwo(N0) && N0.hasOneUse();
	  if (FoldN0 || FoldN1) {
	    SDValue AddOp = FoldN1 ? N1.getOperand(0) : N0.getOperand(0);
	    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags);
	    return DAG.getNode(ISD::FSUB, DL, VT, FoldN1 ? N0 : N1, Add, Flags);
	  }

	  // FIXME: Auto-upgrade the target/function-level option.
	  if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
	    // fold (fadd A, 0) -> A
	    if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
	      if (N1C->isZero())
	        return N0;
	  }

	  // If 'unsafe math' is enabled, fold lots of things.
	  if (Options.UnsafeFPMath) {
	    // No FP constant should be created after legalization as Instruction
	    // Selection pass has a hard time dealing with FP constants.
	    bool AllowNewConst = (Level < AfterLegalizeDAG);

	    // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
	    if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
	        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)))
	      return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0),
	                         DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1,
	                                     Flags),
	                         Flags);

	    // If allowed, fold (fadd (fneg x), x) -> 0.0
	    if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
	      return DAG.getConstantFP(0.0, DL, VT);

	    // If allowed, fold (fadd x, (fneg x)) -> 0.0
	    if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
	      return DAG.getConstantFP(0.0, DL, VT);

	    // We can fold chains of FADD's of the same value into multiplications.
	    // This transform is not safe in general because we are reducing the number
	    // of rounding steps.
	    if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
	      if (N0.getOpcode() == ISD::FMUL) {
	        bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
	        bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));

	        // (fadd (fmul x, c), x) -> (fmul x, c+1)
	        if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
	          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
	                                       DAG.getConstantFP(1.0, DL, VT), Flags);
	          return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
	        }

	        // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
	        if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
	            N1.getOperand(0) == N1.getOperand(1) &&
	            N0.getOperand(0) == N1.getOperand(0)) {
	          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
	                                       DAG.getConstantFP(2.0, DL, VT), Flags);
	          return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
	        }
	      }

	      if (N1.getOpcode() == ISD::FMUL) {
	        bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
	        bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));

	        // (fadd x, (fmul x, c)) -> (fmul x, c+1)
	        if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
	          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
	                                       DAG.getConstantFP(1.0, DL, VT), Flags);
	          return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
	        }

	        // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
	        if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
	            N0.getOperand(0) == N0.getOperand(1) &&
	            N1.getOperand(0) == N0.getOperand(0)) {
	          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
	                                       DAG.getConstantFP(2.0, DL, VT), Flags);
	          return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
	        }
	      }

	      if (N0.getOpcode() == ISD::FADD && AllowNewConst) {
	        bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
	        // (fadd (fadd x, x), x) -> (fmul x, 3.0)
	        if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
	            (N0.getOperand(0) == N1)) {
	          return DAG.getNode(ISD::FMUL, DL, VT,
	                             N1, DAG.getConstantFP(3.0, DL, VT), Flags);
	        }
	      }

	      if (N1.getOpcode() == ISD::FADD && AllowNewConst) {
	        bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
	        // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
	        if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
	            N1.getOperand(0) == N0) {
	          return DAG.getNode(ISD::FMUL, DL, VT,
	                             N0, DAG.getConstantFP(3.0, DL, VT), Flags);
	        }
	      }

	      // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
	      if (AllowNewConst &&
	          N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
	          N0.getOperand(0) == N0.getOperand(1) &&
	          N1.getOperand(0) == N1.getOperand(1) &&
	          N0.getOperand(0) == N1.getOperand(0)) {
	        return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
	                           DAG.getConstantFP(4.0, DL, VT), Flags);
	      }
	    }
	  } // enable-unsafe-fp-math

	  // FADD -> FMA combines:
	  if (SDValue Fused = visitFADDForFMACombine(N)) {
	    AddToWorklist(Fused.getNode());
	    return Fused;
	  }
	  return SDValue();
	}

	/// Combine an FSUB node.
	///
	/// Applies, in order: vector simplification, constant folding, folding into
	/// a select, conversion to FADD when the subtrahend is negatible for free,
	/// the no-signed-zeros "0 - B" fold, the unsafe-fp-math identities, and
	/// finally FSUB -> FMA fusion.
	///
	/// \param N the node whose operands 0 and 1 are combined.
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	SDValue DAGCombiner::visitFSUB(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
	  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const SDNodeFlags Flags = N->getFlags();

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  // fold (fsub c1, c2) -> c1-c2
	  if (N0CFP && N1CFP)
	    return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (fsub A, (fneg B)) -> (fadd A, B)
	  if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
	    return DAG.getNode(ISD::FADD, DL, VT, N0,
	                       GetNegatedExpression(N1, DAG, LegalOperations), Flags);

	  // FIXME: Auto-upgrade the target/function-level option.
	  if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
	    // (fsub 0, B) -> -B
	    if (N0CFP && N0CFP->isZero()) {
	      if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
	        return GetNegatedExpression(N1, DAG, LegalOperations);
	      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
	        return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
	    }
	  }

	  // If 'unsafe math' is enabled, fold lots of things.
	  if (Options.UnsafeFPMath) {
	    // (fsub A, 0) -> A
	    if (N1CFP && N1CFP->isZero())
	      return N0;

	    // (fsub x, x) -> 0.0
	    if (N0 == N1)
	      return DAG.getConstantFP(0.0f, DL, VT);

	    // (fsub x, (fadd x, y)) -> (fneg y)
	    // (fsub x, (fadd y, x)) -> (fneg y)
	    if (N1.getOpcode() == ISD::FADD) {
	      SDValue N10 = N1->getOperand(0);
	      SDValue N11 = N1->getOperand(1);

	      if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options))
	        return GetNegatedExpression(N11, DAG, LegalOperations);

	      if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options))
	        return GetNegatedExpression(N10, DAG, LegalOperations);
	    }
	  }

	  // FSUB -> FMA combines:
	  if (SDValue Fused = visitFSUBForFMACombine(N)) {
	    AddToWorklist(Fused.getNode());
	    return Fused;
	  }

	  return SDValue();
	}

	/// Combine an FMUL node.
	///
	/// Applies, in order: vector simplification, constant folding and
	/// canonicalization of a constant to the RHS, the "* 1.0" identity, folding
	/// into a select, the unsafe-fp-math folds, "* 2.0" -> FADD, "* -1.0" ->
	/// FNEG, removal of paired negations, the select-of-(+/-1.0) FABS patterns,
	/// and finally the distributive FMUL -> FMA fusion.
	///
	/// \param N the node whose operands 0 and 1 are combined.
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	SDValue DAGCombiner::visitFMUL(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
	  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const SDNodeFlags Flags = N->getFlags();

	  // fold vector ops
	  if (VT.isVector()) {
	    // This just handles C1 * C2 for vectors. Other vector folds are below.
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;
	  }

	  // fold (fmul c1, c2) -> c1*c2
	  if (N0CFP && N1CFP)
	    return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);

	  // canonicalize constant to RHS
	  if (isConstantFPBuildVectorOrConstantFP(N0) &&
	      !isConstantFPBuildVectorOrConstantFP(N1))
	    return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);

	  // fold (fmul A, 1.0) -> A
	  if (N1CFP && N1CFP->isExactlyValue(1.0))
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  if (Options.UnsafeFPMath) {
	    // fold (fmul A, 0) -> 0
	    if (N1CFP && N1CFP->isZero())
	      return N1;

	    // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
	    if (N0.getOpcode() == ISD::FMUL) {
	      // Fold scalars or any vector constants (not just splats).
	      // This fold is done in general by InstCombine, but extra fmul insts
	      // may have been generated during lowering.
	      SDValue N00 = N0.getOperand(0);
	      SDValue N01 = N0.getOperand(1);
	      auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
	      auto *BV00 = dyn_cast<BuildVectorSDNode>(N00);
	      auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);

	      // Check 1: Make sure that the first operand of the inner multiply is NOT
	      // a constant. Otherwise, we may induce infinite looping.
	      if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) {
	        // Check 2: Make sure that the second operand of the inner multiply and
	        // the second operand of the outer multiply are constants.
	        if ((N1CFP && isConstOrConstSplatFP(N01)) ||
	            (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
	          SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
	          return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
	        }
	      }
	    }

	    // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
	    // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
	    // during an early run of DAGCombiner can prevent folding with fmuls
	    // inserted during lowering.
	    if (N0.getOpcode() == ISD::FADD &&
	        (N0.getOperand(0) == N0.getOperand(1)) &&
	        N0.hasOneUse()) {
	      const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
	      SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
	      return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
	    }
	  }

	  // fold (fmul X, 2.0) -> (fadd X, X)
	  if (N1CFP && N1CFP->isExactlyValue(+2.0))
	    return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);

	  // fold (fmul X, -1.0) -> (fneg X)
	  if (N1CFP && N1CFP->isExactlyValue(-1.0))
	    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
	      return DAG.getNode(ISD::FNEG, DL, VT, N0);

	  // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
	  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
	    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
	      // Both can be negated for free, check to see if at least one is cheaper
	      // negated.
	      if (LHSNeg == 2 || RHSNeg == 2)
	        return DAG.getNode(ISD::FMUL, DL, VT,
	                           GetNegatedExpression(N0, DAG, LegalOperations),
	                           GetNegatedExpression(N1, DAG, LegalOperations),
	                           Flags);
	    }
	  }

	  // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
	  // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
	  if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
	      (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
	      TLI.isOperationLegal(ISD::FABS, VT)) {
	    // Arrange for Select to name the SELECT operand and X the other one.
	    SDValue Select = N0, X = N1;
	    if (Select.getOpcode() != ISD::SELECT)
	      std::swap(Select, X);

	    SDValue Cond = Select.getOperand(0);
	    auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
	    auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));

	    if (TrueOpnd && FalseOpnd &&
	        Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
	        isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
	        cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
	      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	      switch (CC) {
	      default: break;
	      case ISD::SETOLT:
	      case ISD::SETULT:
	      case ISD::SETOLE:
	      case ISD::SETULE:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        // A "less than" compare selects the opposite arm, so swap the arms
	        // and share the "greater than" handling below.
	        std::swap(TrueOpnd, FalseOpnd);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETUGT:
	      case ISD::SETOGE:
	      case ISD::SETUGE:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
	            TLI.isOperationLegal(ISD::FNEG, VT))
	          return DAG.getNode(ISD::FNEG, DL, VT,
	                             DAG.getNode(ISD::FABS, DL, VT, X));
	        if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
	          return DAG.getNode(ISD::FABS, DL, VT, X);

	        break;
	      }
	    }
	  }

	  // FMUL -> FMA combines:
	  if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
	    AddToWorklist(Fused.getNode());
	    return Fused;
	  }

	  return SDValue();
	}

	/// Combine an ISD::FMA node, i.e. (fma N0, N1, N2) = N0 * N1 + N2.
	///
	/// The folds below are attempted in order and the first one that matches
	/// wins. Returns the replacement value, or an empty SDValue when no fold
	/// applies.
	SDValue DAGCombiner::visitFMA(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);
	  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
	  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;

	  // Constant fold FMA.
	  if (isa<ConstantFPSDNode>(N0) &&
	      isa<ConstantFPSDNode>(N1) &&
	      isa<ConstantFPSDNode>(N2)) {
	    return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
	  }

	  // (fma 0, x, y) -> y
	  // (fma x, 0, y) -> y
	  // Only performed when unsafe FP math is enabled.
	  if (Options.UnsafeFPMath) {
	    if (N0CFP && N0CFP->isZero())
	      return N2;
	    if (N1CFP && N1CFP->isZero())
	      return N2;
	  }

	  // (fma 1, x, y) -> (fadd x, y)
	  // (fma x, 1, y) -> (fadd x, y)
	  // TODO: The FMA node should have flags that propagate to these nodes.
	  if (N0CFP && N0CFP->isExactlyValue(1.0))
	    return DAG.getNode(ISD::FADD, DL, VT, N1, N2);
	  if (N1CFP && N1CFP->isExactlyValue(1.0))
	    return DAG.getNode(ISD::FADD, DL, VT, N0, N2);

	  // Canonicalize (fma c, x, y) -> (fma x, c, y)
	  if (isConstantFPBuildVectorOrConstantFP(N0) &&
	      !isConstantFPBuildVectorOrConstantFP(N1))
	    return DAG.getNode(ISD::FMA, DL, VT, N1, N0, N2);

	  // TODO: FMA nodes should have flags that propagate to the created nodes.
	  // For now, create a Flags object for use with all unsafe math transforms.
	  SDNodeFlags Flags;
	  Flags.setUnsafeAlgebra(true);

	  if (Options.UnsafeFPMath) {
	    // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
	    if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
	        isConstantFPBuildVectorOrConstantFP(N1) &&
	        isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
	      return DAG.getNode(ISD::FMUL, DL, VT, N0,
	                         DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
	                                     Flags), Flags);
	    }

	    // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
	    if (N0.getOpcode() == ISD::FMUL &&
	        isConstantFPBuildVectorOrConstantFP(N1) &&
	        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
	      return DAG.getNode(ISD::FMA, DL, VT,
	                         N0.getOperand(0),
	                         DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
	                                     Flags),
	                         N2);
	    }
	  }

	  if (N1CFP) {
	    // (fma x, -1, y) -> (fadd (fneg x), y)
	    // The (fma x, 1, y) case needs no handling here: it was already folded
	    // to an FADD before the canonicalization step above.
	    if (N1CFP->isExactlyValue(-1.0) &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
	      SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
	      AddToWorklist(RHSNeg.getNode());
	      // TODO: The FMA node should have flags that propagate to this node.
	      return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
	    }

	    // fma (fneg x), K, y -> fma x, -K, y
	    if (N0.getOpcode() == ISD::FNEG &&
	        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
	         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
	      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
	                         DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
	    }
	  }

	  if (Options.UnsafeFPMath) {
	    // (fma x, c, x) -> (fmul x, (c+1))
	    if (N1CFP && N0 == N2) {
	      return DAG.getNode(ISD::FMUL, DL, VT, N0,
	                         DAG.getNode(ISD::FADD, DL, VT, N1,
	                                     DAG.getConstantFP(1.0, DL, VT), Flags),
	                         Flags);
	    }

	    // (fma x, c, (fneg x)) -> (fmul x, (c-1))
	    if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
	      return DAG.getNode(ISD::FMUL, DL, VT, N0,
	                         DAG.getNode(ISD::FADD, DL, VT, N1,
	                                     DAG.getConstantFP(-1.0, DL, VT), Flags),
	                         Flags);
	    }
	  }

	  return SDValue();
	}

	// Combine multiple FDIVs with the same divisor into multiple FMULs by the
	// reciprocal.
	// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
	// Notice that this is not always beneficial. One reason is different targets
	// may have different costs for FDIV and FMUL, so sometimes the cost of two
	// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
	// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
	//
	// N is the FDIV node currently being visited. On success every eligible FDIV
	// user of N's divisor is rewritten through CombineTo and SDValue(N, 0) is
	// returned; otherwise an empty SDValue is returned and the DAG is untouched.
	SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
	// The transform needs either the global unsafe-math option or the
	// allow-reciprocal flag on this particular node.
	bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
	const SDNodeFlags Flags = N->getFlags();
	if (!UnsafeMath && !Flags.hasAllowReciprocal())
	return SDValue();

	// Skip if current node is a reciprocal.
	SDValue N0 = N->getOperand(0);
	ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
	if (N0CFP && N0CFP->isExactlyValue(1.0))
	return SDValue();

	// Exit early if the target does not want this transform or if there can't
	// possibly be enough uses of the divisor to make the transform worthwhile.
	// A MinUses of zero is treated as "transform disabled".
	SDValue N1 = N->getOperand(1);
	unsigned MinUses = TLI.combineRepeatedFPDivisors();
	if (!MinUses \|\| N1->use_size() < MinUses)
	return SDValue();

	// Find all FDIV users of the same divisor.
	// Use a set because duplicates may be present in the user list.
	SetVector<SDNode *> Users;
	for (auto *U : N1->uses()) {
	// Only count users where N1 is the divisor (operand 1), not the dividend.
	if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
	// This division is eligible for optimization only if global unsafe math
	// is enabled or if this division allows reciprocal formation.
	if (UnsafeMath \|\| U->getFlags().hasAllowReciprocal())
	Users.insert(U);
	}
	}

	// Now that we have the actual number of divisor uses, make sure it meets
	// the minimum threshold specified by the target.
	if (Users.size() < MinUses)
	return SDValue();

	// Build the shared reciprocal (1.0 / Divisor), carrying N's flags.
	EVT VT = N->getValueType(0);
	SDLoc DL(N);
	SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
	SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);

	// Dividend / Divisor -> Dividend * Reciprocal
	for (auto *U : Users) {
	SDValue Dividend = U->getOperand(0);
	if (Dividend != FPOne) {
	SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
	Reciprocal, Flags);
	CombineTo(U, NewNode);
	} else if (U != Reciprocal.getNode()) {
	// In the absence of fast-math-flags, this user node is always the
	// same node as Reciprocal, but with FMF they may be different nodes.
	CombineTo(U, Reciprocal);
	}
	}
	return SDValue(N, 0); // N was replaced.
	}

	/// Combine an ISD::FDIV node (fdiv N0, N1).
	///
	/// Folds are attempted in the order written below; the first one that
	/// produces a value is returned. An empty SDValue means no fold applied.
	SDValue DAGCombiner::visitFDIV(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
	ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);
	const TargetOptions &Options = DAG.getTarget().Options;
	SDNodeFlags Flags = N->getFlags();

	// fold vector ops
	if (VT.isVector())
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// fold (fdiv c1, c2) -> c1/c2
	if (N0CFP && N1CFP)
	return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// Everything in this block replaces the divide by a multiply and is only
	// attempted when the global unsafe-FP-math option is set.
	if (Options.UnsafeFPMath) {
	// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
	if (N1CFP) {
	// Compute the reciprocal 1.0 / c2.
	const APFloat &N1APF = N1CFP->getValueAPF();
	APFloat Recip(N1APF.getSemantics(), 1); // 1.0
	APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
	// Only do the transform if the reciprocal is a legal fp immediate that
	// isn't too nasty (eg NaN, denormal, ...).
	if ((st == APFloat::opOK \|\| st == APFloat::opInexact) && // Not too nasty
	(!LegalOperations \|\|
	// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
	// backend)... we should handle this gracefully after Legalize.
	// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) \|\|
	TLI.isOperationLegal(ISD::ConstantFP, VT) \|\|
	TLI.isFPImmLegal(Recip, VT)))
	return DAG.getNode(ISD::FMUL, DL, VT, N0,
	DAG.getConstantFP(Recip, DL, VT), Flags);
	}

	// If this FDIV is part of a reciprocal square root, it may be folded
	// into a target-specific square root estimate instruction.
	if (N1.getOpcode() == ISD::FSQRT) {
	// x / sqrt(z) -> x * rsqrt(z)
	if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	} else if (N1.getOpcode() == ISD::FP_EXTEND &&
	N1.getOperand(0).getOpcode() == ISD::FSQRT) {
	// x / fp_extend(sqrt(z)) -> x * fp_extend(rsqrt(z))
	if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
	Flags)) {
	RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	} else if (N1.getOpcode() == ISD::FP_ROUND &&
	N1.getOperand(0).getOpcode() == ISD::FSQRT) {
	// x / fp_round(sqrt(z)) -> x * fp_round(rsqrt(z)); the original
	// FP_ROUND's second operand is reused for the new FP_ROUND.
	if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
	Flags)) {
	RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	} else if (N1.getOpcode() == ISD::FMUL) {
	// Look through an FMUL. Even though this won't remove the FDIV directly,
	// it's still worthwhile to get rid of the FSQRT if possible.
	SDValue SqrtOp;
	SDValue OtherOp;
	if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
	SqrtOp = N1.getOperand(0);
	OtherOp = N1.getOperand(1);
	} else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
	SqrtOp = N1.getOperand(1);
	OtherOp = N1.getOperand(0);
	}
	// SqrtOp stays empty when neither FMUL operand is an FSQRT.
	if (SqrtOp.getNode()) {
	// We found a FSQRT, so try to make this fold:
	// x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
	if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
	RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	}
	}

	// Fold into a reciprocal estimate and multiply instead of a real divide.
	if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	}

	// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
	if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
	if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
	// Both can be negated for free, check to see if at least one is cheaper
	// negated.
	if (LHSNeg == 2 \|\| RHSNeg == 2)
	return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
	GetNegatedExpression(N0, DAG, LegalOperations),
	GetNegatedExpression(N1, DAG, LegalOperations),
	Flags);
	}
	}

	// Last resort: share one reciprocal between several divisions by the same
	// divisor.
	if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
	return CombineRepeatedDivisors;

	return SDValue();
	}

	/// Combine an ISD::FREM node (frem LHS, RHS).
	///
	/// Returns the replacement value, or an empty SDValue if nothing applies.
	SDValue DAGCombiner::visitFREM(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // fold (frem c1, c2) -> fmod(c1,c2)
	  const bool BothConstant =
	      isa<ConstantFPSDNode>(LHS) && isa<ConstantFPSDNode>(RHS);
	  if (BothConstant)
	    return DAG.getNode(ISD::FREM, SDLoc(N), VT, LHS, RHS, N->getFlags());

	  // Otherwise the only remaining option is pushing the op into a select.
	  SDValue NewSel = foldBinOpIntoSelect(N);
	  if (NewSel)
	    return NewSel;

	  return SDValue();
	}

	/// Combine an ISD::FSQRT node by replacing it with a square-root estimate.
	///
	/// Nothing is done unless unsafe FP math is enabled and the target does not
	/// report the square root as cheap.
	SDValue DAGCombiner::visitFSQRT(SDNode *N) {
	  const bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
	  SDValue Operand = N->getOperand(0);

	  if (!UnsafeMath || TLI.isFsqrtCheap(Operand, DAG))
	    return SDValue();

	  // TODO: FSQRT nodes should have flags that propagate to the created nodes.
	  // For now, create a Flags object for use with all unsafe math transforms.
	  SDNodeFlags EstimateFlags;
	  EstimateFlags.setUnsafeAlgebra(true);
	  return buildSqrtEstimate(Operand, EstimateFlags);
	}

	/// Return true when the sign operand of the FCOPYSIGN node \p N is an
	/// FP_EXTEND or FP_ROUND that can be looked through:
	///   copysign(x, fp_extend(y)) -> copysign(x, y)
	///   copysign(x, fp_round(y)) -> copysign(x, y)
	static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
	  SDValue SignOp = N->getOperand(1);
	  const unsigned SignOpc = SignOp.getOpcode();
	  if (SignOpc != ISD::FP_EXTEND && SignOpc != ISD::FP_ROUND)
	    return false;

	  // Do not optimize out type conversion of f128 type yet.
	  // For some targets like x86_64, configuration is changed to keep one f128
	  // value in one SSE register, but instruction selection cannot handle
	  // FCOPYSIGN on SSE registers yet.
	  EVT ConvertedVT = SignOp->getValueType(0);
	  EVT SourceVT = SignOp->getOperand(0).getValueType();
	  if (ConvertedVT == SourceVT)
	    return true;
	  return SourceVT != MVT::f128;
	}

	/// Combine an ISD::FCOPYSIGN node (copysign Mag, Sign).
	///
	/// Returns the replacement value, or an empty SDValue if nothing applies.
	SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
	  SDValue Mag = N->getOperand(0);
	  SDValue Sign = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (ConstantFPSDNode *SignC = dyn_cast<ConstantFPSDNode>(Sign)) {
	    // Constant fold
	    if (isa<ConstantFPSDNode>(Mag))
	      return DAG.getNode(ISD::FCOPYSIGN, DL, VT, Mag, Sign);

	    // copysign(x, c1) -> fabs(x) iff ispos(c1)
	    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
	    if (SignC->getValueAPF().isNegative()) {
	      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) {
	        SDValue Abs = DAG.getNode(ISD::FABS, SDLoc(Mag), VT, Mag);
	        return DAG.getNode(ISD::FNEG, DL, VT, Abs);
	      }
	    } else if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) {
	      return DAG.getNode(ISD::FABS, DL, VT, Mag);
	    }
	  }

	  // The sign of the magnitude operand is overwritten anyway, so operations
	  // that only touch its sign can be dropped:
	  //   copysign(fabs(x), y) -> copysign(x, y)
	  //   copysign(fneg(x), y) -> copysign(x, y)
	  //   copysign(copysign(x,z), y) -> copysign(x, y)
	  switch (Mag.getOpcode()) {
	  case ISD::FABS:
	  case ISD::FNEG:
	  case ISD::FCOPYSIGN:
	    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, Mag.getOperand(0), Sign);
	  default:
	    break;
	  }

	  const unsigned SignOpc = Sign.getOpcode();

	  // copysign(x, abs(y)) -> abs(x)
	  if (SignOpc == ISD::FABS)
	    return DAG.getNode(ISD::FABS, DL, VT, Mag);

	  // copysign(x, copysign(y,z)) -> copysign(x, z)
	  if (SignOpc == ISD::FCOPYSIGN)
	    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, Mag, Sign.getOperand(1));

	  // copysign(x, fp_extend(y)) -> copysign(x, y)
	  // copysign(x, fp_round(y)) -> copysign(x, y)
	  if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
	    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, Mag, Sign.getOperand(0));

	  return SDValue();
	}

	/// Combine an ISD::SINT_TO_FP node.
	///
	/// VT is the floating-point result type and OpVT the type of the integer
	/// operand. Returns the replacement value, or an empty SDValue if nothing
	/// applies.
	SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT OpVT = N0.getValueType();

	// fold (sint_to_fp c1) -> c1fp
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	// ...but only if the target supports immediate floating-point values
	(!LegalOperations \|\|
	TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
	return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);

	// If the input is a legal type, and SINT_TO_FP is not legal on this target,
	// but UINT_TO_FP is legal on this target, try to convert.
	if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) &&
	TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) {
	// If the sign bit is known to be zero, we can change this to UINT_TO_FP.
	if (DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
	}

	// The next optimizations are desirable only if SELECT_CC can be lowered.
	if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) \|\| !LegalOperations) {
	// fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
	// Restricted to an i1 setcc result and a scalar destination type.
	if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
	!VT.isVector() &&
	(!LegalOperations \|\|
	TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
	SDLoc DL(N);
	SDValue Ops[] =
	{ N0.getOperand(0), N0.getOperand(1),
	DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
	N0.getOperand(2) };
	return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
	}

	// fold (sint_to_fp (zext (setcc x, y, cc))) ->
	// (select_cc x, y, 1.0, 0.0, cc)
	if (N0.getOpcode() == ISD::ZERO_EXTEND &&
	N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() &&
	(!LegalOperations \|\|
	TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
	SDLoc DL(N);
	SDValue Ops[] =
	{ N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
	DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
	N0.getOperand(0).getOperand(2) };
	return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
	}
	}

	return SDValue();
	}

	/// Combine an ISD::UINT_TO_FP node.
	///
	/// VT is the floating-point result type and OpVT the type of the integer
	/// operand. Returns the replacement value, or an empty SDValue if nothing
	/// applies.
	SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT OpVT = N0.getValueType();

	// fold (uint_to_fp c1) -> c1fp
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	// ...but only if the target supports immediate floating-point values
	(!LegalOperations \|\|
	TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
	return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);

	// If the input is a legal type, and UINT_TO_FP is not legal on this target,
	// but SINT_TO_FP is legal on this target, try to convert.
	if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) &&
	TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) {
	// If the sign bit is known to be zero, we can change this to SINT_TO_FP.
	if (DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
	}

	// The next optimizations are desirable only if SELECT_CC can be lowered.
	if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) \|\| !LegalOperations) {
	// fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0, cc)
	// Note the "true" value is +1.0 here (the signed variant uses -1.0).
	if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
	(!LegalOperations \|\|
	TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
	SDLoc DL(N);
	SDValue Ops[] =
	{ N0.getOperand(0), N0.getOperand(1),
	DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
	N0.getOperand(2) };
	return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
	}
	}

	return SDValue();
	}

	// Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x
	//
	// N is the FP_TO_SINT / FP_TO_UINT node. Returns the integer-only
	// replacement, or an empty SDValue when the operand is not an int-to-fp
	// conversion or the intermediate float type cannot hold the value exactly.
	static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
	  SDValue IntToFP = N->getOperand(0);
	  EVT DstVT = N->getValueType(0);

	  const unsigned InnerOpc = IntToFP.getOpcode();
	  const bool IsInputSigned = InnerOpc == ISD::SINT_TO_FP;
	  if (!IsInputSigned && InnerOpc != ISD::UINT_TO_FP)
	    return SDValue();

	  const bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
	  SDValue Src = IntToFP.getOperand(0);
	  EVT SrcVT = Src.getValueType();

	  // We can safely assume the conversion won't overflow the output range,
	  // because (for example) (uint8_t)18293.f is undefined behavior.

	  // Since we can assume the conversion won't overflow, our decision as to
	  // whether the input will fit in the float should depend on the minimum
	  // of the input range and output range.

	  // This means this is also safe for a signed input and unsigned output, since
	  // a negative input would lead to undefined behavior.
	  unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
	  unsigned OutputSize = (int)DstVT.getScalarSizeInBits() - IsOutputSigned;
	  unsigned ActualSize = std::min(InputSize, OutputSize);
	  const fltSemantics &Semantics =
	      DAG.EVTToAPFloatSemantics(IntToFP.getValueType());

	  // We can only fold away the float conversion if the input range can be
	  // represented exactly in the float range.
	  if (APFloat::semanticsPrecision(Semantics) < ActualSize)
	    return SDValue();

	  const auto DstBits = DstVT.getScalarSizeInBits();
	  const auto SrcBits = SrcVT.getScalarSizeInBits();

	  // Same width: only the type changes.
	  if (DstBits == SrcBits)
	    return DAG.getBitcast(DstVT, Src);

	  // Narrower result: drop the high bits.
	  if (DstBits < SrcBits)
	    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), DstVT, Src);

	  // Wider result: sign-extend only when both conversions are signed.
	  unsigned ExtOp = ISD::ZERO_EXTEND;
	  if (IsInputSigned && IsOutputSigned)
	    ExtOp = ISD::SIGN_EXTEND;
	  return DAG.getNode(ExtOp, SDLoc(N), DstVT, Src);
	}

	/// Combine an ISD::FP_TO_SINT node.
	///
	/// Constant operands are re-emitted through getNode; every other operand is
	/// handed to FoldIntToFPToInt.
	SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Src))
	    return FoldIntToFPToInt(N, DAG);

	  // fold (fp_to_sint c1fp) -> c1
	  EVT ResultVT = N->getValueType(0);
	  return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), ResultVT, Src);
	}

	/// Combine an ISD::FP_TO_UINT node.
	///
	/// Constant operands are re-emitted through getNode; every other operand is
	/// handed to FoldIntToFPToInt.
	SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Src))
	    return FoldIntToFPToInt(N, DAG);

	  // fold (fp_to_uint c1fp) -> c1
	  EVT ResultVT = N->getValueType(0);
	  return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), ResultVT, Src);
	}

	/// Combine an ISD::FP_ROUND node.
	///
	/// N0 is the value being rounded and N1 is the node's second operand; the
	/// code below treats a constant 1 there as marking a value-preserving
	/// truncation. Returns the replacement value, or an empty SDValue if
	/// nothing applies.
	SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
	EVT VT = N->getValueType(0);

	// fold (fp_round c1fp) -> c1fp
	if (N0CFP)
	return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);

	// fold (fp_round (fp_extend x)) -> x
	// Only valid when rounding goes back to exactly the pre-extension type.
	if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
	return N0.getOperand(0);

	// fold (fp_round (fp_round x)) -> (fp_round x)
	if (N0.getOpcode() == ISD::FP_ROUND) {
	const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
	const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;

	// Skip this folding if it results in an fp_round from f80 to f16.
	//
	// f80 to f16 always generates an expensive (and as yet, unimplemented)
	// libcall to __truncxfhf2 instead of selecting native f16 conversion
	// instructions from f32 or f64. Moreover, the first (value-preserving)
	// fp_round from f80 to either f32 or f64 may become a NOP in platforms like
	// x86.
	if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
	return SDValue();

	// If the first fp_round isn't a value preserving truncation, it might
	// introduce a tie in the second fp_round, that wouldn't occur in the
	// single-step fp_round we want to fold to.
	// In other words, double rounding isn't the same as rounding.
	// Also, this is a value preserving truncation iff both fp_round's are.
	if (DAG.getTarget().Options.UnsafeFPMath \|\| N0IsTrunc) {
	SDLoc DL(N);
	return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
	DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
	}
	}

	// fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
	// Only when the copysign has no other user.
	if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
	SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
	N0.getOperand(0), N1);
	AddToWorklist(Tmp.getNode());
	return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
	Tmp, N0.getOperand(1));
	}

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Combine an ISD::FP_ROUND_INREG node.
	///
	/// Operand 0 is the value and operand 1 is a VTSDNode naming the type to
	/// round through. Only the constant-operand case is folded.
	SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  EVT ResultVT = N->getValueType(0);
	  EVT RoundVT = cast<VTSDNode>(N->getOperand(1))->getVT();
	  ConstantFPSDNode *SrcC = dyn_cast<ConstantFPSDNode>(Src);

	  if (!SrcC || !isTypeLegal(RoundVT))
	    return SDValue();

	  // fold (fp_round_inreg c1fp) -> c1fp
	  SDLoc DL(N);
	  SDValue Rounded =
	      DAG.getConstantFP(*SrcC->getConstantFPValue(), DL, RoundVT);
	  return DAG.getNode(ISD::FP_EXTEND, DL, ResultVT, Rounded);
	}

	/// Combine an ISD::FP_EXTEND node.
	///
	/// Returns the replacement value, SDValue(N, 0) when N was already replaced
	/// in place via CombineTo, or an empty SDValue if nothing applies.
	SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
	if (N->hasOneUse() &&
	N->use_begin()->getOpcode() == ISD::FP_ROUND)
	return SDValue();

	// fold (fp_extend c1fp) -> c1fp
	if (isConstantFPBuildVectorOrConstantFP(N0))
	return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);

	// fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
	// Requires FP16_TO_FP to be strictly Legal (not Custom) for the wider type.
	if (N0.getOpcode() == ISD::FP16_TO_FP &&
	TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
	return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));

	// Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
	// value of X.
	if (N0.getOpcode() == ISD::FP_ROUND
	&& N0.getConstantOperandVal(1) == 1) {
	SDValue In = N0.getOperand(0);
	// Same type as the original value: the round/extend pair cancels out.
	if (In.getValueType() == VT) return In;
	// Result is narrower than the original value: a single round suffices.
	if (VT.bitsLT(In.getValueType()))
	return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
	In, N0.getOperand(1));
	// Result is wider than the original value: extend it directly.
	return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
	}

	// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
	if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
	TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());
	// Replace the extend with the extending load itself ...
	CombineTo(N, ExtLoad);
	// ... and replace the old load's value with a rounded copy of the
	// extending load and its chain result with the new load's chain
	// (ExtLoad.getValue(1)).
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
	N0.getValueType(), ExtLoad,
	DAG.getIntPtrConstant(1, SDLoc(N0))),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Combine an ISD::FCEIL node; only constant operands are folded.
	SDValue DAGCombiner::visitFCEIL(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Src))
	    return SDValue();

	  // fold (fceil c1) -> fceil(c1)
	  EVT ResultVT = N->getValueType(0);
	  return DAG.getNode(ISD::FCEIL, SDLoc(N), ResultVT, Src);
	}

	/// Combine an ISD::FTRUNC node.
	///
	/// Folds constants, and drops the truncation when its operand is itself a
	/// rounding operation.
	SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  EVT ResultVT = N->getValueType(0);

	  // fold (ftrunc c1) -> ftrunc(c1)
	  if (isConstantFPBuildVectorOrConstantFP(Src))
	    return DAG.getNode(ISD::FTRUNC, SDLoc(N), ResultVT, Src);

	  // fold ftrunc (known rounded int x) -> x
	  // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
	  // likely to be generated to extract integer from a rounded floating value.
	  const unsigned SrcOpc = Src.getOpcode();
	  const bool SrcIsRounded =
	      SrcOpc == ISD::FRINT || SrcOpc == ISD::FTRUNC ||
	      SrcOpc == ISD::FNEARBYINT || SrcOpc == ISD::FFLOOR ||
	      SrcOpc == ISD::FCEIL;
	  if (SrcIsRounded)
	    return Src;

	  return SDValue();
	}

	/// Combine an ISD::FFLOOR node; only constant operands are folded.
	SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Src))
	    return SDValue();

	  // fold (ffloor c1) -> ffloor(c1)
	  EVT ResultVT = N->getValueType(0);
	  return DAG.getNode(ISD::FFLOOR, SDLoc(N), ResultVT, Src);
	}

	// FIXME: FNEG and FABS have a lot in common; refactor.
	//
	// Combine an ISD::FNEG node. Returns the replacement value, or an empty
	// SDValue if nothing applies.
	SDValue DAGCombiner::visitFNEG(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// Constant fold FNEG.
	if (isConstantFPBuildVectorOrConstantFP(N0))
	return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);

	// If the operand can be negated at no cost, use its negated form directly.
	if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
	&DAG.getTarget().Options))
	return GetNegatedExpression(N0, DAG, LegalOperations);

	// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
	// constant pool values.
	if (!TLI.isFNegFree(VT) &&
	N0.getOpcode() == ISD::BITCAST &&
	N0.getNode()->hasOneUse()) {
	SDValue Int = N0.getOperand(0);
	EVT IntVT = Int.getValueType();
	// Only handled when the bitcast source is a scalar integer.
	if (IntVT.isInteger() && !IntVT.isVector()) {
	APInt SignMask;
	if (N0.getValueType().isVector()) {
	// For a vector, get a mask such as 0x80... per scalar element
	// and splat it.
	SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
	SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
	} else {
	// For a scalar, just generate 0x80...
	SignMask = APInt::getSignMask(IntVT.getSizeInBits());
	}
	SDLoc DL0(N0);
	// Flip the sign bit(s) in the integer domain.
	Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
	DAG.getConstant(SignMask, DL0, IntVT));
	AddToWorklist(Int.getNode());
	return DAG.getBitcast(VT, Int);
	}
	}

	// (fneg (fmul c, x)) -> (fmul -c, x)
	if (N0.getOpcode() == ISD::FMUL &&
	(N0.getNode()->hasOneUse() \|\| !TLI.isFNegFree(VT))) {
	// Only a constant in operand 1 of the FMUL is considered.
	ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
	if (CFP1) {
	// CVal is the negated constant; it is only used for the legality query.
	APFloat CVal = CFP1->getValueAPF();
	CVal.changeSign();
	if (Level >= AfterLegalizeDAG &&
	(TLI.isFPImmLegal(CVal, VT) \|\|
	TLI.isOperationLegal(ISD::ConstantFP, VT)))
	return DAG.getNode(
	ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
	DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
	N0->getFlags());
	}
	}

	return SDValue();
	}

	/// Combine an ISD::FMINNUM node.
	///
	/// Folds two constant (or constant-splat) operands and otherwise moves a
	/// lone constant operand to the right-hand side.
	SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  const ConstantFPSDNode *LHSC = isConstOrConstSplatFP(LHS);
	  const ConstantFPSDNode *RHSC = isConstOrConstSplatFP(RHS);

	  // Both operands constant: compute the result now.
	  if (LHSC && RHSC) {
	    APFloat Folded = minnum(LHSC->getValueAPF(), RHSC->getValueAPF());
	    return DAG.getConstantFP(Folded, SDLoc(N), VT);
	  }

	  // Canonicalize to constant on RHS.
	  const bool ConstOnLHSOnly = isConstantFPBuildVectorOrConstantFP(LHS) &&
	                              !isConstantFPBuildVectorOrConstantFP(RHS);
	  if (ConstOnLHSOnly)
	    return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, RHS, LHS);

	  return SDValue();
	}

	/// Combine an ISD::FMAXNUM node.
	///
	/// Folds two constant (or constant-splat) operands and otherwise moves a
	/// lone constant operand to the right-hand side.
	SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  const ConstantFPSDNode *LHSC = isConstOrConstSplatFP(LHS);
	  const ConstantFPSDNode *RHSC = isConstOrConstSplatFP(RHS);

	  // Both operands constant: compute the result now.
	  if (LHSC && RHSC) {
	    APFloat Folded = maxnum(LHSC->getValueAPF(), RHSC->getValueAPF());
	    return DAG.getConstantFP(Folded, SDLoc(N), VT);
	  }

	  // Canonicalize to constant on RHS.
	  const bool ConstOnLHSOnly = isConstantFPBuildVectorOrConstantFP(LHS) &&
	                              !isConstantFPBuildVectorOrConstantFP(RHS);
	  if (ConstOnLHSOnly)
	    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, RHS, LHS);

	  return SDValue();
	}

	/// Combine an ISD::FABS node.
	///
	/// Returns the replacement value, or an empty SDValue if nothing applies.
	SDValue DAGCombiner::visitFABS(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // fold (fabs c1) -> fabs(c1)
	  if (isConstantFPBuildVectorOrConstantFP(Src))
	    return DAG.getNode(ISD::FABS, SDLoc(N), VT, Src);

	  switch (Src.getOpcode()) {
	  case ISD::FABS:
	    // fold (fabs (fabs x)) -> (fabs x)
	    return Src;
	  case ISD::FNEG:
	  case ISD::FCOPYSIGN:
	    // fold (fabs (fneg x)) -> (fabs x)
	    // fold (fabs (fcopysign x, y)) -> (fabs x)
	    return DAG.getNode(ISD::FABS, SDLoc(N), VT, Src.getOperand(0));
	  default:
	    break;
	  }

	  // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading
	  // constant pool values.
	  if (TLI.isFAbsFree(VT) || Src.getOpcode() != ISD::BITCAST ||
	      !Src.getNode()->hasOneUse())
	    return SDValue();

	  SDValue IntVal = Src.getOperand(0);
	  EVT IntVT = IntVal.getValueType();
	  // Only handled when the bitcast source is a scalar integer.
	  if (!IntVT.isInteger() || IntVT.isVector())
	    return SDValue();

	  APInt ClearSignMask;
	  if (Src.getValueType().isVector()) {
	    // For a vector, get a mask such as 0x7f... per scalar element
	    // and splat it.
	    APInt PerElement = ~APInt::getSignMask(Src.getScalarValueSizeInBits());
	    ClearSignMask = APInt::getSplat(IntVT.getSizeInBits(), PerElement);
	  } else {
	    // For a scalar, just generate 0x7f...
	    ClearSignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
	  }

	  SDLoc SrcDL(Src);
	  SDValue MaskC = DAG.getConstant(ClearSignMask, SrcDL, IntVT);
	  IntVal = DAG.getNode(ISD::AND, SrcDL, IntVT, IntVal, MaskC);
	  AddToWorklist(IntVal.getNode());
	  return DAG.getBitcast(VT, IntVal);
	}

	/// Combine an ISD::BRCOND node.
	///
	/// Operands: Chain (0), condition N1 (1), destination N2 (2). Returns a new
	/// branch node, SDValue(N, 0) when N was updated in place, or an empty
	/// SDValue if nothing applies.
	SDValue DAGCombiner::visitBRCOND(SDNode *N) {
	SDValue Chain = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue N2 = N->getOperand(2);

	// If N is a constant we could fold this into a fallthrough or unconditional
	// branch. However that doesn't happen very often in normal code, because
	// Instcombine/SimplifyCFG should have handled the available opportunities.
	// If we did this folding here, it would be necessary to update the
	// MachineBasicBlock CFG, which is awkward.

	// fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
	// on the target.
	if (N1.getOpcode() == ISD::SETCC &&
	TLI.isOperationLegalOrCustom(ISD::BR_CC,
	N1.getOperand(0).getValueType())) {
	return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
	Chain, N1.getOperand(2),
	N1.getOperand(0), N1.getOperand(1), N2);
	}

	// Condition is a single-use SRL, or a single-use TRUNCATE of a single-use
	// SRL.
	if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) \|\|
	((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) &&
	(N1.getOperand(0).hasOneUse() &&
	N1.getOperand(0).getOpcode() == ISD::SRL))) {
	SDNode *Trunc = nullptr;
	if (N1.getOpcode() == ISD::TRUNCATE) {
	// Look past the truncate.
	Trunc = N1.getNode();
	N1 = N1.getOperand(0);
	}

	// Match this pattern so that we can generate simpler code:
	//
	// %a = ...
	// %b = and i32 %a, 2
	// %c = srl i32 %b, 1
	// brcond i32 %c ...
	//
	// into
	//
	// %a = ...
	// %b = and i32 %a, 2
	// %c = setcc ne %b, 0
	// brcond %c ...
	//
	// This applies only when the AND constant value has one bit set and the
	// SRL constant is equal to the log2 of the AND constant. The back-end is
	// smart enough to convert the result into a TEST/JMP sequence.
	SDValue Op0 = N1.getOperand(0);
	SDValue Op1 = N1.getOperand(1);

	if (Op0.getOpcode() == ISD::AND &&
	Op1.getOpcode() == ISD::Constant) {
	SDValue AndOp1 = Op0.getOperand(1);

	if (AndOp1.getOpcode() == ISD::Constant) {
	const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();

	if (AndConst.isPowerOf2() &&
	cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
	SDLoc DL(N);
	SDValue SetCC =
	DAG.getSetCC(DL,
	getSetCCResultType(Op0.getValueType()),
	Op0, DAG.getConstant(0, DL, Op0.getValueType()),
	ISD::SETNE);

	SDValue NewBRCond = DAG.getNode(ISD::BRCOND, DL,
	MVT::Other, Chain, SetCC, N2);
	// Don't add the new BRCond into the worklist or else SimplifySelectCC
	// will convert it back to (X & C1) >> C2.
	CombineTo(N, NewBRCond, false);
	// Truncate is dead.
	if (Trunc)
	deleteAndRecombine(Trunc);
	// Replace the uses of SRL with SETCC
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
	deleteAndRecombine(N1.getNode());
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}
	}

	if (Trunc)
	// Restore N1 if the above transformation doesn't match.
	N1 = N->getOperand(1);
	}

	// Transform br(xor(x, y)) -> br(x != y)
	// Transform br(xor(xor(x,y), 1)) -> br (x == y)
	if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) {
	SDNode *TheXor = N1.getNode();
	SDValue Op0 = TheXor->getOperand(0);
	SDValue Op1 = TheXor->getOperand(1);
	if (Op0.getOpcode() == Op1.getOpcode()) {
	// Avoid missing important xor optimizations.
	if (SDValue Tmp = visitXOR(TheXor)) {
	if (Tmp.getNode() != TheXor) {
	DEBUG(dbgs() << "\nReplacing.8 ";
	TheXor->dump(&DAG);
	dbgs() << "\nWith: ";
	Tmp.getNode()->dump(&DAG);
	dbgs() << '\n');
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
	deleteAndRecombine(TheXor);
	return DAG.getNode(ISD::BRCOND, SDLoc(N),
	MVT::Other, Chain, Tmp, N2);
	}

	// visitXOR has changed XOR's operands or replaced the XOR completely,
	// bail out.
	return SDValue(N, 0);
	}
	}

	if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
	bool Equal = false;
	// NOTE(review): this condition asks Op0 to be both the constant 1 and an
	// XOR node, which looks unsatisfiable, so Equal appears to stay false and
	// the "x == y" form in the comment above would never be produced.
	// Presumably isOneConstant(Op1) was intended - confirm before changing,
	// since Op0/Op1 would then also need to be re-pointed at the inner XOR.
	if (isOneConstant(Op0) && Op0.hasOneUse() &&
	Op0.getOpcode() == ISD::XOR) {
	TheXor = Op0.getNode();
	Equal = true;
	}

	EVT SetCCVT = N1.getValueType();
	if (LegalTypes)
	SetCCVT = getSetCCResultType(SetCCVT);
	SDValue SetCC = DAG.getSetCC(SDLoc(TheXor),
	SetCCVT,
	Op0, Op1,
	Equal ? ISD::SETEQ : ISD::SETNE);
	// Replace the uses of XOR with SETCC
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
	deleteAndRecombine(N1.getNode());
	return DAG.getNode(ISD::BRCOND, SDLoc(N),
	MVT::Other, Chain, SetCC, N2);
	}
	}

	return SDValue();
	}

	/// Combine a BR_CC node by simplifying the comparison it embeds.
	///
	/// Operand list for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
	///
	/// \return a new BR_CC built from the simplified comparison, or an empty
	/// SDValue when the comparison did not simplify to another SETCC.
	SDValue DAGCombiner::visitBR_CC(SDNode *N) {
	  CondCodeSDNode *CondCode = cast<CondCodeSDNode>(N->getOperand(1));
	  SDValue CondLHS = N->getOperand(2);
	  SDValue CondRHS = N->getOperand(3);

	  // Folding a constant condition into a fallthrough or an unconditional
	  // branch is deliberately not attempted here: Instcombine/SimplifyCFG
	  // should already have caught those cases, and doing it at this level
	  // would require updating the MachineBasicBlock CFG, which is awkward.

	  // Let SimplifySetCC have a go at the comparison.
	  SDLoc DL(N);
	  EVT SetCCVT = getSetCCResultType(CondLHS.getValueType());
	  SDValue Simp =
	      SimplifySetCC(SetCCVT, CondLHS, CondRHS, CondCode->get(), DL, false);

	  // Nothing came back: leave the branch alone.
	  SDNode *SimpNode = Simp.getNode();
	  if (!SimpNode)
	    return SDValue();

	  // Whatever was produced deserves a visit of its own.
	  AddToWorklist(SimpNode);

	  // Only a simpler SETCC can be folded back into a BR_CC.
	  if (Simp.getOpcode() != ISD::SETCC)
	    return SDValue();

	  return DAG.getNode(ISD::BR_CC, DL, MVT::Other,
	                     N->getOperand(0), Simp.getOperand(2),
	                     Simp.getOperand(0), Simp.getOperand(1),
	                     N->getOperand(4));
	}

	/// Return true if 'Use' is a load or a store that uses N as its base pointer
	/// and that N may be folded in the load / store addressing mode.
	static bool canFoldInAddressingMode(SDNode N, SDNode Use,
	SelectionDAG &DAG,
	const TargetLowering &TLI) {
	EVT VT;
	unsigned AS;

	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
	if (LD->isIndexed() \|\| LD->getBasePtr().getNode() != N)
	return false;
	VT = LD->getMemoryVT();
	AS = LD->getAddressSpace();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
	if (ST->isIndexed() \|\| ST->getBasePtr().getNode() != N)
	return false;
	VT = ST->getMemoryVT();
	AS = ST->getAddressSpace();
	} else
	return false;

	TargetLowering::AddrMode AM;
	if (N->getOpcode() == ISD::ADD) {
	ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (Offset)
	// [reg +/- imm]
	AM.BaseOffs = Offset->getSExtValue();
	else
	// [reg +/- reg]
	AM.Scale = 1;
	} else if (N->getOpcode() == ISD::SUB) {
	ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (Offset)
	// [reg +/- imm]
	AM.BaseOffs = -Offset->getSExtValue();
	else
	// [reg +/- reg]
	AM.Scale = 1;
	} else
	return false;

	return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
	VT.getTypeForEVT(*DAG.getContext()), AS);
	}

	/// Try turning a load/store into a pre-indexed load/store when the base
	/// pointer is an add or subtract and it has other uses besides the load/store.
	/// After the transformation, the new indexed load/store has effectively folded
	/// the add/subtract in and all of its other uses are redirected to the
	/// new load/store.
	///
	/// \param N the candidate node; anything other than an unindexed load or
	///          store is rejected.
	/// \return true if N was replaced by an indexed load/store, false if no
	///         transformation was performed.
	bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
	  if (Level < AfterLegalizeDAG)
	    return false;

	  bool isLoad = true;
	  SDValue Ptr;
	  EVT VT;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    if (LD->isIndexed())
	      return false;
	    VT = LD->getMemoryVT();
	    if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
	        !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
	      return false;
	    Ptr = LD->getBasePtr();
	  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	    if (ST->isIndexed())
	      return false;
	    VT = ST->getMemoryVT();
	    if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
	        !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
	      return false;
	    Ptr = ST->getBasePtr();
	    isLoad = false;
	  } else {
	    return false;
	  }

	  // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
	  // out. There is no reason to make this a preinc/predec.
	  if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
	      Ptr.getNode()->hasOneUse())
	    return false;

	  // Ask the target to do addressing mode selection.
	  SDValue BasePtr;
	  SDValue Offset;
	  ISD::MemIndexedMode AM = ISD::UNINDEXED;
	  if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
	    return false;

	  // Backends without true r+i pre-indexed forms may need to pass a
	  // constant base with a variable offset so that constant coercion
	  // will work with the patterns in canonical form.
	  bool Swapped = false;
	  if (isa<ConstantSDNode>(BasePtr)) {
	    std::swap(BasePtr, Offset);
	    Swapped = true;
	  }

	  // Don't create an indexed load / store with zero offset.
	  if (isNullConstant(Offset))
	    return false;

	  // Try turning it into a pre-indexed load / store except when:
	  // 1) The new base ptr is a frame index.
	  // 2) If N is a store and the new base ptr is either the same as or is a
	  //    predecessor of the value being stored.
	  // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
	  //    that would create a cycle.
	  // 4) All uses are load / store ops that use it as old base ptr.

	  // Check #1. Preinc'ing a frame index would require copying the stack pointer
	  // (plus the implicit offset) to a register to preinc anyway.
	  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
	    return false;

	  // Check #2.
	  if (!isLoad) {
	    SDValue Val = cast<StoreSDNode>(N)->getValue();
	    if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
	      return false;
	  }

	  // Caches for hasPredecessorHelper.
	  SmallPtrSet<const SDNode *, 32> Visited;
	  SmallVector<const SDNode *, 16> Worklist;
	  Worklist.push_back(N);

	  // If the offset is a constant, there may be other adds of constants that
	  // can be folded with this one. We should do this to avoid having to keep
	  // a copy of the original base pointer.
	  SmallVector<SDNode *, 16> OtherUses;
	  if (isa<ConstantSDNode>(Offset))
	    for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
	                              UE = BasePtr.getNode()->use_end();
	         UI != UE; ++UI) {
	      SDUse &Use = UI.getUse();
	      // Skip the use that is Ptr and uses of other results from BasePtr's
	      // node (important for nodes that return multiple results).
	      if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
	        continue;

	      if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
	        continue;

	      // Any remaining user that is not an add/sub of a same-typed constant
	      // abandons the whole set: OtherUses is cleared and the scan stops.
	      if (Use.getUser()->getOpcode() != ISD::ADD &&
	          Use.getUser()->getOpcode() != ISD::SUB) {
	        OtherUses.clear();
	        break;
	      }

	      SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
	      if (!isa<ConstantSDNode>(Op1)) {
	        OtherUses.clear();
	        break;
	      }

	      // FIXME: In some cases, we can be smarter about this.
	      if (Op1.getValueType() != Offset.getValueType()) {
	        OtherUses.clear();
	        break;
	      }

	      OtherUses.push_back(Use.getUser());
	    }

	  // Undo the earlier swap so the indexed node is built from the operands in
	  // the order the target returned them.
	  if (Swapped)
	    std::swap(BasePtr, Offset);

	  // Now check for #3 and #4.
	  bool RealUse = false;

	  for (SDNode *Use : Ptr.getNode()->uses()) {
	    if (Use == N)
	      continue;
	    if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
	      return false;

	    // If Ptr may be folded in addressing mode of other use, then it's
	    // not profitable to do this transformation.
	    if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
	      RealUse = true;
	  }

	  if (!RealUse)
	    return false;

	  SDValue Result;
	  if (isLoad)
	    Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
	                                BasePtr, Offset, AM);
	  else
	    Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
	                                 BasePtr, Offset, AM);
	  ++PreIndexedNodes;
	  ++NodesCombined;
	  DEBUG(dbgs() << "\nReplacing.4 ";
	        N->dump(&DAG);
	        dbgs() << "\nWith: ";
	        Result.getNode()->dump(&DAG);
	        dbgs() << '\n');
	  WorklistRemover DeadNodes(*this);
	  // Results are remapped as the calls below show: for a load, old value 0
	  // maps to new value 0 and old value 1 to new value 2; for a store, old
	  // value 0 maps to new value 1. The updated base value (new value 1 for
	  // loads, 0 for stores) replaces Ptr at the end of this function.
	  if (isLoad) {
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
	  } else {
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
	  }

	  // Finally, since the node is now dead, remove it from the graph.
	  deleteAndRecombine(N);

	  // Swap again so that, below, BasePtr names the non-constant operand and
	  // Offset the constant one.
	  if (Swapped)
	    std::swap(BasePtr, Offset);

	  // Replace other uses of BasePtr that can be updated to use Ptr
	  for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
	    unsigned OffsetIdx = 1;
	    if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
	      OffsetIdx = 0;
	    assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
	           BasePtr.getNode() && "Expected BasePtr operand");

	    // We need to replace ptr0 in the following expression:
	    //   x0 * offset0 + y0 * ptr0 = t0
	    // knowing that
	    //   x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
	    //
	    // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
	    // indexed load/store and the expression that needs to be re-written.
	    //
	    // Therefore, we have:
	    //   t0 = (x0 * offset0 - x1 * y0 * y1 * offset1) + (y0 * y1) * t1

	    ConstantSDNode *CN =
	      cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
	    int X0, X1, Y0, Y1;
	    const APInt &Offset0 = CN->getAPIntValue();
	    APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();

	    X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
	    Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
	    X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
	    Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;

	    unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;

	    // CNV = x0 * offset0 - x1 * y0 * y1 * offset1
	    APInt CNV = Offset0;
	    if (X0 < 0) CNV = -CNV;
	    if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
	    else CNV = CNV - Offset1;

	    SDLoc DL(OtherUses[i]);

	    // We can now generate the new expression.
	    SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
	    SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);

	    SDValue NewUse = DAG.getNode(Opcode,
	                                 DL,
	                                 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
	    deleteAndRecombine(OtherUses[i]);
	  }

	  // Replace the uses of Ptr with uses of the updated base value.
	  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
	  deleteAndRecombine(Ptr.getNode());
	  AddToWorklist(Result.getNode());

	  return true;
	}

	/// Try to combine a load/store with a add/sub of the base pointer node into a
	/// post-indexed load/store. The transformation folded the add/subtract into the
	/// new indexed load/store effectively and all of its uses are redirected to the
	/// new load/store.
	///
	/// \param N the candidate node; anything other than an unindexed load or
	///          store is rejected.
	/// \return true if N was replaced by an indexed load/store, false if no
	///         transformation was performed.
	bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
	  if (Level < AfterLegalizeDAG)
	    return false;

	  bool isLoad = true;
	  SDValue Ptr;
	  EVT VT;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    if (LD->isIndexed())
	      return false;
	    VT = LD->getMemoryVT();
	    if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
	        !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
	      return false;
	    Ptr = LD->getBasePtr();
	  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	    if (ST->isIndexed())
	      return false;
	    VT = ST->getMemoryVT();
	    if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
	        !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
	      return false;
	    Ptr = ST->getBasePtr();
	    isLoad = false;
	  } else {
	    return false;
	  }

	  // A pointer used only by N has no add/sub user to fold.
	  if (Ptr.getNode()->hasOneUse())
	    return false;

	  // Look at every add/sub user of the pointer other than N itself.
	  for (SDNode *Op : Ptr.getNode()->uses()) {
	    if (Op == N ||
	        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
	      continue;

	    SDValue BasePtr;
	    SDValue Offset;
	    ISD::MemIndexedMode AM = ISD::UNINDEXED;
	    if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
	      // Don't create an indexed load / store with zero offset.
	      if (isNullConstant(Offset))
	        continue;

	      // Try turning it into a post-indexed load / store except when
	      // 1) All uses are load / store ops that use it as base ptr (and
	      //    it may be folded as addressing mode).
	      // 2) Op must be independent of N, i.e. Op is neither a predecessor
	      //    nor a successor of N. Otherwise, if Op is folded that would
	      //    create a cycle.

	      if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
	        continue;

	      // Check for #1.
	      bool TryNext = false;
	      for (SDNode *Use : BasePtr.getNode()->uses()) {
	        if (Use == Ptr.getNode())
	          continue;

	        // If all the uses are load / store addresses, then don't do the
	        // transformation.
	        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
	          bool RealUse = false;
	          for (SDNode *UseUse : Use->uses()) {
	            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
	              RealUse = true;
	          }

	          if (!RealUse) {
	            TryNext = true;
	            break;
	          }
	        }
	      }

	      if (TryNext)
	        continue;

	      // Check for #2
	      if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
	        SDValue Result = isLoad
	          ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
	                               BasePtr, Offset, AM)
	          : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
	                                BasePtr, Offset, AM);
	        ++PostIndexedNodes;
	        ++NodesCombined;
	        DEBUG(dbgs() << "\nReplacing.5 ";
	              N->dump(&DAG);
	              dbgs() << "\nWith: ";
	              Result.getNode()->dump(&DAG);
	              dbgs() << '\n');
	        WorklistRemover DeadNodes(*this);
	        // Results are remapped as the calls below show: for a load, old
	        // value 0 maps to new value 0 and old value 1 to new value 2; for a
	        // store, old value 0 maps to new value 1.
	        if (isLoad) {
	          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
	          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
	        } else {
	          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
	        }

	        // Finally, since the node is now dead, remove it from the graph.
	        deleteAndRecombine(N);

	        // Replace the uses of Op with uses of the updated base value.
	        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
	                                      Result.getValue(isLoad ? 1 : 0));
	        deleteAndRecombine(Op);
	        return true;
	      }
	    }
	  }

	  return false;
	}

	/// \brief Return the base-pointer arithmetic from an indexed \p LD.
	///
	/// Builds an ADD (for PRE_INC / POST_INC) or a SUB (for any other indexed
	/// mode) of the load's operand 1 and operand 2.
	///
	/// \pre \p LD is indexed, and its increment is not an opaque TargetConstant
	/// (both are asserted).
	SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
	  ISD::MemIndexedMode AM = LD->getAddressingMode();
	  assert(AM != ISD::UNINDEXED);
	  SDValue BP = LD->getOperand(1);
	  SDValue Inc = LD->getOperand(2);

	  // Some backends use TargetConstants for load offsets, but don't expect
	  // TargetConstants in general ADD nodes. We can convert these constants into
	  // regular Constants (if the constant is not opaque).
	  assert((Inc.getOpcode() != ISD::TargetConstant ||
	          !cast<ConstantSDNode>(Inc)->isOpaque()) &&
	         "Cannot split out indexing using opaque target constants");
	  if (Inc.getOpcode() == ISD::TargetConstant) {
	    ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
	    Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
	                          ConstInc->getValueType(0));
	  }

	  unsigned Opc =
	      (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
	  return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
	}

	/// Combine a LOAD node.
	///
	/// In order, this tries to: delete a non-volatile load whose results are
	/// unused, forward the value of an immediately preceding store to the same
	/// pointer, raise the recorded alignment, move the load onto a better chain,
	/// turn it into a pre-/post-indexed load, and slice it into smaller loads.
	///
	/// \return SDValue(N, 0) or the result of CombineTo when something changed,
	/// an empty SDValue otherwise.
	SDValue DAGCombiner::visitLOAD(SDNode *N) {
	  LoadSDNode *LD  = cast<LoadSDNode>(N);
	  SDValue Chain = LD->getChain();
	  SDValue Ptr   = LD->getBasePtr();

	  // If load is not volatile and there are no uses of the loaded value (and
	  // the updated indexed value in case of indexed loads), change uses of the
	  // chain value into uses of the chain input (i.e. delete the dead load).
	  if (!LD->isVolatile()) {
	    if (N->getValueType(1) == MVT::Other) {
	      // Unindexed loads.
	      if (!N->hasAnyUseOfValue(0)) {
	        // It's not safe to use the two value CombineTo variant here. e.g.
	        // v1, chain2 = load chain1, loc
	        // v2, chain3 = load chain2, loc
	        // v3         = add v2, c
	        // Now we replace use of chain2 with chain1.  This makes the second load
	        // isomorphic to the one we are deleting, and thus makes this load live.
	        DEBUG(dbgs() << "\nReplacing.6 ";
	              N->dump(&DAG);
	              dbgs() << "\nWith chain: ";
	              Chain.getNode()->dump(&DAG);
	              dbgs() << "\n");
	        WorklistRemover DeadNodes(*this);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
	        AddUsersToWorklist(Chain.getNode());
	        if (N->use_empty())
	          deleteAndRecombine(N);

	        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	      }
	    } else {
	      // Indexed loads.
	      assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");

	      // If this load has an opaque TargetConstant offset, then we cannot split
	      // the indexing into an add/sub directly (that TargetConstant may not be
	      // valid for a different type of node, and we cannot convert an opaque
	      // target constant into a regular constant).
	      bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
	                       cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();

	      if (!N->hasAnyUseOfValue(0) &&
	          ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
	        SDValue Undef = DAG.getUNDEF(N->getValueType(0));
	        SDValue Index;
	        if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
	          Index = SplitIndexingFromLoad(LD);
	          // Try to fold the base pointer arithmetic into subsequent loads and
	          // stores.
	          AddUsersToWorklist(N);
	        } else
	          Index = DAG.getUNDEF(N->getValueType(1));
	        DEBUG(dbgs() << "\nReplacing.7 ";
	              N->dump(&DAG);
	              dbgs() << "\nWith: ";
	              Undef.getNode()->dump(&DAG);
	              dbgs() << " and 2 other values\n");
	        WorklistRemover DeadNodes(*this);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
	        deleteAndRecombine(N);
	        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	      }
	    }
	  }

	  // If this load is directly stored, replace the load value with the stored
	  // value.
	  // TODO: Handle store large -> read small portion.
	  // TODO: Handle TRUNCSTORE/LOADEXT
	  if (OptLevel != CodeGenOpt::None &&
	      ISD::isNormalLoad(N) && !LD->isVolatile()) {
	    if (ISD::isNON_TRUNCStore(Chain.getNode())) {
	      StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
	      if (PrevST->getBasePtr() == Ptr &&
	          PrevST->getValue().getValueType() == N->getValueType(0))
	        return CombineTo(N, PrevST->getOperand(1), Chain);
	    }
	  }

	  // Try to infer better alignment information than the load already has.
	  if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
	    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
	      if (Align > LD->getMemOperand()->getBaseAlignment()) {
	        SDValue NewLoad = DAG.getExtLoad(
	            LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
	            LD->getPointerInfo(), LD->getMemoryVT(), Align,
	            LD->getMemOperand()->getFlags(), LD->getAAInfo());
	        if (NewLoad.getNode() != N)
	          return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
	      }
	    }
	  }

	  if (LD->isUnindexed()) {
	    // Walk up chain skipping non-aliasing memory nodes.
	    SDValue BetterChain = FindBetterChain(N, Chain);

	    // If there is a better chain.
	    if (Chain != BetterChain) {
	      SDValue ReplLoad;

	      // Replace the chain to avoid dependency.
	      if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
	        ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
	                               BetterChain, Ptr, LD->getMemOperand());
	      } else {
	        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
	                                  LD->getValueType(0),
	                                  BetterChain, Ptr, LD->getMemoryVT(),
	                                  LD->getMemOperand());
	      }

	      // Create token factor to keep old chain connected.
	      SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
	                                  MVT::Other, Chain, ReplLoad.getValue(1));

	      // Replace uses with load result and token factor
	      return CombineTo(N, ReplLoad.getValue(0), Token);
	    }
	  }

	  // Try transforming N to an indexed load.
	  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
	    return SDValue(N, 0);

	  // Try to slice up N to more direct loads if the slices are mapped to
	  // different register banks or pairing can take place.
	  if (SliceUpLoad(N))
	    return SDValue(N, 0);

	  return SDValue();
	}

	namespace {

	/// \brief Helper structure used to slice a load in smaller loads.
	/// Basically a slice is obtained from the following sequence:
	/// Origin = load Ty1, Base
	/// Shift = srl Ty1 Origin, CstTy Amount
	/// Inst = trunc Shift to Ty2
	///
	/// Then, it will be rewritten into:
	/// Slice = load SliceTy, Base + SliceOffset
	/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
	///
	/// SliceTy is deduced from the number of bits that are actually used to
	/// build Inst.
	struct LoadedSlice {
	/// \brief Helper structure used to compute the cost of a slice.
	struct Cost {
	/// Are we optimizing for code size.
	bool ForCodeSize;

	/// Various cost.
	unsigned Loads = 0;
	unsigned Truncates = 0;
	unsigned CrossRegisterBanksCopies = 0;
	unsigned ZExts = 0;
	unsigned Shift = 0;

	Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}

	/// \brief Get the cost of one isolated slice.
	Cost(const LoadedSlice &LS, bool ForCodeSize = false)
	: ForCodeSize(ForCodeSize), Loads(1) {
	EVT TruncType = LS.Inst->getValueType(0);
	EVT LoadedType = LS.getLoadedType();
	if (TruncType != LoadedType &&
	!LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
	ZExts = 1;
	}

	/// \brief Account for slicing gain in the current cost.
	/// Slicing provide a few gains like removing a shift or a
	/// truncate. This method allows to grow the cost of the original
	/// load with the gain from this slice.
	void addSliceGain(const LoadedSlice &LS) {
	// Each slice saves a truncate.
	const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
	if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
	LS.Inst->getValueType(0)))
	++Truncates;
	// If there is a shift amount, this slice gets rid of it.
	if (LS.Shift)
	++Shift;
	// If this slice can merge a cross register bank copy, account for it.
	if (LS.canMergeExpensiveCrossRegisterBankCopy())
	++CrossRegisterBanksCopies;
	}

	Cost &operator+=(const Cost &RHS) {
	Loads += RHS.Loads;
	Truncates += RHS.Truncates;
	CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
	ZExts += RHS.ZExts;
	Shift += RHS.Shift;
	return *this;
	}

	bool operator==(const Cost &RHS) const {
	return Loads == RHS.Loads && Truncates == RHS.Truncates &&
	CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
	ZExts == RHS.ZExts && Shift == RHS.Shift;
	}

	bool operator!=(const Cost &RHS) const { return !(*this == RHS); }

	bool operator<(const Cost &RHS) const {
	// Assume cross register banks copies are as expensive as loads.
	// FIXME: Do we want some more target hooks?
	unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
	unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
	// Unless we are optimizing for code size, consider the
	// expensive operation first.
	if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
	return ExpensiveOpsLHS < ExpensiveOpsRHS;
	return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
	(RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
	}

	bool operator>(const Cost &RHS) const { return RHS < *this; }

	bool operator<=(const Cost &RHS) const { return !(RHS < *this); }

	bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
	};

	// The last instruction that represent the slice. This should be a
	// truncate instruction.
	SDNode *Inst;

	// The original load instruction.
	LoadSDNode *Origin;

	// The right shift amount in bits from the original load.
	unsigned Shift;

	// The DAG from which Origin came from.
	// This is used to get some contextual information about legal types, etc.
	SelectionDAG *DAG;

	LoadedSlice(SDNode Inst = nullptr, LoadSDNode Origin = nullptr,
	unsigned Shift = 0, SelectionDAG *DAG = nullptr)
	: Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}

	/// \brief Get the bits used in a chunk of bits \p BitWidth large.
	/// \return Result is \p BitWidth and has used bits set to 1 and
	/// not used bits set to 0.
	APInt getUsedBits() const {
	// Reproduce the trunc(lshr) sequence:
	// - Start from the truncated value.
	// - Zero extend to the desired bit width.
	// - Shift left.
	assert(Origin && "No original load to compare against.");
	unsigned BitWidth = Origin->getValueSizeInBits(0);
	assert(Inst && "This slice is not bound to an instruction");
	assert(Inst->getValueSizeInBits(0) <= BitWidth &&
	"Extracted slice is bigger than the whole type!");
	APInt UsedBits(Inst->getValueSizeInBits(0), 0);
	UsedBits.setAllBits();
	UsedBits = UsedBits.zext(BitWidth);
	UsedBits <<= Shift;
	return UsedBits;
	}

	/// \brief Get the size of the slice to be loaded in bytes.
	unsigned getLoadedSize() const {
	unsigned SliceSize = getUsedBits().countPopulation();
	assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
	return SliceSize / 8;
	}

	/// \brief Get the type that will be loaded for this slice.
	/// Note: This may not be the final type for the slice.
	EVT getLoadedType() const {
	assert(DAG && "Missing context");
	LLVMContext &Ctxt = *DAG->getContext();
	return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
	}

	/// \brief Get the alignment of the load used for this slice.
	unsigned getAlignment() const {
	unsigned Alignment = Origin->getAlignment();
	unsigned Offset = getOffsetFromBase();
	if (Offset != 0)
	Alignment = MinAlign(Alignment, Alignment + Offset);
	return Alignment;
	}

	/// \brief Check if this slice can be rewritten with legal operations.
	bool isLegal() const {
	// An invalid slice is not legal.
	if (!Origin \|\| !Inst \|\| !DAG)
	return false;

	// Offsets are for indexed load only, we do not handle that.
	if (!Origin->getOffset().isUndef())
	return false;

	const TargetLowering &TLI = DAG->getTargetLoweringInfo();

	// Check that the type is legal.
	EVT SliceType = getLoadedType();
	if (!TLI.isTypeLegal(SliceType))
	return false;

	// Check that the load is legal for this type.
	if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
	return false;

	// Check that the offset can be computed.
	// 1. Check its type.
	EVT PtrType = Origin->getBasePtr().getValueType();
	if (PtrType == MVT::Untyped \|\| PtrType.isExtended())
	return false;

	// 2. Check that it fits in the immediate.
	if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
	return false;

	// 3. Check that the computation is legal.
	if (!TLI.isOperationLegal(ISD::ADD, PtrType))
	return false;

	// Check that the zext is legal if it needs one.
	EVT TruncateType = Inst->getValueType(0);
	if (TruncateType != SliceType &&
	!TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
	return false;

	return true;
	}

	/// \brief Get the offset in bytes of this slice in the original chunk of
	/// bits.
	/// \pre DAG != nullptr.
	uint64_t getOffsetFromBase() const {
	assert(DAG && "Missing context.");
	bool IsBigEndian = DAG->getDataLayout().isBigEndian();
	assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
	uint64_t Offset = Shift / 8;
	unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
	assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
	"The size of the original loaded type is not a multiple of a"
	" byte.");
	// If Offset is bigger than TySizeInBytes, it means we are loading all
	// zeros. This should have been optimized before in the process.
	assert(TySizeInBytes > Offset &&
	"Invalid shift amount for given loaded size");
	if (IsBigEndian)
	Offset = TySizeInBytes - Offset - getLoadedSize();
	return Offset;
	}

	/// \brief Generate the sequence of instructions to load the slice
	/// represented by this object and redirect the uses of this slice to
	/// this new sequence of instructions.
	/// \pre this->Inst && this->Origin are valid Instructions and this
	/// object passed the legal check: LoadedSlice::isLegal returned true.
	/// \return The last instruction of the sequence used to load the slice.
	SDValue loadSlice() const {
	assert(Inst && Origin && "Unable to replace a non-existing slice.");
	const SDValue &OldBaseAddr = Origin->getBasePtr();
	SDValue BaseAddr = OldBaseAddr;
	// Get the offset in that chunk of bytes w.r.t. the endianness.
	int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
	assert(Offset >= 0 && "Offset too big to fit in int64_t!");
	if (Offset) {
	// BaseAddr = BaseAddr + Offset.
	EVT ArithType = BaseAddr.getValueType();
	SDLoc DL(Origin);
	BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
	DAG->getConstant(Offset, DL, ArithType));
	}

	// Create the type of the loaded slice according to its size.
	EVT SliceType = getLoadedType();

	// Create the load for the slice.
	SDValue LastInst =
	DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
	Origin->getPointerInfo().getWithOffset(Offset),
	getAlignment(), Origin->getMemOperand()->getFlags());
	// If the final type is not the same as the loaded type, this means that
	// we have to pad with zero. Create a zero extend for that.
	EVT FinalType = Inst->getValueType(0);
	if (SliceType != FinalType)
	LastInst =
	DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
	return LastInst;
	}

	/// \brief Check if this slice can be merged with an expensive cross register
	/// bank copy. E.g.,
	/// i = load i32
	/// f = bitcast i32 i to float
	bool canMergeExpensiveCrossRegisterBankCopy() const {
	if (!Inst \|\| !Inst->hasOneUse())
	return false;
	SDNode Use = Inst->use_begin();
	if (Use->getOpcode() != ISD::BITCAST)
	return false;
	assert(DAG && "Missing context");
	const TargetLowering &TLI = DAG->getTargetLoweringInfo();
	EVT ResVT = Use->getValueType(0);
	const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
	const TargetRegisterClass *ArgRC =
	TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
	if (ArgRC == ResRC \|\| !TLI.isOperationLegal(ISD::LOAD, ResVT))
	return false;

	// At this point, we know that we perform a cross-register-bank copy.
	// Check if it is expensive.
	const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
	// Assume bitcasts are cheap, unless both register classes do not
	// explicitly share a common sub class.
	if (!TRI \|\| TRI->getCommonSubClass(ArgRC, ResRC))
	return false;

	// Check if it will be merged with the load.
	// 1. Check the alignment constraint.
	unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment(
	ResVT.getTypeForEVT(*DAG->getContext()));

	if (RequiredAlignment > getAlignment())
	return false;

	// 2. Check that the load is a legal operation for that type.
	if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
	return false;

	// 3. Check that we do not have a zext in the way.
	if (Inst->getValueType(0) != getLoadedType())
	return false;

	return true;
	}
	};

	} // end anonymous namespace

	/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
	/// \p UsedBits looks like 0..0 1..1 0..0.
	static bool areUsedBitsDense(const APInt &UsedBits) {
	// If all the bits are one, this is dense!
	if (UsedBits.isAllOnesValue())
	return true;

	// Get rid of the unused bits on the right.
	APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
	// Get rid of the unused bits on the left.
	if (NarrowedUsedBits.countLeadingZeros())
	NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
	// Check that the chunk of bits is completely used.
	return NarrowedUsedBits.isAllOnesValue();
	}

	/// \brief Check whether or not \p First and \p Second are next to each other
	/// in memory. This means that there is no hole between the bits loaded
	/// by \p First and the bits loaded by \p Second.
	///
	/// \pre Both slices share the same non-null Origin and their used bits do
	/// not overlap (both are asserted).
	static bool areSlicesNextToEachOther(const LoadedSlice &First,
	                                     const LoadedSlice &Second) {
	  assert(First.Origin == Second.Origin && First.Origin &&
	         "Unable to match different memory origins.");
	  APInt UsedBits = First.getUsedBits();
	  assert((UsedBits & Second.getUsedBits()) == 0 &&
	         "Slices are not supposed to overlap.");
	  // The union of both masks must form a single dense region.
	  UsedBits |= Second.getUsedBits();
	  return areUsedBitsDense(UsedBits);
	}

	/// \brief Adjust the \p GlobalLSCost according to the target
	/// pairing capabilities and the layout of the slices.
	///
	/// \p LoadedSlices is sorted in place by offset from the base; each pair of
	/// consecutive slices the target can load as a pair removes one load from
	/// \p GlobalLSCost.
	///
	/// \pre \p GlobalLSCost should account for at least as many loads as
	/// there is in the slices in \p LoadedSlices.
	static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
	                                 LoadedSlice::Cost &GlobalLSCost) {
	  unsigned NumberOfSlices = LoadedSlices.size();
	  // If there is less than 2 elements, no pairing is possible.
	  if (NumberOfSlices < 2)
	    return;

	  // Sort the slices so that elements that are likely to be next to each
	  // other in memory are next to each other in the list.
	  std::sort(LoadedSlices.begin(), LoadedSlices.end(),
	            [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
	    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
	    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
	  });
	  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
	  // First (resp. Second) is the first (resp. Second) potentially candidate
	  // to be placed in a paired load.
	  const LoadedSlice *First = nullptr;
	  const LoadedSlice *Second = nullptr;
	  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
	                // Set the beginning of the pair.
	                                                           First = Second) {
	    Second = &LoadedSlices[CurrSlice];

	    // If First is NULL, it means we start a new pair.
	    // Get to the next slice.
	    if (!First)
	      continue;

	    EVT LoadedType = First->getLoadedType();

	    // If the types of the slices are different, we cannot pair them.
	    if (LoadedType != Second->getLoadedType())
	      continue;

	    // Check if the target supplies paired loads for this type.
	    unsigned RequiredAlignment = 0;
	    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
	      // move to the next pair, this type is hopeless.
	      Second = nullptr;
	      continue;
	    }
	    // Check if we meet the alignment requirement.
	    if (RequiredAlignment > First->getAlignment())
	      continue;

	    // Check that both loads are next to each other in memory.
	    if (!areSlicesNextToEachOther(*First, *Second))
	      continue;

	    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
	    --GlobalLSCost.Loads;
	    // Move to the next pair.
	    Second = nullptr;
	  }
	}

	/// \brief Decide whether replacing the original load by the given slices
	/// is worthwhile.
	///
	/// Slicing is currently accepted only when (1) exactly two slices are
	/// involved, (2) the bits they use are dense, i.e. the slices are next to
	/// each other in memory, and (3) the accumulated cost of the slices
	/// (\see LoadedSlice::Cost) is smaller than the cost of the original load.
	/// When StressLoadSlicing is set, any set of more than one slice is accepted.
	///
	/// Note: The order of the elements in \p LoadedSlices may be modified, but not
	/// the elements themselves.
	///
	/// FIXME: When the cost model will be mature enough, we can relax
	/// constraints (1) and (2).
	static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
	                                const APInt &UsedBits, bool ForCodeSize) {
	  const unsigned SliceCount = LoadedSlices.size();
	  if (StressLoadSlicing)
	    return SliceCount > 1;

	  // Constraints (1) and (2): exactly two slices covering a dense bit range.
	  if (SliceCount != 2 || !areUsedBitsDense(UsedBits))
	    return false;

	  // Constraint (3): compare the cost of both configurations.
	  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
	  // The original code has one big load.
	  OrigCost.Loads = 1;
	  for (const LoadedSlice &Slice : LoadedSlices) {
	    // Sum up what each slice costs on its own...
	    LoadedSlice::Cost SliceCost(Slice, ForCodeSize);
	    GlobalSlicingCost += SliceCost;

	    // ...and charge the original configuration with whatever this slice
	    // allows us to save.
	    OrigCost.addSliceGain(Slice);
	  }

	  // Paired loads, when the target has them, make the sliced version cheaper.
	  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
	  return OrigCost > GlobalSlicingCost;
	}

	/// \brief If the given load, \p N, is used only by trunc or trunc(lshr)
	/// operations, split it in the various pieces being extracted.
	///
	/// This sort of thing is introduced by SROA.
	/// This slicing takes care not to insert overlapping loads.
	///
	/// The transformation is abandoned (returning false) when it runs before
	/// DAG legalization is done, when the load is volatile, not a normal load
	/// or not of integer type, when a value use is anything but trunc or
	/// trunc(lshr by constant), when a slice has an unsupported width or shift,
	/// overlaps a previous slice or is not legal, or when slicing is not deemed
	/// profitable.
	///
	/// \pre \p N is a LoadSDNode.
	/// \returns true if the load has been replaced by one load per slice.
	bool DAGCombiner::SliceUpLoad(SDNode *N) {
	  if (Level < AfterLegalizeDAG)
	    return false;

	  LoadSDNode *LD = cast<LoadSDNode>(N);
	  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
	      !LD->getValueType(0).isInteger())
	    return false;

	  // Keep track of already used bits to detect overlapping values.
	  // In that case, we will just abort the transformation.
	  APInt UsedBits(LD->getValueSizeInBits(0), 0);

	  SmallVector<LoadedSlice, 4> LoadedSlices;

	  // Check if this load is used as several smaller chunks of bits.
	  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
	  // of computation for each trunc.
	  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
	       UI != UIEnd; ++UI) {
	    // Skip the uses of the chain.
	    if (UI.getUse().getResNo() != 0)
	      continue;

	    // User is a node pointer: it is dereferenced with '->' below and may be
	    // re-pointed at the single user of the shift.
	    SDNode *User = *UI;
	    unsigned Shift = 0;

	    // Check if this is a trunc(lshr).
	    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
	        isa<ConstantSDNode>(User->getOperand(1))) {
	      Shift = User->getConstantOperandVal(1);
	      User = *User->use_begin();
	    }

	    // At this point, User is a Truncate, iff we encountered, trunc or
	    // trunc(lshr).
	    if (User->getOpcode() != ISD::TRUNCATE)
	      return false;

	    // The width of the type must be a power of 2 and greater than 8-bits.
	    // Otherwise the load cannot be represented in LLVM IR.
	    // Moreover, if we shifted with a non-8-bits multiple, the slice
	    // will be across several bytes. We do not support that.
	    unsigned Width = User->getValueSizeInBits(0);
	    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
	      return false;

	    // Build the slice for this chain of computations.
	    LoadedSlice LS(User, LD, Shift, &DAG);
	    APInt CurrentUsedBits = LS.getUsedBits();

	    // Check if this slice overlaps with another.
	    if ((CurrentUsedBits & UsedBits) != 0)
	      return false;
	    // Update the bits used globally.
	    UsedBits |= CurrentUsedBits;

	    // Check if the new slice would be legal.
	    if (!LS.isLegal())
	      return false;

	    // Record the slice.
	    LoadedSlices.push_back(LS);
	  }

	  // Abort slicing if it does not seem to be profitable.
	  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
	    return false;

	  ++SlicedLoads;

	  // Rewrite each chain to use an independent load.
	  // By construction, each chain can be represented by a unique load.

	  // Prepare the argument for the new token factor for all the slices.
	  SmallVector<SDValue, 8> ArgChains;
	  for (SmallVectorImpl<LoadedSlice>::const_iterator
	           LSIt = LoadedSlices.begin(),
	           LSItEnd = LoadedSlices.end();
	       LSIt != LSItEnd; ++LSIt) {
	    SDValue SliceInst = LSIt->loadSlice();
	    CombineTo(LSIt->Inst, SliceInst, true);
	    // The slice may be wrapped in one extra node; step through it to reach
	    // the load whose chain result is needed.
	    if (SliceInst.getOpcode() != ISD::LOAD)
	      SliceInst = SliceInst.getOperand(0);
	    assert(SliceInst->getOpcode() == ISD::LOAD &&
	           "It takes more than a zext to get to the loaded slice!!");
	    ArgChains.push_back(SliceInst.getValue(1));
	  }

	  // Tie the chains of all slice loads together and make the users of the
	  // original load's chain depend on the result.
	  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
	                              ArgChains);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
	  AddToWorklist(Chain.getNode());
	  return true;
	}

	/// Recognize \p V as (and (load \p Ptr), imm) where the immediate clears a
	/// contiguous, byte-aligned run of 1, 2 or 4 bytes of the loaded value and
	/// the load is reachable from \p Chain (either directly or as an operand of
	/// a TokenFactor).
	///
	/// \returns a pair holding the number of bytes being masked out (first) and
	/// the offset of that run in bytes (second), or (0, 0) when \p V does not
	/// match.
	static std::pair<unsigned, unsigned>
	CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
	  const std::pair<unsigned, unsigned> NoMatch(0, 0);

	  // Structural match: an AND whose operands are a normal load and a
	  // constant.
	  if (V->getOpcode() != ISD::AND ||
	      !isa<ConstantSDNode>(V->getOperand(1)) ||
	      !ISD::isNormalLoad(V->getOperand(0).getNode()))
	    return NoMatch;

	  // The load has to read from the pointer we were given.
	  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
	  if (LD->getBasePtr() != Ptr)
	    return NoMatch;

	  // The store should be chained directly to the load or be an operand of a
	  // tokenfactor.
	  if (LD != Chain.getNode()) {
	    if (Chain->getOpcode() != ISD::TokenFactor)
	      return NoMatch;

	    bool LoadIsChainOperand = false;
	    for (const SDValue &ChainOp : Chain->op_values()) {
	      if (ChainOp.getNode() == LD) {
	        LoadIsChainOperand = true;
	        break;
	      }
	    }
	    if (!LoadIsChainOperand)
	      return NoMatch;
	  }

	  // Only i16, i32 and i64 values are handled.
	  EVT VT = V.getValueType();
	  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	    return NoMatch;

	  // Invert the constant mask so that the bits being masked out are 1 and
	  // the bits being kept are 0. The sign-extended value is used so that the
	  // leading bits follow the sign bit for uniformity.
	  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
	  unsigned LeadingZeros = countLeadingZeros(NotMask);
	  if (LeadingZeros & 7)
	    return NoMatch; // Must be multiple of a byte.
	  unsigned TrailingZeros = countTrailingZeros(NotMask);
	  if (TrailingZeros & 7)
	    return NoMatch; // Must be multiple of a byte.
	  if (LeadingZeros == 64)
	    return NoMatch; // All zero mask.

	  // The set bits must form a single continuous run (pattern 0*1+0*).
	  if (countTrailingOnes(NotMask >> TrailingZeros) + TrailingZeros +
	          LeadingZeros != 64)
	    return NoMatch;

	  // Express the leading zero count relative to the real width of the value
	  // rather than to the 64 bits of NotMask.
	  if (VT != MVT::i64 && LeadingZeros)
	    LeadingZeros -= 64-V.getValueSizeInBits();

	  // Anything other than a 1, 2 or 4 byte run is rejected (for instance an
	  // all-ones mask or a 5-byte mask).
	  unsigned MaskedBytes =
	      (V.getValueSizeInBits()-LeadingZeros-TrailingZeros)/8;
	  if (MaskedBytes != 1 && MaskedBytes != 2 && MaskedBytes != 4)
	    return NoMatch;

	  // The run must start at a multiple of its own size so that the narrow
	  // access is aligned the same as the access width.
	  if (TrailingZeros && TrailingZeros/8 % MaskedBytes)
	    return NoMatch;

	  return std::pair<unsigned, unsigned>(MaskedBytes, TrailingZeros/8);
	}

	/// Check to see if IVal is something that provides a value as specified by
	/// MaskInfo. If so, replace the specified store with a narrower store of
	/// truncated IVal.
	///
	/// \param MaskInfo number of bytes (first) and byte offset (second) of the
	///        region being replaced.
	/// \param IVal     the value that is or'ed into the masked load.
	/// \param St       the wide store to narrow.
	/// \param DC       the combiner, used to reach the DAG and legality queries.
	/// \returns the node of the new narrow store, or nullptr when IVal has bits
	///          set outside the masked region or the narrow type is not legal.
	static SDNode *
	ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
	                                SDValue IVal, StoreSDNode *St,
	                                DAGCombiner *DC) {
	  unsigned NumBytes = MaskInfo.first;
	  unsigned ByteShift = MaskInfo.second;
	  SelectionDAG &DAG = DC->getDAG();

	  // Check to see if IVal is all zeros in the part being masked in by the 'or'
	  // that uses this. If not, this is not a replacement.
	  // The bit range is [ByteShift*8, (ByteShift+NumBytes)*8): byte quantities
	  // converted to bit positions.
	  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
	                                  ByteShift*8, (ByteShift+NumBytes)*8);
	  if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr;

	  // Check that it is legal on the target to do this. It is legal if the new
	  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
	  // legalization.
	  MVT VT = MVT::getIntegerVT(NumBytes*8);
	  if (!DC->isTypeLegal(VT))
	    return nullptr;

	  // Okay, we can do this! Replace the 'St' store with a store of IVal that is
	  // shifted by ByteShift and truncated down to NumBytes.
	  if (ByteShift) {
	    SDLoc DL(IVal);
	    IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
	                       DAG.getConstant(ByteShift*8, DL,
	                                    DC->getShiftAmountTy(IVal.getValueType())));
	  }

	  // Figure out the offset for the store and the alignment of the access.
	  unsigned StOffset;
	  unsigned NewAlign = St->getAlignment();

	  // On little-endian targets the byte offset is the shift itself; otherwise
	  // it is counted from the other end of the stored value.
	  if (DAG.getDataLayout().isLittleEndian())
	    StOffset = ByteShift;
	  else
	    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;

	  SDValue Ptr = St->getBasePtr();
	  if (StOffset) {
	    SDLoc DL(IVal);
	    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(),
	                      Ptr, DAG.getConstant(StOffset, DL, Ptr.getValueType()));
	    NewAlign = MinAlign(NewAlign, StOffset);
	  }

	  // Truncate down to the new size.
	  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);

	  ++OpsNarrowed;
	  return DAG
	      .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
	                St->getPointerInfo().getWithOffset(StOffset), NewAlign)
	      .getNode();
	}

	/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
	/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
	/// narrowing the load and store if it would end up being a win for performance
	/// or code size.
	///
	/// \param N the store node to examine; it must be a StoreSDNode.
	/// \returns the replacement (narrower) store, or an empty SDValue when no
	///          narrowing was performed.
	SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
	  StoreSDNode *ST = cast<StoreSDNode>(N);
	  if (ST->isVolatile())
	    return SDValue();

	  SDValue Chain = ST->getChain();
	  SDValue Value = ST->getValue();
	  SDValue Ptr = ST->getBasePtr();
	  EVT VT = Value.getValueType();

	  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
	    return SDValue();

	  unsigned Opc = Value.getOpcode();

	  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
	  // is a byte mask indicating a consecutive number of bytes, check to see if
	  // Y is known to provide just those bytes. If so, we try to replace the
	  // load + replace + store sequence with a single (narrower) store, which makes
	  // the load dead.
	  if (Opc == ISD::OR) {
	    std::pair<unsigned, unsigned> MaskedLoad;
	    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
	    if (MaskedLoad.first)
	      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
	                                                  Value.getOperand(1), ST,this))
	        return SDValue(NewST, 0);

	    // Or is commutative, so try swapping X and Y.
	    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
	    if (MaskedLoad.first)
	      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
	                                                  Value.getOperand(0), ST,this))
	        return SDValue(NewST, 0);
	  }

	  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
	      Value.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();

	  SDValue N0 = Value.getOperand(0);
	  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
	      Chain == SDValue(N0.getNode(), 1)) {
	    LoadSDNode *LD = cast<LoadSDNode>(N0);
	    if (LD->getBasePtr() != Ptr ||
	        LD->getPointerInfo().getAddrSpace() !=
	        ST->getPointerInfo().getAddrSpace())
	      return SDValue();

	    // Find the type to narrow it the load / op / store to.
	    SDValue N1 = Value.getOperand(1);
	    unsigned BitWidth = N1.getValueSizeInBits();
	    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
	    // For 'and', the bits that change are the ones cleared by the mask, so
	    // work on the complement.
	    if (Opc == ISD::AND)
	      Imm ^= APInt::getAllOnesValue(BitWidth);
	    if (Imm == 0 || Imm.isAllOnesValue())
	      return SDValue();
	    unsigned ShAmt = Imm.countTrailingZeros();
	    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
	    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
	    // The narrowing should be profitable, the load/store operation should be
	    // legal (or custom) and the store size should be equal to the NewVT width.
	    while (NewBW < BitWidth &&
	           (NewVT.getStoreSizeInBits() != NewBW ||
	            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
	            !TLI.isNarrowingProfitable(VT, NewVT))) {
	      NewBW = NextPowerOf2(NewBW);
	      NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
	    }
	    if (NewBW >= BitWidth)
	      return SDValue();

	    // If the lsb changed does not start at the type bitwidth boundary,
	    // start at the previous one.
	    if (ShAmt % NewBW)
	      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
	    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
	                                   std::min(BitWidth, ShAmt + NewBW));
	    // Only narrow when every changed bit falls inside the chosen window.
	    if ((Imm & Mask) == Imm) {
	      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
	      if (Opc == ISD::AND)
	        NewImm ^= APInt::getAllOnesValue(NewBW);
	      uint64_t PtrOff = ShAmt / 8;
	      // For big endian targets, we need to adjust the offset to the pointer to
	      // load the correct bytes.
	      if (DAG.getDataLayout().isBigEndian())
	        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;

	      // Give up if the narrowed access would be under-aligned for its type.
	      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
	      Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
	      if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
	        return SDValue();

	      SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD),
	                                   Ptr.getValueType(), Ptr,
	                                   DAG.getConstant(PtrOff, SDLoc(LD),
	                                                   Ptr.getValueType()));
	      SDValue NewLD =
	          DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
	                      LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
	                      LD->getMemOperand()->getFlags(), LD->getAAInfo());
	      SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
	                                   DAG.getConstant(NewImm, SDLoc(Value),
	                                                   NewVT));
	      SDValue NewST =
	          DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
	                       ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

	      AddToWorklist(NewPtr.getNode());
	      AddToWorklist(NewLD.getNode());
	      AddToWorklist(NewVal.getNode());
	      WorklistRemover DeadNodes(*this);
	      // Redirect users of the old load's chain to the narrow load's chain.
	      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
	      ++OpsNarrowed;
	      return NewST;
	    }
	  }

	  return SDValue();
	}

	/// For a given floating point load / store pair, if the load value isn't used
	/// by any other operations, then consider transforming the pair to integer
	/// load / store operations if the target deems the transformation profitable.
	///
	/// \param N the store node to examine; it must be a StoreSDNode.
	/// \returns the new integer store, or an empty SDValue when the pair does
	///          not qualify.
	SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
	  StoreSDNode *ST = cast<StoreSDNode>(N);
	  SDValue Chain = ST->getChain();
	  SDValue Value = ST->getValue();
	  // The stored value must be a normal load with this store as its only user,
	  // and the store must be chained directly to that load.
	  if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
	      Value.hasOneUse() &&
	      Chain == SDValue(Value.getNode(), 1)) {
	    LoadSDNode *LD = cast<LoadSDNode>(Value);
	    EVT VT = LD->getMemoryVT();
	    if (!VT.isFloatingPoint() ||
	        VT != ST->getMemoryVT() ||
	        LD->isNonTemporal() ||
	        ST->isNonTemporal() ||
	        LD->getPointerInfo().getAddrSpace() != 0 ||
	        ST->getPointerInfo().getAddrSpace() != 0)
	      return SDValue();

	    // The integer type of the same size must be legal for both operations
	    // and the target must want the transformation.
	    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
	    if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
	        !TLI.isOperationLegal(ISD::STORE, IntVT) ||
	        !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
	        !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
	      return SDValue();

	    // Both accesses must be at least as aligned as the integer type's ABI
	    // alignment.
	    unsigned LDAlign = LD->getAlignment();
	    unsigned STAlign = ST->getAlignment();
	    Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
	    unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy);
	    if (LDAlign < ABIAlign || STAlign < ABIAlign)
	      return SDValue();

	    SDValue NewLD =
	        DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
	                    LD->getPointerInfo(), LDAlign);

	    SDValue NewST =
	        DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(),
	                     ST->getPointerInfo(), STAlign);

	    AddToWorklist(NewLD.getNode());
	    AddToWorklist(NewST.getNode());
	    WorklistRemover DeadNodes(*this);
	    // Redirect users of the old load's chain to the new load's chain.
	    DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
	    ++LdStFP2Int;
	    return NewST;
	  }

	  return SDValue();
	}

	// This is a helper function for visitMUL to check the profitability
	// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
	// MulNode is the original multiply, AddNode is (add x, c1),
	// and ConstNode is c2.
	//
	// If the (add x, c1) has multiple uses, we could increase
	// the number of adds if we make this transformation.
	// It would only be worth doing this if we can remove a
	// multiply in the process. Check for that here.
	// To illustrate:
	// (A + c1) * c3
	// (A + c2) * c3
	// We're checking for cases where we have common "c3 * A" expressions.
	//
	// Returns true when the add has a single use, or when another multiply by
	// ConstNode is found whose other operand is either x itself or an
	// (add x, constant).
	bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
	                                              SDValue &AddNode,
	                                              SDValue &ConstNode) {
	  // If the add only has one use, this would be OK to do.
	  if (AddNode.getNode()->hasOneUse())
	    return true;

	  // Walk all the users of the constant with which we're multiplying.
	  for (SDNode *Use : ConstNode->uses()) {
	    if (Use == MulNode) // This use is the one we're on right now. Skip it.
	      continue;

	    if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
	      SDNode *OtherOp;
	      SDNode *MulVar = AddNode.getOperand(0).getNode();

	      // OtherOp is what we're multiplying against the constant.
	      if (Use->getOperand(0) == ConstNode)
	        OtherOp = Use->getOperand(1).getNode();
	      else
	        OtherOp = Use->getOperand(0).getNode();

	      // Check to see if multiply is with the same operand of our "add".
	      //
	      // ConstNode = CONST
	      // Use = ConstNode * A <-- visiting Use. OtherOp is A.
	      // ...
	      // AddNode = (A + c1) <-- MulVar is A.
	      // = AddNode * ConstNode <-- current visiting instruction.
	      //
	      // If we make this transformation, we will have a common
	      // multiply (ConstNode * A) that we can save.
	      if (OtherOp == MulVar)
	        return true;

	      // Now check to see if a future expansion will give us a common
	      // multiply.
	      //
	      // ConstNode = CONST
	      // AddNode = (A + c1)
	      // ... = AddNode * ConstNode <-- current visiting instruction.
	      // ...
	      // OtherOp = (A + c2)
	      // Use = OtherOp * ConstNode <-- visiting Use.
	      //
	      // If we make this transformation, we will have a common
	      // multiply (CONST * A) after we also do the same transformation
	      // to the "Use" instruction.
	      if (OtherOp->getOpcode() == ISD::ADD &&
	          DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
	          OtherOp->getOperand(0).getNode() == MulVar)
	        return true;
	    }
	  }

	  // Didn't find a case where this would be profitable.
	  return false;
	}

	/// Strip every ISD::BITCAST wrapped around \p V and return the first
	/// operand found that is not itself a bitcast.
	static SDValue peekThroughBitcast(SDValue V) {
	  for (;;) {
	    if (V.getOpcode() != ISD::BITCAST)
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Build the chain for a store that merges the first \p NumStores entries of
	/// \p StoreNodes: a TokenFactor of the input chains of those stores,
	/// leaving out any chain that is itself one of the stores being merged.
	SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
	                                         unsigned NumStores) {
	  SDLoc StoreDL(StoreNodes[0].MemNode);

	  // Remember which nodes are being merged.
	  SmallPtrSet<const SDNode *, 8> MergedStores;
	  for (unsigned Idx = 0; Idx != NumStores; ++Idx)
	    MergedStores.insert(StoreNodes[Idx].MemNode);

	  // Keep only the chains that come from outside of the merged set.
	  SmallVector<SDValue, 8> Chains;
	  for (unsigned Idx = 0; Idx != NumStores; ++Idx) {
	    SDValue InChain = StoreNodes[Idx].MemNode->getChain();
	    if (!MergedStores.count(InChain.getNode()))
	      Chains.push_back(InChain);
	  }

	  assert(Chains.size() > 0 && "Chain should have generated a chain");
	  return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
	}

	/// Merge the first \p NumStores stores of \p StoreNodes, whose stored values
	/// are constants or extracted vector elements, into a single store.
	///
	/// \param StoreNodes    the candidate stores; entry 0 provides the base
	///                      pointer, pointer info and alignment of the new store.
	/// \param MemVT         the memory type of each individual store.
	/// \param NumStores     how many leading entries of \p StoreNodes to merge.
	/// \param IsConstantSrc true when the stored values are constants.
	/// \param UseVector     true to build a vector value, false to build one
	///                      wide integer constant.
	/// \param UseTrunc      true to emit the merged store as a truncating store
	///                      of the value extended to the type it legalizes to.
	/// \returns true if the stores were merged; false when there are fewer than
	///          two stores or a constant would need a floating point truncation.
	bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
	    SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
	    bool IsConstantSrc, bool UseVector, bool UseTrunc) {
	  // Make sure we have something to merge.
	  if (NumStores < 2)
	    return false;

	  // The latest Node in the DAG.
	  SDLoc DL(StoreNodes[0].MemNode);

	  int64_t ElementSizeBits = MemVT.getStoreSizeInBits();
	  unsigned SizeInBits = NumStores * ElementSizeBits;
	  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

	  EVT StoreTy;
	  if (UseVector) {
	    unsigned Elts = NumStores * NumMemElts;
	    // Get the type for the merged vector store.
	    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
	  } else
	    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);

	  SDValue StoredVal;
	  if (UseVector) {
	    if (IsConstantSrc) {
	      SmallVector<SDValue, 8> BuildVector;
	      for (unsigned I = 0; I != NumStores; ++I) {
	        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
	        SDValue Val = St->getValue();
	        // If constant is of the wrong type, convert it now.
	        if (MemVT != Val.getValueType()) {
	          Val = peekThroughBitcast(Val);
	          // Deal with constants of wrong size.
	          if (ElementSizeBits != Val.getValueSizeInBits()) {
	            EVT IntMemVT =
	                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
	            if (isa<ConstantFPSDNode>(Val)) {
	              // Not clear how to truncate FP values.
	              return false;
	            } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
	              Val = DAG.getConstant(C->getAPIntValue()
	                                        .zextOrTrunc(Val.getValueSizeInBits())
	                                        .zextOrTrunc(ElementSizeBits),
	                                    SDLoc(C), IntMemVT);
	          }
	          // Make sure correctly size type is the correct type.
	          Val = DAG.getBitcast(MemVT, Val);
	        }
	        BuildVector.push_back(Val);
	      }
	      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
	                                               : ISD::BUILD_VECTOR,
	                              DL, StoreTy, BuildVector);
	    } else {
	      SmallVector<SDValue, 8> Ops;
	      for (unsigned i = 0; i < NumStores; ++i) {
	        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
	        SDValue Val = peekThroughBitcast(St->getValue());
	        // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
	        // type MemVT. If the underlying value is not the correct
	        // type, but it is an extraction of an appropriate vector we
	        // can recast Val to be of the correct type. This may require
	        // converting between EXTRACT_VECTOR_ELT and
	        // EXTRACT_SUBVECTOR.
	        if ((MemVT != Val.getValueType()) &&
	            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
	             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
	          SDValue Vec = Val.getOperand(0);
	          EVT MemVTScalarTy = MemVT.getScalarType();
	          // We may need to add a bitcast here to get types to line up.
	          if (MemVTScalarTy != Vec.getValueType()) {
	            unsigned Elts = Vec.getValueType().getSizeInBits() /
	                            MemVTScalarTy.getSizeInBits();
	            EVT NewVecTy =
	                EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts);
	            Vec = DAG.getBitcast(NewVecTy, Vec);
	          }
	          auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR
	                                        : ISD::EXTRACT_VECTOR_ELT;
	          Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Val.getOperand(1));
	        }
	        Ops.push_back(Val);
	      }

	      // Build the extracted vector elements back into a vector.
	      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
	                                               : ISD::BUILD_VECTOR,
	                              DL, StoreTy, Ops);
	    }
	  } else {
	    // We should always use a vector store when merging extracted vector
	    // elements, so this path implies a store of constants.
	    assert(IsConstantSrc && "Merged vector elements should use vector store");

	    APInt StoreInt(SizeInBits, 0);

	    // Construct a single integer constant which is made of the smaller
	    // constant inputs.
	    // Each iteration shifts the accumulated value left and ors the next
	    // element into the low bits, so on little-endian targets the stores are
	    // visited last-to-first to leave store 0 in the lowest bits.
	    bool IsLE = DAG.getDataLayout().isLittleEndian();
	    for (unsigned i = 0; i < NumStores; ++i) {
	      unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
	      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);

	      SDValue Val = St->getValue();
	      StoreInt <<= ElementSizeBits;
	      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
	        StoreInt |= C->getAPIntValue()
	                        .zextOrTrunc(ElementSizeBits)
	                        .zextOrTrunc(SizeInBits);
	      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
	        StoreInt |= C->getValueAPF()
	                        .bitcastToAPInt()
	                        .zextOrTrunc(ElementSizeBits)
	                        .zextOrTrunc(SizeInBits);
	        // If fp truncation is necessary give up for now.
	        if (MemVT.getSizeInBits() != ElementSizeBits)
	          return false;
	      } else {
	        llvm_unreachable("Invalid constant element type");
	      }
	    }

	    // Create the new Load and Store operations.
	    StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
	  }

	  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);

	  // make sure we use trunc store if it's necessary to be legal.
	  SDValue NewStore;
	  if (!UseTrunc) {
	    NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
	                            FirstInChain->getPointerInfo(),
	                            FirstInChain->getAlignment());
	  } else { // Must be realized as a trunc store
	    EVT LegalizedStoredValueTy =
	        TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
	    unsigned LegalizedStoreSize = LegalizedStoredValueTy.getSizeInBits();
	    ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
	    SDValue ExtendedStoreVal =
	        DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
	                        LegalizedStoredValueTy);
	    NewStore = DAG.getTruncStore(
	        NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
	        FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
	        FirstInChain->getAlignment(),
	        FirstInChain->getMemOperand()->getFlags());
	  }

	  // Replace all merged stores with the new store.
	  for (unsigned i = 0; i < NumStores; ++i)
	    CombineTo(StoreNodes[i].MemNode, NewStore);

	  AddToWorklist(NewChain.getNode());
	  return true;
	}

	/// Collect in \p StoreNodes the stores that may be merged with \p St.
	///
	/// A candidate is a non-volatile, non-indexed store hanging off the same
	/// chain root as \p St, storing the same kind of value (constant, load or
	/// extracted vector element) as \p St, and whose address has the same base
	/// and index as the address of \p St. Each candidate is recorded together
	/// with its offset relative to \p St. Nothing is collected when \p St has
	/// no base, an undef base, or stores a load of a different memory type.
	void DAGCombiner::getStoreMergeCandidates(
	StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
	// This holds the base pointer, index, and the offset in bytes from the base
	// pointer.
	- BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
	+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
	EVT MemVT = St->getMemoryVT();

	SDValue Val = peekThroughBitcast(St->getValue());
	// We must have a base and an offset.
	if (!BasePtr.getBase().getNode())
	return;

	// Do not handle stores to undef base pointers.
	if (BasePtr.getBase().isUndef())
	return;

	// Classify the value stored by St; candidates must fall in the same class.
	bool IsConstantSrc = isa<ConstantSDNode>(Val) \|\| isa<ConstantFPSDNode>(Val);
	bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
	bool IsLoadSrc = isa<LoadSDNode>(Val);
	BaseIndexOffset LBasePtr;
	// Match on the load's base pointer if relevant.
	EVT LoadVT;
	if (IsLoadSrc) {
	auto *Ld = cast<LoadSDNode>(Val);
	- LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
	+ LBasePtr = BaseIndexOffset::match(Ld, DAG);
	LoadVT = Ld->getMemoryVT();
	// Load and store should be the same type.
	if (MemVT != LoadVT)
	return;
	}
	// Returns true if Other may be merged with St. On return Ptr holds the
	// decomposed address of Other and, through equalBaseIndex, Offset receives
	// its distance from the address of St.
	auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
	int64_t &Offset) -> bool {
	if (Other->isVolatile() \|\| Other->isIndexed())
	return false;
	SDValue Val = peekThroughBitcast(Other->getValue());
	// Allow merging constants of different types as integers.
	bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
	: Other->getMemoryVT() != MemVT;
	if (IsLoadSrc) {
	if (NoTypeMatch)
	return false;
	// The Load's Base Ptr must also match
	if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
	- auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
	+ auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
	if (LoadVT != OtherLd->getMemoryVT())
	return false;
	if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
	return false;
	} else
	return false;
	}
	if (IsConstantSrc) {
	if (NoTypeMatch)
	return false;
	if (!(isa<ConstantSDNode>(Val) \|\| isa<ConstantFPSDNode>(Val)))
	return false;
	}
	if (IsExtractVecSrc) {
	// Do not merge truncated stores here.
	if (Other->isTruncatingStore())
	return false;
	if (!MemVT.bitsEq(Val.getValueType()))
	return false;
	if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
	Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	return false;
	}
	- Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
	+ Ptr = BaseIndexOffset::match(Other, DAG);
	return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
	};

	// We are looking for a root node which is an ancestor to all mergeable
	// stores. We search up through a load, to our root and then down
	// through all children. For instance we will find Store{1,2,3} if
	// St is Store1, Store2, or Store3, where the root is not a load,
	// which is always true for nonvolatile ops. TODO: Expand
	// the search to find all valid candidates through multiple layers of loads.
	//
	// Root
	// \|-------\|-------\|
	// Load Load Store3
	// \| \|
	// Store1 Store2
	//
	// FIXME: We should be able to climb and
	// descend TokenFactors to find candidates as well.

	SDNode *RootNode = (St->getChain()).getNode();

	if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
	// St is chained to a load: step up to the chain of that load, then visit
	// the stores chained to any load that hangs off that root.
	RootNode = Ldn->getChain().getNode();
	for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
	if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
	for (auto I2 = (I)->use_begin(), E2 = (I)->use_end(); I2 != E2; ++I2)
	if (I2.getOperandNo() == 0)
	if (StoreSDNode OtherST = dyn_cast<StoreSDNode>(I2)) {
	BaseIndexOffset Ptr;
	int64_t PtrDiff;
	if (CandidateMatch(OtherST, Ptr, PtrDiff))
	StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
	}
	} else
	// Otherwise visit the stores that use the root as operand 0 directly.
	for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
	if (I.getOperandNo() == 0)
	if (StoreSDNode OtherST = dyn_cast<StoreSDNode>(I)) {
	BaseIndexOffset Ptr;
	int64_t PtrDiff;
	if (CandidateMatch(OtherST, Ptr, PtrDiff))
	StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
	}
	}

	// Merging stores must not introduce a cycle in the DAG. A candidate may
	// depend on another candidate indirectly through one of its non-chain
	// operands (dependencies through the chain are already accounted for).
	// All candidates are checked together by searching upwards from their
	// non-chain operands.
	//
	// Returns true when none of the first NumStores candidates is found to be a
	// predecessor of those operands and the search stayed under its step limit;
	// returns false otherwise.
	bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
	    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
	  // FIXME: We should be able to truncate a full search of
	  // predecessors by doing a BFS and keeping tabs the originating
	  // stores from which worklist nodes come from in a similar way to
	  // TokenFactor simplfication.

	  SmallPtrSet<const SDNode *, 16> Visited;
	  SmallVector<const SDNode *, 8> Worklist;
	  unsigned int Max = 8192;

	  // Seed the search with every non-chain operand (operand 0 is skipped) of
	  // the candidates, since loops can only arise through those.
	  for (unsigned StoreIdx = 0; StoreIdx != NumStores; ++StoreIdx) {
	    SDNode *Store = StoreNodes[StoreIdx].MemNode;
	    for (unsigned OpIdx = 1, NumOps = Store->getNumOperands(); OpIdx < NumOps;
	         ++OpIdx)
	      Worklist.push_back(Store->getOperand(OpIdx).getNode());
	  }

	  // Walk the DAG, stopping as soon as a candidate store is reached or the
	  // search had to be cut short (in which case we fail conservatively).
	  for (unsigned StoreIdx = 0; StoreIdx != NumStores; ++StoreIdx) {
	    const bool ReachesStore = SDNode::hasPredecessorHelper(
	        StoreNodes[StoreIdx].MemNode, Visited, Worklist, Max);
	    if (ReachesStore || Visited.size() >= Max)
	      return false;
	  }
	  return true;
	}

	// Tries to merge St with the other stores returned by
	// getStoreMergeCandidates into wider stores. The candidates are sorted by
	// their offset from the common base and then consumed from the front in a
	// loop. Three kinds of stored value are handled, each by its own section of
	// the loop body: constants, extracted vector elements / subvectors, and
	// loads (which are merged into one wide load feeding one wide store).
	// Returns true if at least one group of stores was merged.
	bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
	if (OptLevel == CodeGenOpt::None)
	return false;

	EVT MemVT = St->getMemoryVT();
	int64_t ElementSizeBytes = MemVT.getStoreSize();
	unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

	// Even a pair of these stores would exceed MaximumLegalStoreInBits.
	if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
	return false;

	// Set when the function carries the noimplicitfloat attribute; it disables
	// the vector-typed merges chosen further down.
	bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);

	// This function cannot currently deal with non-byte-sized memory sizes.
	if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
	return false;

	if (!MemVT.isSimple())
	return false;

	// Perform an early exit check. Do not bother looking at stored values that
	// are not constants, loads, or extracted vector elements.
	SDValue StoredVal = peekThroughBitcast(St->getValue());
	bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
	bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) \|\|
	isa<ConstantFPSDNode>(StoredVal);
	bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);

	if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
	return false;

	SmallVector<MemOpLink, 8> StoreNodes;
	// Find potential store merge candidates by searching through chain sub-DAG
	getStoreMergeCandidates(St, StoreNodes);

	// Check if there is anything to merge.
	if (StoreNodes.size() < 2)
	return false;

	// Sort the memory operands according to their distance from the
	// base pointer.
	std::sort(StoreNodes.begin(), StoreNodes.end(),
	[](MemOpLink LHS, MemOpLink RHS) {
	return LHS.OffsetFromBase < RHS.OffsetFromBase;
	});

	// Store Merge attempts to merge the lowest stores. This generally
	// works out as if successful, as the remaining stores are checked
	// after the first collection of stores is merged. However, in the
	// case that a non-mergeable store is found first, e.g., {p[-2],
	// p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
	// mergeable cases. To prevent this, we prune such stores from the
	// front of StoreNodes here.

	// RV accumulates whether any merge has happened; it is the return value.
	bool RV = false;
	// Every iteration removes at least one entry from the front of StoreNodes
	// (or returns), so the loop terminates.
	while (StoreNodes.size() > 1) {
	// Skip leading stores whose successor does not start exactly
	// ElementSizeBytes after them.
	unsigned StartIdx = 0;
	while ((StartIdx + 1 < StoreNodes.size()) &&
	StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
	StoreNodes[StartIdx + 1].OffsetFromBase)
	++StartIdx;

	// Bail if we don't have enough candidates to merge.
	if (StartIdx + 1 >= StoreNodes.size())
	return RV;

	if (StartIdx)
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);

	// Scan the memory operations on the chain and find the first
	// non-consecutive store memory address.
	unsigned NumConsecutiveStores = 1;
	int64_t StartAddress = StoreNodes[0].OffsetFromBase;
	// Check that the addresses are consecutive starting from the second
	// element in the list of stores.
	for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
	int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
	if (CurrAddress - StartAddress != (ElementSizeBytes * i))
	break;
	NumConsecutiveStores = i + 1;
	}

	// A single store cannot be merged with anything; drop it and retry.
	if (NumConsecutiveStores < 2) {
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumConsecutiveStores);
	continue;
	}

	// Check that we can merge these candidates without causing a cycle
	if (!checkMergeStoreCandidatesForDependencies(StoreNodes,
	NumConsecutiveStores)) {
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumConsecutiveStores);
	continue;
	}

	// Context and data layout used by the legality queries below.
	LLVMContext &Context = *DAG.getContext();
	const DataLayout &DL = DAG.getDataLayout();

	// Store the constants into memory as one consecutive store.
	if (IsConstantSrc) {
	// FirstInChain is the store with the lowest address (StoreNodes is sorted).
	LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	unsigned FirstStoreAS = FirstInChain->getAddressSpace();
	unsigned FirstStoreAlign = FirstInChain->getAlignment();
	// LastLegalType / LastLegalVectorType hold the largest number of stores
	// (not an index) for which a legal integer / vector type was found.
	unsigned LastLegalType = 1;
	unsigned LastLegalVectorType = 1;
	bool LastIntegerTrunc = false;
	bool NonZero = false;
	// Index of the first zero element seen after a non-zero one, or
	// NumConsecutiveStores if there is none.
	unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
	for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
	StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
	SDValue StoredVal = ST->getValue();
	bool IsElementZero = false;
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
	IsElementZero = C->isNullValue();
	else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
	IsElementZero = C->getConstantFPValue()->isNullValue();
	if (IsElementZero) {
	if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
	FirstZeroAfterNonZero = i;
	}
	NonZero \|= !IsElementZero;

	// Find a legal type for the constant store.
	unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
	EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
	bool IsFast = false;
	if (TLI.isTypeLegal(StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast) {
	LastIntegerTrunc = false;
	LastLegalType = i + 1;
	// Or check whether a truncstore is legal.
	} else if (TLI.getTypeAction(Context, StoreTy) ==
	TargetLowering::TypePromoteInteger) {
	EVT LegalizedStoredValueTy =
	TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
	if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast) {
	LastIntegerTrunc = true;
	LastLegalType = i + 1;
	}
	}

	// We only use vectors if the constant is known to be zero or the target
	// allows it and the function is not marked with the noimplicitfloat
	// attribute.
	if ((!NonZero \|\|
	TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
	!NoVectors) {
	// Find a legal type for the vector store.
	unsigned Elts = (i + 1) * NumMemElts;
	EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
	if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
	TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast)
	LastLegalVectorType = i + 1;
	}
	}

	// Prefer the vector form only when it covers strictly more stores than
	// the integer form.
	bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
	unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;

	// Check if we found a legal integer type that creates a meaningful merge.
	if (NumElem < 2) {
	// We know that candidate stores are in order and of correct
	// shape. While there is no mergeable sequence from the
	// beginning one may start later in the sequence. The only
	// reason a merge of size N could have failed where another of
	// the same size would not have, is if the alignment has
	// improved or we've dropped a non-zero value. Drop as many
	// candidates as we can here.
	unsigned NumSkip = 1;
	while (
	(NumSkip < NumConsecutiveStores) &&
	(NumSkip < FirstZeroAfterNonZero) &&
	(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) {
	NumSkip++;
	}
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
	continue;
	}

	bool Merged = MergeStoresOfConstantsOrVecElts(
	StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
	RV \|= Merged;

	// Remove merged stores for next iteration.
	// (The entries are removed whether or not the merge succeeded.)
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
	continue;
	}

	// When extracting multiple vector elements, try to store them
	// in one vector store rather than a sequence of scalar stores.
	if (IsExtractVecSrc) {
	LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	unsigned FirstStoreAS = FirstInChain->getAddressSpace();
	unsigned FirstStoreAlign = FirstInChain->getAlignment();
	// Largest number of stores for which a legal vector type was found.
	unsigned NumStoresToMerge = 1;
	for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
	StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
	SDValue StVal = peekThroughBitcast(St->getValue());
	// This restriction could be loosened.
	// Bail out if any stored values are not elements extracted from a
	// vector. It should be possible to handle mixed sources, but load
	// sources need more careful handling (see the block of code below that
	// handles consecutive loads).
	if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
	StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	return RV;

	// Find a legal type for the vector store.
	unsigned Elts = (i + 1) * NumMemElts;
	EVT Ty =
	EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
	bool IsFast;
	if (TLI.isTypeLegal(Ty) &&
	TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast)
	NumStoresToMerge = i + 1;
	}

	// Check if we found a legal vector type that creates a meaningful merge.
	if (NumStoresToMerge < 2) {
	// We know that candidate stores are in order and of correct
	// shape. While there is no mergeable sequence from the
	// beginning one may start later in the sequence. The only
	// reason a merge of size N could have failed where another of
	// the same size would not have, is if the alignment has
	// improved. Drop as many candidates as we can here.
	unsigned NumSkip = 1;
	while ((NumSkip < NumConsecutiveStores) &&
	(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
	NumSkip++;

	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
	continue;
	}

	bool Merged = MergeStoresOfConstantsOrVecElts(
	StoreNodes, MemVT, NumStoresToMerge, false, true, false);
	// On failure the same entries are dropped, but RV is left untouched.
	if (!Merged) {
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumStoresToMerge);
	continue;
	}
	// Remove merged stores for next iteration.
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumStoresToMerge);
	RV = true;
	continue;
	}

	// Below we handle the case of multiple consecutive stores that
	// come from multiple consecutive loads. We merge them into a single
	// wide load and a single wide store.

	// Look for load nodes which are used by the stored values.
	SmallVector<MemOpLink, 8> LoadNodes;

	// Find acceptable loads. Loads need to have the same chain (token factor),
	// must not be zext, volatile, indexed, and they must be consecutive.
	BaseIndexOffset LdBasePtr;
	for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
	StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
	SDValue Val = peekThroughBitcast(St->getValue());
	LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val);
	if (!Ld)
	break;

	// Loads must only have one use.
	if (!Ld->hasNUsesOfValue(1, 0))
	break;

	// The memory operands must not be volatile or indexed.
	if (Ld->isVolatile() \|\| Ld->isIndexed())
	break;

	// The stored memory type must be the same.
	if (Ld->getMemoryVT() != MemVT)
	break;

	- BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
	+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
	// If this is not the first ptr that we check.
	int64_t LdOffset = 0;
	if (LdBasePtr.getBase().getNode()) {
	// The base ptr must be the same.
	if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
	break;
	} else {
	// Check that all other base pointers are the same as this one.
	LdBasePtr = LdPtr;
	}

	// We found a potential memory operand to merge.
	LoadNodes.push_back(MemOpLink(Ld, LdOffset));
	}

	// Fewer than two usable loads: drop the first store and retry.
	if (LoadNodes.size() < 2) {
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
	continue;
	}

	// If we have load/store pair instructions and we only have two values,
	// don't bother merging.
	unsigned RequiredAlignment;
	if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
	StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
	continue;
	}
	LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	unsigned FirstStoreAS = FirstInChain->getAddressSpace();
	unsigned FirstStoreAlign = FirstInChain->getAlignment();
	LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
	unsigned FirstLoadAS = FirstLoad->getAddressSpace();
	unsigned FirstLoadAlign = FirstLoad->getAlignment();

	// Scan the memory operations on the chain and find the first
	// non-consecutive load memory address. These variables hold the index in
	// the store node array.
	unsigned LastConsecutiveLoad = 1;
	// This variable refers to the size and not index in the array.
	unsigned LastLegalVectorType = 1;
	unsigned LastLegalIntegerType = 1;
	// Cleared as soon as one of the loads examined by the loop below (which
	// starts at index 1) is not dereferenceable.
	bool isDereferenceable = true;
	bool DoIntegerTruncate = false;
	StartAddress = LoadNodes[0].OffsetFromBase;
	SDValue FirstChain = FirstLoad->getChain();
	for (unsigned i = 1; i < LoadNodes.size(); ++i) {
	// All loads must share the same chain.
	if (LoadNodes[i].MemNode->getChain() != FirstChain)
	break;

	int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
	if (CurrAddress - StartAddress != (ElementSizeBytes * i))
	break;
	LastConsecutiveLoad = i;

	if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
	isDereferenceable = false;

	// Find a legal type for the vector store.
	unsigned Elts = (i + 1) * NumMemElts;
	EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);

	// The wide type must be legal and fast for both the store side and the
	// load side.
	bool IsFastSt, IsFastLd;
	if (TLI.isTypeLegal(StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFastSt) &&
	IsFastSt &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
	FirstLoadAlign, &IsFastLd) &&
	IsFastLd) {
	LastLegalVectorType = i + 1;
	}

	// Find a legal type for the integer store.
	unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
	StoreTy = EVT::getIntegerVT(Context, SizeInBits);
	if (TLI.isTypeLegal(StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFastSt) &&
	IsFastSt &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
	FirstLoadAlign, &IsFastLd) &&
	IsFastLd) {
	LastLegalIntegerType = i + 1;
	DoIntegerTruncate = false;
	// Or check whether a truncstore and extload is legal.
	} else if (TLI.getTypeAction(Context, StoreTy) ==
	TargetLowering::TypePromoteInteger) {
	EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy);
	if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy,
	StoreTy) &&
	TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
	StoreTy) &&
	TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFastSt) &&
	IsFastSt &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
	FirstLoadAlign, &IsFastLd) &&
	IsFastLd) {
	LastLegalIntegerType = i + 1;
	DoIntegerTruncate = true;
	}
	}
	}

	// Only use vector types if the vector type is larger than the integer type.
	// If they are the same, use integers.
	bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
	unsigned LastLegalType =
	std::max(LastLegalVectorType, LastLegalIntegerType);

	// We add +1 here because the LastXXX variables refer to location while
	// the NumElem refers to array/index size.
	unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
	NumElem = std::min(LastLegalType, NumElem);

	if (NumElem < 2) {
	// We know that candidate stores are in order and of correct
	// shape. While there is no mergeable sequence from the
	// beginning one may start later in the sequence. The only
	// reason a merge of size N could have failed where another of
	// the same size would not have is if the alignment of either
	// the load or store has improved. Drop as many candidates as we
	// can here.
	unsigned NumSkip = 1;
	while ((NumSkip < LoadNodes.size()) &&
	(LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
	(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
	NumSkip++;
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
	continue;
	}

	// Find if it is better to use vectors or integers to load and store
	// to memory.
	EVT JointMemOpVT;
	if (UseVectorTy) {
	// Find a legal type for the vector store.
	unsigned Elts = NumElem * NumMemElts;
	JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
	} else {
	unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
	JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
	}

	SDLoc LoadDL(LoadNodes[0].MemNode);
	SDLoc StoreDL(StoreNodes[0].MemNode);

	// The merged loads are required to have the same incoming chain, so
	// using the first's chain is acceptable.

	SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
	AddToWorklist(NewStoreChain.getNode());

	MachineMemOperand::Flags MMOFlags = isDereferenceable ?
	MachineMemOperand::MODereferenceable:
	MachineMemOperand::MONone;

	SDValue NewLoad, NewStore;
	if (UseVectorTy \|\| !DoIntegerTruncate) {
	NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
	FirstLoad->getBasePtr(),
	FirstLoad->getPointerInfo(), FirstLoadAlign,
	MMOFlags);
	NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad,
	FirstInChain->getBasePtr(),
	FirstInChain->getPointerInfo(), FirstStoreAlign);
	} else { // This must be the truncstore/extload case
	EVT ExtendedTy =
	TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
	NewLoad =
	DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(),
	FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
	JointMemOpVT, FirstLoadAlign, MMOFlags);
	NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
	FirstInChain->getBasePtr(),
	FirstInChain->getPointerInfo(), JointMemOpVT,
	FirstInChain->getAlignment(),
	FirstInChain->getMemOperand()->getFlags());
	}

	// Transfer chain users from old loads to the new load.
	for (unsigned i = 0; i < NumElem; ++i) {
	LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
	SDValue(NewLoad.getNode(), 1));
	}

	// Replace all the stores with the new store. Recursively remove the
	// corresponding value if it is no longer used.
	for (unsigned i = 0; i < NumElem; ++i) {
	SDValue Val = StoreNodes[i].MemNode->getOperand(1);
	CombineTo(StoreNodes[i].MemNode, NewStore);
	if (Val.getNode()->use_empty())
	recursivelyDeleteUnusedNodes(Val.getNode());
	}

	RV = true;
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
	}
	return RV;
	}

	/// Re-creates \p ST on top of \p BetterChain and replaces \p ST with a
	/// TokenFactor joining the original chain and the re-created store.
	/// Returns the result of CombineTo, called with AddTo == false.
	SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
	  SDLoc SL(ST);

	  // Build the same store (truncating or plain) on the better chain so it
	  // no longer depends on the old one.
	  SDValue ReplStore =
	      ST->isTruncatingStore()
	          ? DAG.getTruncStore(BetterChain, SL, ST->getValue(),
	                              ST->getBasePtr(), ST->getMemoryVT(),
	                              ST->getMemOperand())
	          : DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
	                         ST->getMemOperand());

	  // A TokenFactor over the old chain and the new store keeps both alive.
	  SDValue Token = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
	                              ST->getChain(), ReplStore);

	  // Queue the token so the old and new chains get cleaned up.
	  AddToWorklist(Token.getNode());

	  // Replace the store without adding its users to the worklist.
	  return CombineTo(ST, Token, false);
	}

	/// Rewrites a store of a floating-point constant as a store of the
	/// equivalent integer bit pattern (f32 -> i32, f64 -> i64 or two i32
	/// stores). The caller must pass a store whose value is a ConstantFPSDNode.
	/// Returns the replacement node, or an empty SDValue when nothing is done.
	SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
	  SDValue StoredVal = ST->getValue();
	  if (StoredVal.getOpcode() == ISD::TargetConstantFP)
	    return SDValue();

	  SDLoc DL(ST);
	  SDValue Chain = ST->getChain();
	  SDValue Ptr = ST->getBasePtr();
	  const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(StoredVal);

	  // NOTE: If the original store is volatile, this transform must not increase
	  // the number of stores. For example, on x86-32 an f64 can be stored in one
	  // processor operation but an i64 (which is not legal) requires two. So the
	  // transform should not be done in this case.

	  switch (CFP->getSimpleValueType(0).SimpleTy) {
	  case MVT::f16: // We don't do this for these yet.
	  case MVT::f80:
	  case MVT::f128:
	  case MVT::ppcf128:
	    return SDValue();

	  case MVT::f32: {
	    bool StoreAsI32 =
	        (isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
	        TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32);
	    if (!StoreAsI32)
	      return SDValue();

	    SDValue IntVal = DAG.getConstant(
	        (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
	        SDLoc(CFP), MVT::i32);
	    return DAG.getStore(Chain, DL, IntVal, Ptr, ST->getMemOperand());
	  }

	  case MVT::f64: {
	    bool StoreAsI64 = (TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
	                       !ST->isVolatile()) ||
	                      TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64);
	    if (StoreAsI64) {
	      SDValue IntVal = DAG.getConstant(
	          CFP->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(CFP),
	          MVT::i64);
	      return DAG.getStore(Chain, DL, IntVal, Ptr, ST->getMemOperand());
	    }

	    // Splitting doubles the number of stores, so it is never applied to a
	    // volatile store.
	    if (ST->isVolatile() ||
	        !TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32))
	      return SDValue();

	    // Many FP stores are not made apparent until after legalize, e.g. for
	    // argument passing. Since this is so common, custom legalize the
	    // 64-bit integer store into two 32-bit stores.
	    uint64_t Bits = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
	    SDValue LoHalf = DAG.getConstant(Bits & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
	    SDValue HiHalf = DAG.getConstant(Bits >> 32, SDLoc(CFP), MVT::i32);
	    // On big-endian targets the high half goes to the lower address.
	    if (DAG.getDataLayout().isBigEndian())
	      std::swap(LoHalf, HiHalf);

	    unsigned Alignment = ST->getAlignment();
	    MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	    AAMDNodes AAInfo = ST->getAAInfo();

	    SDValue FirstStore =
	        DAG.getStore(Chain, DL, LoHalf, Ptr, ST->getPointerInfo(),
	                     ST->getAlignment(), MMOFlags, AAInfo);
	    // The second half lives four bytes further on, with alignment capped
	    // accordingly.
	    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                      DAG.getConstant(4, DL, Ptr.getValueType()));
	    Alignment = MinAlign(Alignment, 4U);
	    SDValue SecondStore =
	        DAG.getStore(Chain, DL, HiHalf, Ptr,
	                     ST->getPointerInfo().getWithOffset(4), Alignment,
	                     MMOFlags, AAInfo);
	    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, FirstStore,
	                       SecondStore);
	  }

	  default:
	    llvm_unreachable("Unknown FP type");
	  }
	}

	// Combine hook for store nodes. It tries a fixed sequence of folds on N and
	// returns as soon as one applies. The folds are order-sensitive; for
	// example, store merging is attempted before FP constants are rewritten
	// as integers. Return value conventions used below:
	//  - a new node: N is to be replaced by it;
	//  - Chain: the store is dead and is replaced by its incoming chain;
	//  - SDValue(N, 0): the DAG was already updated in place (via CombineTo
	//    or a worklist re-add) and nothing more should be done with N;
	//  - an empty SDValue: no replacement for N is returned.
	SDValue DAGCombiner::visitSTORE(SDNode *N) {
	StoreSDNode *ST = cast<StoreSDNode>(N);
	SDValue Chain = ST->getChain();
	SDValue Value = ST->getValue();
	SDValue Ptr = ST->getBasePtr();

	// If this is a store of a bit convert, store the input value if the
	// resultant store does not need a higher alignment than the original.
	if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
	ST->isUnindexed()) {
	EVT SVT = Value.getOperand(0).getValueType();
	if (((!LegalOperations && !ST->isVolatile()) \|\|
	TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
	TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
	unsigned OrigAlign = ST->getAlignment();
	bool Fast = false;
	if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
	ST->getAddressSpace(), OrigAlign, &Fast) &&
	Fast) {
	return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
	ST->getPointerInfo(), OrigAlign,
	ST->getMemOperand()->getFlags(), ST->getAAInfo());
	}
	}
	}

	// Turn 'store undef, Ptr' -> nothing.
	if (Value.isUndef() && ST->isUnindexed())
	return Chain;

	// Try to infer better alignment information than the store already has.
	if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
	if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
	if (Align > ST->getAlignment()) {
	// getTruncStore is used for both truncating and plain stores here; the
	// memory VT is passed through unchanged.
	SDValue NewStore =
	DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
	ST->getMemoryVT(), Align,
	ST->getMemOperand()->getFlags(), ST->getAAInfo());
	if (NewStore.getNode() != N)
	return CombineTo(ST, NewStore, true);
	}
	}
	}

	// Try transforming a pair floating point load / store ops to integer
	// load / store ops.
	if (SDValue NewST = TransformFPLoadStorePair(N))
	return NewST;

	if (ST->isUnindexed()) {
	// Walk up chain skipping non-aliasing memory nodes, on this store and any
	// adjacent stores.
	if (findBetterNeighborChains(ST)) {
	// replaceStoreChain uses CombineTo, which handled all of the worklist
	// manipulation. Return the original node to not do anything else.
	return SDValue(ST, 0);
	}
	// Re-read the chain after the call above.
	Chain = ST->getChain();
	}

	// FIXME: is there such a thing as a truncating indexed store?
	if (ST->isTruncatingStore() && ST->isUnindexed() &&
	Value.getValueType().isInteger()) {
	// See if we can simplify the input to this truncstore with knowledge that
	// only the low bits are being used. For example:
	// "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
	SDValue Shorter = DAG.GetDemandedBits(
	Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
	ST->getMemoryVT().getScalarSizeInBits()));
	AddToWorklist(Value.getNode());
	if (Shorter.getNode())
	return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
	Ptr, ST->getMemoryVT(), ST->getMemOperand());

	// Otherwise, see if we can simplify the operation with
	// SimplifyDemandedBits, which only works if the value has a single use.
	if (SimplifyDemandedBits(
	Value,
	APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
	ST->getMemoryVT().getScalarSizeInBits()))) {
	// Re-visit the store if anything changed and the store hasn't been merged
	// with another node (N is deleted). SimplifyDemandedBits will add Value's
	// node back to the worklist if necessary, but we also need to re-visit
	// the Store node itself.
	if (N->getOpcode() != ISD::DELETED_NODE)
	AddToWorklist(N);
	return SDValue(N, 0);
	}
	}

	// If this is a load followed by a store to the same location, then the store
	// is dead/noop.
	if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
	if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
	ST->isUnindexed() && !ST->isVolatile() &&
	// There can't be any side effects between the load and store, such as
	// a call or store.
	Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
	// The store is dead, remove it.
	return Chain;
	}
	}

	// The chain is itself a store: look for redundancy between the two stores.
	if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
	if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
	!ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
	ST->getMemoryVT() == ST1->getMemoryVT()) {
	// If this is a store followed by a store with the same value to the same
	// location, then the store is dead/noop.
	if (ST1->getValue() == Value) {
	// The store is dead, remove it.
	return Chain;
	}

	// If this store's preceding store is to the same location and no
	// other node is chained to that store, we can effectively drop the
	// preceding store. Do not remove stores to undef as they may be used as
	// data sinks.
	if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
	!ST1->getBasePtr().isUndef()) {
	// ST1 is fully overwritten and can be elided. Combine with its chain
	// value.
	CombineTo(ST1, ST1->getChain());
	return SDValue();
	}
	}
	}

	// If this is an FP_ROUND or TRUNC followed by a store, fold this into a
	// truncating store. We can do this even if this is already a truncstore.
	if ((Value.getOpcode() == ISD::FP_ROUND \|\| Value.getOpcode() == ISD::TRUNCATE)
	&& Value.getNode()->hasOneUse() && ST->isUnindexed() &&
	TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
	ST->getMemoryVT())) {
	return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
	Ptr, ST->getMemoryVT(), ST->getMemOperand());
	}

	// Always perform this optimization before types are legal. If the target
	// prefers, also try this after legalization to catch stores that were created
	// by intrinsics or other nodes.
	if (!LegalTypes \|\| (TLI.mergeStoresAfterLegalization())) {
	while (true) {
	// There can be multiple store sequences on the same chain.
	// Keep trying to merge store sequences until we are unable to do so
	// or until we merge the last store on the chain.
	bool Changed = MergeConsecutiveStores(ST);
	if (!Changed) break;
	// Return N as merge only uses CombineTo and no worklist clean
	// up is necessary.
	if (N->getOpcode() == ISD::DELETED_NODE \|\| !isa<StoreSDNode>(N))
	return SDValue(N, 0);
	}
	}

	// Try transforming N to an indexed store.
	if (CombineToPreIndexedLoadStore(N) \|\| CombineToPostIndexedLoadStore(N))
	return SDValue(N, 0);

	// Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
	//
	// Make sure to do this only after attempting to merge stores in order to
	// avoid changing the types of some subset of stores due to visit order,
	// preventing their merging.
	if (isa<ConstantFPSDNode>(ST->getValue())) {
	if (SDValue NewSt = replaceStoreOfFPConstant(ST))
	return NewSt;
	}

	if (SDValue NewSt = splitMergedValStore(ST))
	return NewSt;

	// Last resort: whatever ReduceLoadOpStoreWidth returns is the result.
	return ReduceLoadOpStoreWidth(N);
	}

	/// For the instruction sequence of store below, F and I values
	/// are bundled together as an i64 value before being stored into memory.
	/// Sometimes it is more efficient to generate separate stores for F and I,
	/// which can remove the bitwise instructions or sink them to colder places.
	///
	///   (store (or (zext (bitcast F to i32) to i64),
	///              (shl (zext I to i64), 32)), addr)  -->
	///   (store F, addr) and (store I, addr+4)
	///
	/// Similarly, splitting for other merged store can also be beneficial, like:
	/// For pair of {i32, i32}, i64 store --> two i32 stores.
	/// For pair of {i32, i16}, i64 store --> two i32 stores.
	/// For pair of {i16, i16}, i32 store --> two i16 stores.
	/// For pair of {i16, i8},  i32 store --> two i16 stores.
	/// For pair of {i8, i8},   i16 store --> two i8 stores.
	///
	/// We allow each target to determine specifically which kind of splitting is
	/// supported.
	///
	/// The store patterns are commonly seen from the simple code snippet below
	/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
	///   void goo(const std::pair<int, float> &);
	///   hoo() {
	///     ...
	///     goo(std::make_pair(tmp, ftmp));
	///     ...
	///   }
	///
	/// \param ST the store to examine.
	/// \returns the second (high-half) store, which is chained on the first
	/// (low-half) store, or an empty SDValue if the pattern does not match or
	/// the split is not safe/profitable.
	SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
	  if (OptLevel == CodeGenOpt::None)
	    return SDValue();

	  // Splitting turns one memory access into two plain stores that together
	  // cover the full width of the stored value. That is only valid when:
	  //  - the store is not volatile (the number of accesses must not change),
	  //  - the store is unindexed (an indexed store produces an extra result
	  //    that the replacement would not provide), and
	  //  - the store is not truncating (otherwise the two halves would write
	  //    more bytes than the original store did).
	  if (ST->isVolatile() || !ST->isUnindexed() || ST->isTruncatingStore())
	    return SDValue();

	  SDValue Val = ST->getValue();
	  SDLoc DL(ST);

	  // Match OR operand.
	  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
	    return SDValue();

	  // Match SHL operand and get Lower and Higher parts of Val.
	  SDValue Op1 = Val.getOperand(0);
	  SDValue Op2 = Val.getOperand(1);
	  SDValue Lo, Hi;
	  if (Op1.getOpcode() != ISD::SHL) {
	    std::swap(Op1, Op2);
	    if (Op1.getOpcode() != ISD::SHL)
	      return SDValue();
	  }
	  Lo = Op2;
	  Hi = Op1.getOperand(0);
	  if (!Op1.hasOneUse())
	    return SDValue();

	  // Match shift amount to HalfValBitSize.
	  unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
	  // The high half is addressed with a byte offset, so each half must be a
	  // whole number of bytes.
	  if (HalfValBitSize == 0 || HalfValBitSize % 8 != 0)
	    return SDValue();
	  unsigned HalfValByteSize = HalfValBitSize / 8;

	  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
	  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
	    return SDValue();

	  // Lo and Hi must each be a zero-extension, with a single use, of a scalar
	  // integer no wider than half of the stored value.
	  if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
	      !Lo.getOperand(0).getValueType().isScalarInteger() ||
	      Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
	      Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
	      !Hi.getOperand(0).getValueType().isScalarInteger() ||
	      Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
	    return SDValue();

	  // Use the EVT of low and high parts before bitcast as the input
	  // of target query.
	  EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
	                  ? Lo.getOperand(0).getValueType()
	                  : Lo.getValueType();
	  EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
	                   ? Hi.getOperand(0).getValueType()
	                   : Hi.getValueType();
	  if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
	    return SDValue();

	  // Start to split store.
	  unsigned Alignment = ST->getAlignment();
	  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	  AAMDNodes AAInfo = ST->getAAInfo();

	  // Change the sizes of Lo and Hi's value types to HalfValBitSize.
	  EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
	  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
	  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));

	  SDValue Chain = ST->getChain();
	  SDValue Ptr = ST->getBasePtr();
	  // Lower value store: same address and alignment as the original store.
	  SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
	                             Alignment, MMOFlags, AAInfo);
	  Ptr =
	      DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                  DAG.getConstant(HalfValByteSize, DL, Ptr.getValueType()));
	  // Higher value store. Its address is the original address plus
	  // HalfValByteSize, so the alignment that can be guaranteed is
	  // MinAlign(Alignment, HalfValByteSize). Using "Alignment / 2" would be
	  // wrong: it yields 0 for a 1-byte-aligned store (which getStore treats as
	  // "use the ABI alignment") and over-claims whenever Alignment is larger
	  // than twice the half size.
	  unsigned HiAlignment = MinAlign(Alignment, HalfValByteSize);
	  SDValue St1 =
	      DAG.getStore(St0, DL, Hi, Ptr,
	                   ST->getPointerInfo().getWithOffset(HalfValByteSize),
	                   HiAlignment, MMOFlags, AAInfo);
	  return St1;
	}

	/// Turn an element insertion whose inserted scalar is really a bitcast
	/// vector into a vector shuffle:
	///   insert_vector_elt V, (bitcast X from vector type), IdxC -->
	///   bitcast(shuffle (bitcast V), (extended X), Mask)
	/// An insert_subvector node is deliberately not used, because that would
	/// require the subvector type to be legal.
	///
	/// \param N the insert_vector_elt node.
	/// \param InsIndex the constant element index being inserted at.
	/// \returns the replacement value, or an empty SDValue if the pattern does
	/// not match or the target rejects the shuffle mask.
	SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
	  SDValue Inserted = N->getOperand(1);

	  // The inserted value must be a single-use bitcast of a vector.
	  if (Inserted.getOpcode() != ISD::BITCAST)
	    return SDValue();
	  if (!Inserted.hasOneUse())
	    return SDValue();
	  if (!Inserted.getOperand(0).getValueType().isVector())
	    return SDValue();

	  SDValue SrcVec = Inserted.getOperand(0);
	  SDValue DstVec = N->getOperand(0);
	  EVT SrcVT = SrcVec.getValueType();
	  EVT DstVT = DstVec.getValueType();
	  unsigned SrcNumElts = SrcVT.getVectorNumElements();
	  // How many source-sized vectors fit in the destination vector.
	  unsigned Ratio = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
	  unsigned MaskLen = Ratio * SrcNumElts;

	  // Step 1: Build the shuffle mask for the insertion. Operand 0 of the
	  // shuffle is the vector being inserted into, so untouched lanes select
	  // themselves. The inserted subvector occupies the leading lanes of
	  // operand 1, which are numbered from MaskLen onwards. Example:
	  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
	  SmallVector<int, 16> Mask(MaskLen);
	  for (unsigned Lane = 0; Lane != MaskLen; ++Lane) {
	    bool InInsertedChunk = (Lane / SrcNumElts == InsIndex);
	    Mask[Lane] = InInsertedChunk ? (Lane % SrcNumElts) + MaskLen : Lane;
	  }

	  // Give up if the target cannot handle the shuffle we want to create.
	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, MaskLen);
	  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
	    return SDValue();

	  // Step 2: Widen the inserted source vector to the size of the destination
	  // by appending undefined elements.
	  SDLoc DL(N);
	  SmallVector<SDValue, 8> Pieces(Ratio, DAG.getUNDEF(SrcVT));
	  Pieces[0] = SrcVec;
	  SDValue WideSrc = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, Pieces);

	  // Step 3: Shuffle the widened subvector into the (bitcast) destination.
	  SDValue DstAsShufVT = DAG.getBitcast(ShufVT, DstVec);
	  SDValue Shuffled =
	      DAG.getVectorShuffle(ShufVT, DL, DstAsShufVT, WideSrc, Mask);
	  AddToWorklist(WideSrc.getNode());
	  AddToWorklist(DstAsShufVT.getNode());
	  AddToWorklist(Shuffled.getNode());
	  return DAG.getBitcast(DstVT, Shuffled);
	}

	/// Combine an INSERT_VECTOR_ELT node.
	///
	/// Folds attempted, in order: inserting undef, re-inserting an element just
	/// extracted from the same position, the bitcast-subvector-to-shuffle
	/// rewrite, reordering of two stacked constant-index inserts, and merging
	/// the insert into a BUILD_VECTOR (or undef) input. Returns an empty
	/// SDValue when nothing applies.
	SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
	  SDLoc DL(N);
	  SDValue BaseVec = N->getOperand(0);
	  SDValue NewElt = N->getOperand(1);
	  SDValue Index = N->getOperand(2);

	  // Inserting UNDEF leaves the input vector as it was.
	  if (NewElt.isUndef())
	    return BaseVec;

	  EVT VT = BaseVec.getValueType();

	  // Writing back a lane that was read from the very same vector and index
	  // changes nothing:
	  //   (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
	  if (NewElt.getOpcode() == ISD::EXTRACT_VECTOR_ELT)
	    if (BaseVec == NewElt.getOperand(0) && Index == NewElt.getOperand(1))
	      return BaseVec;

	  // Everything from here on requires a constant insertion index.
	  auto *ConstIndex = dyn_cast<ConstantSDNode>(Index);
	  if (!ConstIndex)
	    return SDValue();
	  unsigned Lane = ConstIndex->getZExtValue();

	  if (SDValue AsShuffle = combineInsertEltToShuffle(N, Lane))
	    return AsShuffle;

	  // Put chained inserts into a canonical order:
	  //   (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
	  //   -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
	  // This applies only when the inner insert has a single use, both indices
	  // are constants, and Idx1 < Idx0.
	  bool InnerIsConstInsert = BaseVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
	                            BaseVec.hasOneUse() &&
	                            isa<ConstantSDNode>(BaseVec.getOperand(2));
	  if (InnerIsConstInsert) {
	    unsigned InnerLane = BaseVec.getConstantOperandVal(2);
	    if (Lane < InnerLane) {
	      // Perform this node's insert first, then replay the inner one on top.
	      SDValue Hoisted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
	                                    BaseVec.getOperand(0), NewElt, Index);
	      AddToWorklist(Hoisted.getNode());
	      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(BaseVec.getNode()), VT,
	                         Hoisted, BaseVec.getOperand(1),
	                         BaseVec.getOperand(2));
	    }
	  }

	  // Nothing more to do if a legal BUILD_VECTOR cannot be produced.
	  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
	    return SDValue();

	  // Collect the lanes of the input, which must be a BUILD_VECTOR or UNDEF
	  // (UNDEF is treated as a BUILD_VECTOR of undef lanes). A BUILD_VECTOR with
	  // other users is left alone because the new vector would not replace it.
	  SmallVector<SDValue, 8> Lanes;
	  if (BaseVec.getOpcode() == ISD::BUILD_VECTOR && BaseVec.hasOneUse()) {
	    SDNode *BV = BaseVec.getNode();
	    Lanes.append(BV->op_begin(), BV->op_end());
	  } else if (BaseVec.isUndef()) {
	    unsigned LaneCount = VT.getVectorNumElements();
	    Lanes.append(LaneCount, DAG.getUNDEF(NewElt.getValueType()));
	  } else {
	    return SDValue();
	  }

	  // Overwrite the addressed lane; an out-of-range index leaves Lanes as is.
	  if (Lane < Lanes.size()) {
	    // BUILD_VECTOR operands must all share one type, so an integer value is
	    // any-extended or truncated to the type of the existing operands.
	    EVT LaneVT = Lanes[0].getValueType();
	    if (LaneVT.isInteger())
	      Lanes[Lane] = DAG.getAnyExtOrTrunc(NewElt, DL, LaneVT);
	    else
	      Lanes[Lane] = NewElt;
	  }

	  // Emit the merged vector.
	  return DAG.getBuildVector(VT, DL, Lanes);
	}

	/// Replace an extract_vector_elt of a vector load with a scalar load of just
	/// the addressed element.
	///
	/// EVE is the EXTRACT_VECTOR_ELT node, InVecVT the vector type the element is
	/// extracted from, EltNo the (constant or variable) element index, and
	/// OriginalLoad the non-volatile vector load feeding the extract.
	///
	/// On success all uses of EVE's value and of OriginalLoad's chain are
	/// rewritten to the new load and SDValue(EVE, 0) is returned. An empty
	/// SDValue is returned when the narrowed load would be under-aligned, when a
	/// load of the element type is neither legal nor custom, or when the target
	/// declines to reduce the load width.
	SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
	    SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
	  assert(!OriginalLoad->isVolatile());

	  EVT ResultVT = EVE->getValueType(0);
	  EVT VecEltVT = InVecVT.getVectorElementType();
	  unsigned Align = OriginalLoad->getAlignment();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      VecEltVT.getTypeForEVT(*DAG.getContext()));

	  // The element's ABI alignment must not exceed the alignment the original
	  // load guarantees.
	  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
	    return SDValue();

	  // NOTE(review): this passes NON_EXTLOAD when the result is wider than the
	  // element, although an extending load is emitted for that case below.
	  // Confirm the intended hint for shouldReduceLoadWidth.
	  ISD::LoadExtType ExtTy =
	      ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
	  if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
	    return SDValue();

	  Align = NewAlign;

	  // Compute the address of the selected element.
	  SDValue NewPtr = OriginalLoad->getBasePtr();
	  SDValue Offset;
	  EVT PtrType = NewPtr.getValueType();
	  MachinePointerInfo MPI;
	  SDLoc DL(EVE);
	  if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
	    // Constant index: fold the byte offset into the pointer info as well.
	    int Elt = ConstEltNo->getZExtValue();
	    unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
	    Offset = DAG.getConstant(PtrOff, DL, PtrType);
	    MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
	  } else {
	    // Variable index: offset = index * element store size.
	    Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
	    Offset = DAG.getNode(
	        ISD::MUL, DL, PtrType, Offset,
	        DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
	    // NOTE(review): the original pointer info (with its original offset) is
	    // reused even though the accessed offset is not known here. Confirm
	    // this is acceptable for alias analysis.
	    MPI = OriginalLoad->getPointerInfo();
	  }
	  NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset);

	  // The replacement we need to do here is a little tricky: we need to
	  // replace an extractelement of a load with a load.
	  // Use ReplaceAllUsesOfValuesWith to do the replacement.
	  // Note that this replacement assumes that the extractvalue is the only
	  // use of the load; that's okay because we don't want to perform this
	  // transformation in other cases anyway.
	  SDValue Load;
	  SDValue Chain;
	  if (ResultVT.bitsGT(VecEltVT)) {
	    // If the result type of vextract is wider than the load, then issue an
	    // extending load instead.
	    ISD::LoadExtType ExtType =
	        TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD
	                                                              : ISD::EXTLOAD;
	    Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
	                          OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
	                          Align, OriginalLoad->getMemOperand()->getFlags(),
	                          OriginalLoad->getAAInfo());
	    Chain = Load.getValue(1);
	  } else {
	    Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
	                       MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
	                       OriginalLoad->getAAInfo());
	    // Capture the chain before Load is wrapped in a truncate/bitcast.
	    Chain = Load.getValue(1);
	    if (ResultVT.bitsLT(VecEltVT))
	      Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
	    else
	      Load = DAG.getBitcast(ResultVT, Load);
	  }
	  WorklistRemover DeadNodes(*this);
	  // Redirect both the extracted value and the old load's chain result.
	  SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
	  SDValue To[] = { Load, Chain };
	  DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
	  // Since we're explicitly calling ReplaceAllUses, add the new node to the
	  // worklist explicitly as well.
	  AddToWorklist(Load.getNode());
	  AddUsersToWorklist(Load.getNode()); // Add users too
	  // Make sure to revisit this node to clean it up; it will usually be dead.
	  AddToWorklist(EVE);
	  ++OpsNarrowed;
	  return SDValue(EVE, 0);
	}

	/// Combine an EXTRACT_VECTOR_ELT node.
	///
	/// Tries to forward the scalar directly from a SCALAR_TO_VECTOR,
	/// BUILD_VECTOR, scalar BITCAST, INSERT_VECTOR_ELT or VECTOR_SHUFFLE input,
	/// and otherwise to narrow a vector load into a scalar load of the selected
	/// element. Returns an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
	  // (vextract (scalar_to_vector val, 0) -> val
	  SDValue InVec = N->getOperand(0);
	  EVT VT = InVec.getValueType();
	  EVT NVT = N->getValueType(0);

	  if (InVec.isUndef())
	    return DAG.getUNDEF(NVT);

	  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	    // Check if the result type doesn't match the inserted element type. A
	    // SCALAR_TO_VECTOR may truncate the inserted element and the
	    // EXTRACT_VECTOR_ELT may widen the extracted vector.
	    SDValue InOp = InVec.getOperand(0);
	    if (InOp.getValueType() != NVT) {
	      assert(InOp.getValueType().isInteger() && NVT.isInteger());
	      return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT);
	    }
	    return InOp;
	  }

	  SDValue EltNo = N->getOperand(1);
	  ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);

	  // extract_vector_elt (build_vector x, y), 1 -> y
	  if (ConstEltNo &&
	      InVec.getOpcode() == ISD::BUILD_VECTOR &&
	      TLI.isTypeLegal(VT) &&
	      (InVec.hasOneUse() ||
	       TLI.aggressivelyPreferBuildVectorSources(VT))) {
	    SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue());
	    EVT InEltVT = Elt.getValueType();

	    // Sometimes build_vector's scalar input types do not match result type.
	    if (NVT == InEltVT)
	      return Elt;

	    // TODO: It may be useful to truncate if free if the build_vector implicitly
	    // converts.
	  }

	  // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
	  // EltTrunc is the lane holding the low bits of the scalar: lane 0 on
	  // little-endian targets, the last lane on big-endian ones.
	  bool isLE = DAG.getDataLayout().isLittleEndian();
	  unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
	  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
	      ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
	    SDValue BCSrc = InVec.getOperand(0);
	    if (BCSrc.getValueType().isScalarInteger())
	      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
	  }

	  // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
	  //
	  // This only really matters if the index is non-constant since other combines
	  // on the constant elements already work.
	  if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
	      EltNo == InVec.getOperand(2)) {
	    SDValue Elt = InVec.getOperand(1);
	    return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt;
	  }

	  // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
	  // We only perform this optimization before the op legalization phase because
	  // we may introduce new vector instructions which are not backed by TD
	  // patterns. For example on AVX, extracting elements from a wide vector
	  // without using extract_subvector. However, if we can find an underlying
	  // scalar value, then we can always use that.
	  if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    int NumElem = VT.getVectorNumElements();
	    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
	    // Find the new index to extract from.
	    int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue());

	    // Extracting an undef index is undef.
	    if (OrigElt == -1)
	      return DAG.getUNDEF(NVT);

	    // Select the right vector half to extract from.
	    SDValue SVInVec;
	    if (OrigElt < NumElem) {
	      SVInVec = InVec->getOperand(0);
	    } else {
	      SVInVec = InVec->getOperand(1);
	      OrigElt -= NumElem;
	    }

	    if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
	      SDValue InOp = SVInVec.getOperand(OrigElt);
	      if (InOp.getValueType() != NVT) {
	        assert(InOp.getValueType().isInteger() && NVT.isInteger());
	        InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT);
	      }

	      return InOp;
	    }

	    // FIXME: We should handle recursing on other vector shuffles and
	    // scalar_to_vector here as well.

	    if (!LegalOperations ||
	        // FIXME: Should really be just isOperationLegalOrCustom.
	        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) ||
	        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {
	      EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
	                         DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
	    }
	  }

	  bool BCNumEltsChanged = false;
	  EVT ExtVT = VT.getVectorElementType();
	  EVT LVT = ExtVT;

	  // If the result of load has to be truncated, then it's not necessarily
	  // profitable.
	  if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
	    return SDValue();

	  // Look through a single-use vector-to-vector bitcast to find a load.
	  if (InVec.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!InVec.hasOneUse())
	      return SDValue();

	    EVT BCVT = InVec.getOperand(0).getValueType();
	    if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
	      return SDValue();
	    if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
	      BCNumEltsChanged = true;
	    InVec = InVec.getOperand(0);
	    ExtVT = BCVT.getVectorElementType();
	  }

	  // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
	  // The hasPredecessor check rejects an index that itself depends on the
	  // load.
	  if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
	      ISD::isNormalLoad(InVec.getNode()) &&
	      !N->getOperand(1)->hasPredecessor(InVec.getNode())) {
	    SDValue Index = N->getOperand(1);
	    if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) {
	      if (!OrigLoad->isVolatile()) {
	        return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
	                                                             OrigLoad);
	      }
	    }
	  }

	  // Perform only after legalization to ensure build_vector / vector_shuffle
	  // optimizations have already been done.
	  if (!LegalOperations) return SDValue();

	  // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
	  // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
	  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)

	  if (ConstEltNo) {
	    int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

	    LoadSDNode *LN0 = nullptr;
	    const ShuffleVectorSDNode *SVN = nullptr;
	    if (ISD::isNormalLoad(InVec.getNode())) {
	      LN0 = cast<LoadSDNode>(InVec);
	    } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	               InVec.getOperand(0).getValueType() == ExtVT &&
	               ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
	      // Don't duplicate a load with other uses.
	      if (!InVec.hasOneUse())
	        return SDValue();

	      LN0 = cast<LoadSDNode>(InVec.getOperand(0));
	    } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
	      // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
	      // =>
	      // (load $addr+1*size)

	      // Don't duplicate a load with other uses.
	      if (!InVec.hasOneUse())
	        return SDValue();

	      // If the bit convert changed the number of elements, it is unsafe
	      // to examine the mask.
	      if (BCNumEltsChanged)
	        return SDValue();

	      // Select the input vector, guarding against out of range extract vector.
	      // Valid mask indices are [0, NumElems), so Elt == NumElems must be
	      // rejected as well; it is treated like an undef mask entry.
	      unsigned NumElems = VT.getVectorNumElements();
	      int Idx = (Elt >= (int)NumElems) ? -1 : SVN->getMaskElt(Elt);
	      InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);

	      if (InVec.getOpcode() == ISD::BITCAST) {
	        // Don't duplicate a load with other uses.
	        if (!InVec.hasOneUse())
	          return SDValue();

	        InVec = InVec.getOperand(0);
	      }
	      if (ISD::isNormalLoad(InVec.getNode())) {
	        LN0 = cast<LoadSDNode>(InVec);
	        // Rebase the index onto whichever shuffle operand was selected.
	        Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
	        EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType());
	      }
	    }

	    // Make sure we found a non-volatile load and the extractelement is
	    // the only use.
	    if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
	      return SDValue();

	    // If Idx was -1 above, Elt is going to be -1, so just return undef.
	    // The undef must carry this node's result type (NVT), which may differ
	    // from the vector element type.
	    if (Elt == -1)
	      return DAG.getUNDEF(NVT);

	    return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0);
	  }

	  return SDValue();
	}

	// Simplify (build_vec (ext )) to (bitcast (build_vec ))
	//
	// When every defined operand of BUILD_VECTOR N is an ANY_EXTEND or
	// ZERO_EXTEND from one common source type, build a vector of the narrow
	// source values instead (padding each widened lane with undef or zero) and
	// bitcast it back to N's type. Returns an empty SDValue if the pattern does
	// not match, the sizes are not powers of two, or the narrow vector type is
	// not legal.
	SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
	  // We perform this optimization post type-legalization because
	  // the type-legalizer often scalarizes integer-promoted vectors.
	  // Performing this optimization before may create bit-casts which
	  // will be type-legalized to complex code sequences.
	  // We perform this optimization only before the operation legalizer because we
	  // may introduce illegal operations.
	  if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
	    return SDValue();

	  unsigned NumInScalars = N->getNumOperands();
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  // Check to see if this is a BUILD_VECTOR of a bunch of values
	  // which come from any_extend or zero_extend nodes. If so, we can create
	  // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
	  // optimizations. We do not handle sign-extend because we can't fill the sign
	  // using shuffles.
	  EVT SourceType = MVT::Other;
	  bool AllAnyExt = true;

	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue In = N->getOperand(i);
	    // Ignore undef inputs.
	    if (In.isUndef())
	      continue;

	    unsigned Opc = In.getOpcode();
	    bool AnyExt = Opc == ISD::ANY_EXTEND;

	    // Give up if the element is not an any/zero extension.
	    if (!AnyExt && Opc != ISD::ZERO_EXTEND)
	      return SDValue();

	    // All extensions must widen from the same source type.
	    EVT InTy = In.getOperand(0).getValueType();
	    if (SourceType == MVT::Other)
	      SourceType = InTy; // First defined operand fixes the source type.
	    else if (InTy != SourceType)
	      return SDValue();

	    // Track whether every extension is an ANY_EXTEND.
	    AllAnyExt &= AnyExt;
	  }

	  // In order to have valid types, all of the inputs must be extended from the
	  // same source type and all of the inputs must be any or zero extend.
	  // Scalar sizes must be a power of two. SourceType is still MVT::Other here
	  // if no operand supplied a source type.
	  EVT OutScalarTy = VT.getScalarType();
	  if (SourceType == MVT::Other ||
	      !isPowerOf2_32(OutScalarTy.getSizeInBits()) ||
	      !isPowerOf2_32(SourceType.getSizeInBits()))
	    return SDValue();

	  // Create a new simpler BUILD_VECTOR sequence which other optimizations can
	  // turn into a single shuffle instruction.
	  bool isLE = DAG.getDataLayout().isLittleEndian();
	  unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
	  assert(ElemRatio > 1 && "Invalid element size ratio");
	  // Any-extended lanes may be padded with undef; as soon as one zero
	  // extension is present the padding must be zero.
	  SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType)
	                             : DAG.getConstant(0, DL, SourceType);

	  unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
	  SmallVector<SDValue, 8> Ops(NewBVElems, Filler);

	  // Populate the new build_vector
	  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	    SDValue Cast = N->getOperand(i);
	    assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
	            Cast.getOpcode() == ISD::ZERO_EXTEND ||
	            Cast.isUndef()) && "Invalid cast opcode");
	    SDValue In;
	    if (Cast.isUndef())
	      In = DAG.getUNDEF(SourceType);
	    else
	      In = Cast->getOperand(0);
	    // The narrow value occupies the first narrow lane of its group on
	    // little-endian targets and the last one on big-endian targets.
	    unsigned Index = isLE ? (i * ElemRatio)
	                          : (i * ElemRatio + (ElemRatio - 1));

	    assert(Index < Ops.size() && "Invalid index");
	    Ops[Index] = In;
	  }

	  // The type of the new BUILD_VECTOR node.
	  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
	  assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
	         "Invalid vector size");
	  // Check if the new vector type is legal.
	  if (!isTypeLegal(VecVT))
	    return SDValue();

	  // Make the new BUILD_VECTOR.
	  SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);

	  // The new BUILD_VECTOR node has the potential to be further optimized.
	  AddToWorklist(BV.getNode());
	  // Bitcast to the desired type.
	  return DAG.getBitcast(VT, BV);
	}

	/// Turn a BUILD_VECTOR of scalar integer-to-FP conversions into a single
	/// vector conversion of a BUILD_VECTOR of the integer sources.
	///
	/// All defined operands of N must use the same opcode (UINT_TO_FP or
	/// SINT_TO_FP) and convert from the same source type, and at least two
	/// operands must be defined. Returns an empty SDValue when the pattern does
	/// not match, or when the vector conversion or the integer vector type is
	/// not supported by the target.
	SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
	  EVT VT = N->getValueType(0);

	  unsigned NumInScalars = N->getNumOperands();
	  SDLoc DL(N);

	  EVT SrcVT = MVT::Other;
	  // DELETED_NODE serves as "no conversion opcode seen yet".
	  unsigned Opcode = ISD::DELETED_NODE;
	  unsigned NumDefs = 0;

	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue In = N->getOperand(i);
	    unsigned Opc = In.getOpcode();

	    // Undef operands neither constrain nor count towards the pattern.
	    if (Opc == ISD::UNDEF)
	      continue;

	    // If all scalar values are floats and converted from integers.
	    // The first conversion seen determines the opcode all others must use.
	    if (Opcode == ISD::DELETED_NODE &&
	        (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
	      Opcode = Opc;
	    }

	    if (Opc != Opcode)
	      return SDValue();

	    EVT InVT = In.getOperand(0).getValueType();

	    // If all scalar values are typed differently, bail out. It's chosen to
	    // simplify BUILD_VECTOR of integer types.
	    if (SrcVT == MVT::Other)
	      SrcVT = InVT;
	    if (SrcVT != InVT)
	      return SDValue();
	    NumDefs++;
	  }

	  // If the vector has just one element defined, it's not worth to fold it into
	  // a vectorized one.
	  if (NumDefs < 2)
	    return SDValue();

	  assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
	         && "Should only handle conversion from integer to float.");
	  assert(SrcVT != MVT::Other && "Cannot determine source type!");

	  EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);

	  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
	    return SDValue();

	  // Just because the floating-point vector type is legal does not necessarily
	  // mean that the corresponding integer vector type is.
	  if (!isTypeLegal(NVT))
	    return SDValue();

	  // Gather the pre-conversion integer operands, keeping undef lanes undef.
	  SmallVector<SDValue, 8> Opnds;
	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue In = N->getOperand(i);

	    if (In.isUndef())
	      Opnds.push_back(DAG.getUNDEF(SrcVT));
	    else
	      Opnds.push_back(In.getOperand(0));
	  }
	  SDValue BV = DAG.getBuildVector(NVT, DL, Opnds);
	  AddToWorklist(BV.getNode());

	  // Apply the conversion once to the whole integer vector.
	  return DAG.getNode(Opcode, DL, VT, BV);
	}

	/// Build one shuffle that places elements extracted from VecIn1 / VecIn2
	/// into the lanes of BUILD_VECTOR node N.
	///
	/// VectorMask holds, per lane of N, the number of the input vector that lane
	/// comes from; lanes whose entry equals LeftIdx are taken from VecIn1 and
	/// lanes whose entry equals LeftIdx + 1 from VecIn2. All other lanes are left
	/// undef in the generated mask. VecIn2 may be an empty SDValue.
	///
	/// If the input types differ from N's type they are concatenated, split or
	/// padded first; an empty SDValue is returned when that is not possible.
	SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
	ArrayRef<int> VectorMask,
	SDValue VecIn1, SDValue VecIn2,
	unsigned LeftIdx) {
	MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);

	EVT VT = N->getValueType(0);
	EVT InVT1 = VecIn1.getValueType();
	// With no second input, treat its type as equal to the first input's type.
	EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;

	// Amount added to an extract index to address the second shuffle operand.
	unsigned Vec2Offset = 0;
	unsigned NumElems = VT.getVectorNumElements();
	unsigned ShuffleNumElems = NumElems;

	// In case both the input vectors are extracted from same base
	// vector we do not need extra addend (Vec2Offset) while
	// computing shuffle mask.
	// NOTE(review): this test only compares the EXTRACT_SUBVECTOR base operands,
	// not the subvector indices - confirm it cannot match inputs that were not
	// produced by splitting a single source vector.
	if (!VecIn2 \|\| !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) \|\|
	!(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) \|\|
	!(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
	Vec2Offset = InVT1.getVectorNumElements();

	// We can't generate a shuffle node with mismatched input and output types.
	// Try to make the types match the type of the output.
	if (InVT1 != VT \|\| InVT2 != VT) {
	if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) {
	// If the output vector length is a multiple of both input lengths,
	// we can concatenate them and pad the rest with undefs.
	unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits();
	assert(NumConcats >= 2 && "Concat needs at least two inputs!");
	SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
	ConcatOps[0] = VecIn1;
	ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
	VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
	// Both inputs now live in VecIn1; the second shuffle operand is unused.
	VecIn2 = SDValue();
	} else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
	if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
	return SDValue();

	if (!VecIn2.getNode()) {
	// If we only have one input vector, and it's twice the size of the
	// output, split it in two.
	VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
	DAG.getConstant(NumElems, DL, IdxTy));
	VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
	// Since we now have shorter input vectors, adjust the offset of the
	// second vector's start.
	Vec2Offset = NumElems;
	} else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
	// VecIn1 is wider than the output, and we have another, possibly
	// smaller input. Pad the smaller input with undefs, shuffle at the
	// input vector width, and extract the output.
	// The shuffle type is different than VT, so check legality again.
	if (LegalOperations &&
	!TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
	return SDValue();

	// Legalizing INSERT_SUBVECTOR is tricky - you basically have to
	// lower it back into a BUILD_VECTOR. So if the inserted type is
	// illegal, don't even try.
	if (InVT1 != InVT2) {
	if (!TLI.isTypeLegal(InVT2))
	return SDValue();
	VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
	DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
	}
	// The shuffle is performed at the wider input type.
	ShuffleNumElems = NumElems * 2;
	} else {
	// Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
	// than VecIn1. We can't handle this for now - this case will disappear
	// when we start sorting the vectors by type.
	return SDValue();
	}
	} else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() &&
	InVT1.getSizeInBits() == VT.getSizeInBits()) {
	// VecIn1 already has the output size and VecIn2 is half of it: widen
	// VecIn2 by concatenating it with undef.
	SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
	ConcatOps[0] = VecIn2;
	VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
	} else {
	// TODO: Support cases where the length mismatch isn't exactly by a
	// factor of 2.
	// TODO: Move this check upwards, so that if we have bad type
	// mismatches, we don't create any DAG nodes.
	return SDValue();
	}
	}

	// Initialize mask to undef.
	SmallVector<int, 8> Mask(ShuffleNumElems, -1);

	// Only need to run up to the number of elements actually used, not the
	// total number of elements in the shuffle - if we are shuffling a wider
	// vector, the high lanes should be set to undef.
	for (unsigned i = 0; i != NumElems; ++i) {
	// Entries <= 0 do not refer to an input vector (see the caller's encoding).
	if (VectorMask[i] <= 0)
	continue;

	// Operand i of N is an extract with a constant index (operand 1).
	unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
	if (VectorMask[i] == (int)LeftIdx) {
	Mask[i] = ExtIndex;
	} else if (VectorMask[i] == (int)LeftIdx + 1) {
	Mask[i] = Vec2Offset + ExtIndex;
	}
	}

	// The type of the input vectors may have changed above.
	InVT1 = VecIn1.getValueType();

	// If we already have a VecIn2, it should have the same type as VecIn1.
	// If we don't, get an undef/zero vector of the appropriate type.
	VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
	assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");

	SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
	// When shuffling at a wider type, the result is the low subvector.
	if (ShuffleNumElems > NumElems)
	Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);

	return Shuffle;
	}

	// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
	// operations. If the types of the vectors we're extracting from allow it,
	// turn this into a vector_shuffle node.
	//
	// Operands may also be undef or zero constants. Returns the shuffle (or
	// blend tree of shuffles) that replaces N, or an empty SDValue if N does not
	// match or a required shuffle cannot be created.
	SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
	SDLoc DL(N);
	EVT VT = N->getValueType(0);

	// Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
	if (!isTypeLegal(VT))
	return SDValue();

	// May only combine to shuffle after legalize if shuffle is legal.
	if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
	return SDValue();

	bool UsesZeroVector = false;
	unsigned NumElems = N->getNumOperands();

	// Record, for each element of the newly built vector, which input vector
	// that element comes from. -1 stands for undef, 0 for the zero vector,
	// and positive values for the input vectors.
	// VectorMask maps each element to its vector number, and VecIn maps vector
	// numbers to their initial SDValues.

	SmallVector<int, 8> VectorMask(NumElems, -1);
	SmallVector<SDValue, 8> VecIn;
	// Slot 0 is a placeholder so that real input vectors are numbered from 1.
	VecIn.push_back(SDValue());

	for (unsigned i = 0; i != NumElems; ++i) {
	SDValue Op = N->getOperand(i);

	if (Op.isUndef())
	continue;

	// See if we can use a blend with a zero vector.
	// TODO: Should we generalize this to a blend with an arbitrary constant
	// vector?
	if (isNullConstant(Op) \|\| isNullFPConstant(Op)) {
	UsesZeroVector = true;
	VectorMask[i] = 0;
	continue;
	}

	// Not an undef or zero. If the input is something other than an
	// EXTRACT_VECTOR_ELT with a constant index, bail out.
	if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op.getOperand(1)))
	return SDValue();
	SDValue ExtractedFromVec = Op.getOperand(0);

	// All inputs must have the same element type as the output.
	if (VT.getVectorElementType() !=
	ExtractedFromVec.getValueType().getVectorElementType())
	return SDValue();

	// Have we seen this input vector before?
	// The vectors are expected to be tiny (usually 1 or 2 elements), so using
	// a map back from SDValues to numbers isn't worth it.
	unsigned Idx = std::distance(
	VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec));
	if (Idx == VecIn.size())
	VecIn.push_back(ExtractedFromVec);

	VectorMask[i] = Idx;
	}

	// If we didn't find at least one input vector, bail out.
	if (VecIn.size() < 2)
	return SDValue();

	// If all the Operands of BUILD_VECTOR extract from same
	// vector, then split the vector efficiently based on the maximum
	// vector access index and adjust the VectorMask and
	// VecIn accordingly.
	if (VecIn.size() == 2) {
	unsigned MaxIndex = 0;
	unsigned NearestPow2 = 0;
	SDValue Vec = VecIn.back();
	EVT InVT = Vec.getValueType();
	MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	// Extract index used by each lane of N (0 for undef/zero lanes).
	SmallVector<unsigned, 8> IndexVec(NumElems, 0);

	for (unsigned i = 0; i < NumElems; i++) {
	if (VectorMask[i] <= 0)
	continue;
	unsigned Index = N->getOperand(i).getConstantOperandVal(1);
	IndexVec[i] = Index;
	MaxIndex = std::max(MaxIndex, Index);
	}

	NearestPow2 = PowerOf2Ceil(MaxIndex);
	if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
	NumElems * 2 < NearestPow2) {
	unsigned SplitSize = NearestPow2 / 2;
	EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
	InVT.getVectorElementType(), SplitSize);
	if (TLI.isTypeLegal(SplitVT)) {
	// Replace the single source by its low and high SplitSize-element parts.
	SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
	DAG.getConstant(SplitSize, DL, IdxTy));
	SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
	DAG.getConstant(0, DL, IdxTy));
	VecIn.pop_back();
	VecIn.push_back(VecIn1);
	VecIn.push_back(VecIn2);

	// Renumber each lane to the part (1 = low, 2 = high) its index falls in.
	for (unsigned i = 0; i < NumElems; i++) {
	if (VectorMask[i] <= 0)
	continue;
	VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
	}
	}
	}
	}

	// TODO: We want to sort the vectors by descending length, so that adjacent
	// pairs have similar length, and the longer vector is always first in the
	// pair.

	// TODO: Should this fire if some of the input vectors has illegal type (like
	// it does now), or should we let legalization run its course first?

	// Shuffle phase:
	// Take pairs of vectors, and shuffle them so that the result has elements
	// from these vectors in the correct places.
	// For example, given:
	// t10: i32 = extract_vector_elt t1, Constant:i64<0>
	// t11: i32 = extract_vector_elt t2, Constant:i64<0>
	// t12: i32 = extract_vector_elt t3, Constant:i64<0>
	// t13: i32 = extract_vector_elt t1, Constant:i64<1>
	// t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
	// We will generate:
	// t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
	// t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
	SmallVector<SDValue, 4> Shuffles;
	for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
	// Input vectors are numbered from 1, so pair In covers 2*In+1 and 2*In+2.
	unsigned LeftIdx = 2 * In + 1;
	SDValue VecLeft = VecIn[LeftIdx];
	// The last pair has no right-hand vector when the input count is odd.
	SDValue VecRight =
	(LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();

	if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
	VecRight, LeftIdx))
	Shuffles.push_back(Shuffle);
	else
	return SDValue();
	}

	// If we need the zero vector as an "ingredient" in the blend tree, add it
	// to the list of shuffles.
	if (UsesZeroVector)
	Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT));

	// If we only have one shuffle, we're done.
	if (Shuffles.size() == 1)
	return Shuffles[0];

	// Update the vector mask to point to the post-shuffle vectors.
	// The zero vector is the last entry of Shuffles; input vector k was folded
	// into shuffle (k - 1) / 2.
	for (int &Vec : VectorMask)
	if (Vec == 0)
	Vec = Shuffles.size() - 1;
	else
	Vec = (Vec - 1) / 2;

	// More than one shuffle. Generate a binary tree of blends, e.g. if from
	// the previous step we got the set of shuffles t10, t11, t12, t13, we will
	// generate:
	// t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
	// t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
	// t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
	// t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
	// t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
	// t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
	// t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21

	// Make sure the initial size of the shuffle list is even.
	if (Shuffles.size() % 2)
	Shuffles.push_back(DAG.getUNDEF(VT));

	// Each pass blends adjacent pairs in place, halving the live prefix of
	// Shuffles until a single vector remains.
	for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
	// Pad an odd level with undef; this slot holds a stale entry from the
	// previous, larger level.
	if (CurSize % 2) {
	Shuffles[CurSize] = DAG.getUNDEF(VT);
	CurSize++;
	}
	for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
	int Left = 2 * In;
	int Right = 2 * In + 1;
	SmallVector<int, 8> Mask(NumElems, -1);
	for (unsigned i = 0; i != NumElems; ++i) {
	// Keep lane i from whichever side currently owns it, then record that the
	// merged vector In owns it from now on.
	if (VectorMask[i] == Left) {
	Mask[i] = i;
	VectorMask[i] = In;
	} else if (VectorMask[i] == Right) {
	Mask[i] = i + NumElems;
	VectorMask[i] = In;
	}
	}

	Shuffles[In] =
	DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
	}
	}
	return Shuffles[0];
	}

	/// Combine a BUILD_VECTOR node.
	///
	/// Handles the all-undef case, recognizes (before type legalization) a run
	/// of consecutive extracts from one vector as that vector or an
	/// EXTRACT_SUBVECTOR of it, and otherwise defers to the extension,
	/// conversion and shuffle reductions. Returns an empty SDValue when nothing
	/// applies.
	SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
	  EVT VT = N->getValueType(0);

	  // Every operand undef means the whole vector is undef.
	  if (ISD::allOperandsUndef(N))
	    return DAG.getUNDEF(VT);

	  // See whether the operands are consecutive lanes of a single source vector.
	  if (!LegalTypes && (N->getNumOperands() > 1)) {
	    SDValue First = N->getOperand(0);

	    // Yields the constant extract index of Op if Op extracts from the same
	    // vector as the first operand, and -1 (as uint64_t) otherwise.
	    auto laneOfSharedSource = [&](SDValue Op) -> uint64_t {
	      if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
	          (First.getOperand(0) == Op.getOperand(0)))
	        if (auto Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
	          return Cst->getZExtValue();
	      return -1;
	    };

	    // Operand i has to read lane Offset + i; any mismatch resets Offset to -1.
	    int Offset = laneOfSharedSource(First);
	    unsigned OpCount = N->getNumOperands();
	    for (unsigned i = 0; i < OpCount; ++i) {
	      if (Offset + i == laneOfSharedSource(N->getOperand(i)))
	        continue;
	      Offset = -1;
	      break;
	    }

	    // Starting at lane 0 of a vector of the same type: reuse that vector.
	    if ((Offset == 0) && (First.getOperand(0).getValueType() == VT))
	      return First.getOperand(0);

	    // Otherwise the start lane must be a multiple of the output length to
	    // express the result as a subvector extract.
	    if ((Offset != -1) && ((Offset % VT.getVectorNumElements()) == 0))
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
	                         First.getOperand(0), First.getOperand(1));
	  }

	  if (SDValue Reduced = reduceBuildVecExtToExtBuildVec(N))
	    return Reduced;

	  if (SDValue Reduced = reduceBuildVecConvertToConvertBuildVec(N))
	    return Reduced;

	  if (SDValue Reduced = reduceBuildVecToShuffle(N))
	    return Reduced;

	  return SDValue();
	}

	/// Fold a CONCAT_VECTORS whose operands are all either UNDEF or bitcasts of
	/// scalars into a bitcast of a single BUILD_VECTOR of those scalars.
	///
	/// Bails out (empty SDValue) if the operand vector type is already legal, if
	/// any operand has another form, or if a scalar is neither integer nor
	/// floating point.
	static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
	  EVT OpVT = N->getOperand(0).getValueType();

	  // Legal operand vectors are left alone.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (TLI.isTypeLegal(OpVT))
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  // Start by assuming an integer scalar as wide as one operand vector.
	  EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
	  SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);

	  // Collect the scalars and record which kinds of scalar we saw.
	  SmallVector<SDValue, 8> Ops;
	  bool AnyInteger = false;
	  bool AnyFP = false;
	  for (const SDValue &Op : N->ops()) {
	    SDValue Scalar;
	    if (Op.getOpcode() == ISD::BITCAST &&
	        !Op.getOperand(0).getValueType().isVector())
	      Scalar = Op.getOperand(0);
	    else if (Op.getOpcode() == ISD::UNDEF)
	      Scalar = ScalarUndef;
	    else
	      return SDValue();

	    // Anything that is neither FP nor integer (it could be something weird
	    // like x86mmx) stops the fold.
	    EVT ScalarVT = Scalar.getValueType();
	    if (ScalarVT.isFloatingPoint())
	      AnyFP = true;
	    else if (ScalarVT.isInteger())
	      AnyInteger = true;
	    else
	      return SDValue();

	    Ops.push_back(Scalar);
	  }

	  // With at least one FP scalar, switch the element type to FP. If integer
	  // scalars were seen too, bitcast them, and swap integer UNDEFs for an
	  // UNDEF of the final type.
	  if (AnyFP) {
	    SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
	    ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
	    if (AnyInteger) {
	      for (SDValue &Elt : Ops) {
	        if (Elt.getValueType() != SVT)
	          Elt = Elt.isUndef() ? ScalarUndef : DAG.getBitcast(SVT, Elt);
	      }
	    }
	  }

	  unsigned NumScalars = VT.getSizeInBits() / SVT.getSizeInBits();
	  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumScalars);
	  SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
	  return DAG.getBitcast(VT, BV);
	}

	// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
	// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
	// most two distinct vectors the same size as the result, attempt to turn this
	// into a legal shuffle.
	//
	// Returns the VECTOR_SHUFFLE replacement, or an empty SDValue when an operand
	// is neither UNDEF nor a constant-index EXTRACT_SUBVECTOR, when more than two
	// source vectors are referenced, or when the target rejects the mask.
	static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  EVT OpVT = N->getOperand(0).getValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumOpElts = OpVT.getVectorNumElements();

	  // The (up to) two shuffle inputs; UNDEF means "slot not taken yet".
	  SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
	  SmallVector<int, 8> Mask;

	  for (SDValue Op : N->ops()) {
	    // Peek through any bitcast.
	    Op = peekThroughBitcast(Op);

	    // UNDEF nodes convert to UNDEF shuffle mask values.
	    if (Op.isUndef()) {
	      Mask.append((unsigned)NumOpElts, -1);
	      continue;
	    }

	    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	      return SDValue();

	    // What vector are we extracting the subvector from and at what index?
	    SDValue ExtVec = Op.getOperand(0);

	    // We want the EVT of the original extraction to correctly scale the
	    // extraction index.
	    EVT ExtVT = ExtVec.getValueType();

	    // Peek through any bitcast.
	    ExtVec = peekThroughBitcast(ExtVec);

	    // UNDEF nodes convert to UNDEF shuffle mask values.
	    if (ExtVec.isUndef()) {
	      Mask.append((unsigned)NumOpElts, -1);
	      continue;
	    }

	    if (!isa<ConstantSDNode>(Op.getOperand(1)))
	      return SDValue();
	    int ExtIdx = Op.getConstantOperandVal(1);

	    // Ensure that we are extracting a subvector from a vector the same
	    // size as the result.
	    if (ExtVT.getSizeInBits() != VT.getSizeInBits())
	      return SDValue();

	    // Scale the subvector index to account for any bitcast.
	    int NumExtElts = ExtVT.getVectorNumElements();
	    if (0 == (NumExtElts % NumElts))
	      ExtIdx /= (NumExtElts / NumElts);
	    else if (0 == (NumElts % NumExtElts))
	      ExtIdx *= (NumElts / NumExtElts);
	    else
	      return SDValue();

	    // At most we can reference 2 inputs in the final shuffle.
	    if (SV0.isUndef() || SV0 == ExtVec) {
	      SV0 = ExtVec;
	      for (int i = 0; i != NumOpElts; ++i)
	        Mask.push_back(i + ExtIdx);
	    } else if (SV1.isUndef() || SV1 == ExtVec) {
	      SV1 = ExtVec;
	      // Elements of the second shuffle input are numbered from NumElts.
	      for (int i = 0; i != NumOpElts; ++i)
	        Mask.push_back(i + ExtIdx + NumElts);
	    } else {
	      return SDValue();
	    }
	  }

	  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT))
	    return SDValue();

	  return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
	                              DAG.getBitcast(VT, SV1), Mask);
	}

	/// Combine a CONCAT_VECTORS node.
	///
	/// Folds tried, in order:
	///  * a single-operand concat is its operand;
	///  * an all-undef concat is UNDEF;
	///  * concat(bitcast(scalar), undef...) -> bitcast(scalar_to_vector(scalar));
	///  * concat of BUILD_VECTOR/UNDEF operands -> one BUILD_VECTOR;
	///  * concat of bitcast scalars -> BUILD_VECTOR (combineConcatVectorOfScalars);
	///  * concat of EXTRACT_SUBVECTORs -> VECTOR_SHUFFLE
	///    (combineConcatVectorOfExtracts);
	///  * concat of in-place EXTRACT_SUBVECTORs of one same-typed source -> that
	///    source.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
	  // If we only have one input vector, we don't need to do any concatenation.
	  if (N->getNumOperands() == 1)
	    return N->getOperand(0);

	  // Check if all of the operands are undefs.
	  EVT VT = N->getValueType(0);
	  if (ISD::allOperandsUndef(N))
	    return DAG.getUNDEF(VT);

	  // Optimize concat_vectors where all but the first of the vectors are undef.
	  if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
	        return Op.isUndef();
	      })) {
	    SDValue In = N->getOperand(0);
	    assert(In.getValueType().isVector() && "Must concat vectors");

	    // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
	    if (In->getOpcode() == ISD::BITCAST &&
	        !In->getOperand(0).getValueType().isVector()) {
	      SDValue Scalar = In->getOperand(0);

	      // If the bitcast type isn't legal, it might be a trunc of a legal type;
	      // look through the trunc so we can still do the transform:
	      //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
	      if (Scalar->getOpcode() == ISD::TRUNCATE &&
	          !TLI.isTypeLegal(Scalar.getValueType()) &&
	          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
	        Scalar = Scalar->getOperand(0);

	      EVT SclTy = Scalar->getValueType(0);

	      if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
	        return SDValue();

	      // The result must hold at least two scalars of this type.
	      unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
	      if (VNTNumElms < 2)
	        return SDValue();

	      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
	      if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
	        return SDValue();

	      SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
	      return DAG.getBitcast(VT, Res);
	    }
	  }

	  // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
	  // We have already tested above for an UNDEF only concatenation.
	  // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
	  // -> (BUILD_VECTOR A, B, ..., C, D, ...)
	  auto IsBuildVectorOrUndef = [](const SDValue &Op) {
	    return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
	  };
	  if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
	    SmallVector<SDValue, 8> Opnds;
	    EVT SVT = VT.getScalarType();

	    EVT MinVT = SVT;
	    if (!SVT.isFloatingPoint()) {
	      // If the BUILD_VECTORs are built from integers, they may have different
	      // operand types. Get the smallest type and truncate all operands to it.
	      bool FoundMinVT = false;
	      for (const SDValue &Op : N->ops())
	        if (ISD::BUILD_VECTOR == Op.getOpcode()) {
	          EVT OpSVT = Op.getOperand(0).getValueType();
	          MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
	          FoundMinVT = true;
	        }
	      assert(FoundMinVT && "Concat vector type mismatch");
	    }

	    for (const SDValue &Op : N->ops()) {
	      EVT OpVT = Op.getValueType();
	      unsigned NumElts = OpVT.getVectorNumElements();

	      // An UNDEF operand contributes NumElts undef scalars.
	      if (ISD::UNDEF == Op.getOpcode())
	        Opnds.append(NumElts, DAG.getUNDEF(MinVT));

	      if (ISD::BUILD_VECTOR == Op.getOpcode()) {
	        if (SVT.isFloatingPoint()) {
	          assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
	          Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
	        } else {
	          for (unsigned i = 0; i != NumElts; ++i)
	            Opnds.push_back(
	                DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
	        }
	      }
	    }

	    assert(VT.getVectorNumElements() == Opnds.size() &&
	           "Concat vector type mismatch");
	    return DAG.getBuildVector(VT, SDLoc(N), Opnds);
	  }

	  // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
	  if (SDValue V = combineConcatVectorOfScalars(N, DAG))
	    return V;

	  // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
	  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
	    if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
	      return V;

	  // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
	  // nodes often generate nop CONCAT_VECTOR nodes.
	  // Scan the CONCAT_VECTOR operands and look for CONCAT operations that
	  // place the incoming vectors at the exact same location.
	  SDValue SingleSource = SDValue();
	  unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();

	  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	    SDValue Op = N->getOperand(i);

	    if (Op.isUndef())
	      continue;

	    // Check if this is the identity extract:
	    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	      return SDValue();

	    // Find the single incoming vector for the extract_subvector.
	    if (SingleSource.getNode()) {
	      if (Op.getOperand(0) != SingleSource)
	        return SDValue();
	    } else {
	      SingleSource = Op.getOperand(0);

	      // Check the source type is the same as the type of the result.
	      // If not, this concat may extend the vector, so we can not
	      // optimize it away.
	      if (SingleSource.getValueType() != N->getValueType(0))
	        return SDValue();
	    }

	    unsigned IdentityIndex = i * PartNumElem;
	    ConstantSDNode *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	    // The extract index must be constant.
	    if (!CS)
	      return SDValue();

	    // Check that we are reading from the identity index.
	    if (CS->getZExtValue() != IdentityIndex)
	      return SDValue();
	  }

	  if (SingleSource.getNode())
	    return SingleSource;

	  return SDValue();
	}

	/// Narrow an EXTRACT_SUBVECTOR of a wide AND/OR/XOR when at least one operand
	/// of that wide operation is a two-operand CONCAT_VECTORS: perform the
	/// operation directly on the narrow pieces so the concatenation and the
	/// extraction are avoided.
	///
	/// \returns the narrowed value bitcast to the extract's type, or an empty
	/// SDValue if the pattern does not match.
	static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
	  // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
	  // some of these bailouts with other transforms.

	  // A constant extract index is required to pick the matching concat operand.
	  auto *IdxC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
	  if (!IdxC)
	    return SDValue();

	  // Only the "double, then halve" shape is handled; a larger ratio may need
	  // more than two narrow binops to replace the wide one.
	  EVT VT = Extract->getValueType(0);
	  unsigned NumElems = VT.getVectorNumElements();
	  assert((IdxC->getZExtValue() % NumElems) == 0 &&
	         "Extract index is not a multiple of the vector length.");
	  SDValue WideSrc = Extract->getOperand(0);
	  if (WideSrc.getValueSizeInBits() != VT.getSizeInBits() * 2)
	    return SDValue();

	  // The source is a wide vector binary operator, possibly behind a bitcast.
	  SDValue BinOp = peekThroughBitcast(WideSrc);

	  // TODO: The motivating case for this transform is an x86 AVX1 target. That
	  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
	  // flavors, but no other 256-bit integer support. This could be extended to
	  // handle any binop, but that may require fixing/adding other folds to avoid
	  // codegen regressions.
	  unsigned BOpcode = BinOp.getOpcode();
	  switch (BOpcode) {
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    break;
	  default:
	    return SDValue();
	  }

	  // Only a vector-typed binop can be chopped in half.
	  EVT WideBVT = BinOp.getValueType();
	  if (!WideBVT.isVector())
	    return SDValue();

	  // The target must support the half-width form of the operation.
	  EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
	                                   WideBVT.getVectorNumElements() / 2);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
	    return SDValue();

	  // Look through bitcasts on the binop's operands.
	  SDValue LHS = peekThroughBitcast(BinOp.getOperand(0));
	  SDValue RHS = peekThroughBitcast(BinOp.getOperand(1));

	  // At least one operand must be a concat that doubles the vector size,
	  // otherwise the transform is not worthwhile.
	  // TODO: Should we also handle INSERT_SUBVECTOR patterns?
	  auto isDoublingConcat = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2;
	  };
	  bool ConcatL = isDoublingConcat(LHS);
	  bool ConcatR = isDoublingConcat(RHS);
	  if (!ConcatL && !ConcatR)
	    return SDValue();

	  // A non-concat operand needs its own half-sized extract. The original
	  // index operand cannot be reused because a bitcast may have changed the
	  // element count, so recompute it in NarrowBVT elements.
	  unsigned ConcatOpNum = IdxC->getZExtValue() / NumElems;
	  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
	  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
	  SDLoc DL(Extract);

	  // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
	  // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N)
	  // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN
	  auto narrowOperand = [&](bool FromConcat, SDValue Peeked,
	                           unsigned OpNo) -> SDValue {
	    if (FromConcat)
	      return DAG.getBitcast(NarrowBVT, Peeked.getOperand(ConcatOpNum));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
	                       BinOp.getOperand(OpNo),
	                       DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
	  };
	  SDValue X = narrowOperand(ConcatL, LHS, 0);
	  SDValue Y = narrowOperand(ConcatR, RHS, 1);

	  return DAG.getBitcast(VT, DAG.getNode(BOpcode, DL, NarrowBVT, X, Y));
	}

	/// If we are extracting a subvector from a wide vector load, convert to a
	/// narrow load to eliminate the extraction:
	/// (extract_subvector (load wide vector)) --> (load narrow vector)
	///
	/// Applies only on little-endian targets, to a one-use, non-extending,
	/// non-volatile load extracted at a constant index.
	///
	/// \returns the new narrow load, or an empty SDValue if the pattern does not
	/// match.
	static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
	  // TODO: Add support for big-endian. The offset calculation must be adjusted.
	  if (DAG.getDataLayout().isBigEndian())
	    return SDValue();

	  // TODO: The one-use check is overly conservative. Check the cost of the
	  // extract instead or remove that condition entirely.
	  auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
	  auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
	  if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
	      !ExtIdx)
	    return SDValue();

	  // The narrow load will be offset from the base address of the old load if
	  // we are extracting from something besides index 0 (little-endian).
	  EVT VT = Extract->getValueType(0);
	  SDLoc DL(Extract);
	  SDValue BaseAddr = Ld->getOperand(1);
	  // Byte offset = element index * store size of one element.
	  unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();

	  // TODO: Use "BaseIndexOffset" to make this more effective.
	  SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
	                                                   VT.getStoreSize());
	  SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
	  DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	  return NewLd;
	}

	/// Combine an EXTRACT_SUBVECTOR node.
	///
	/// Folds tried, in order: extract of UNDEF; extract of a wide load
	/// (narrowExtractedVectorLoad); extract of a CONCAT_VECTORS operand; extract
	/// of a BUILD_VECTOR; extract of an INSERT_SUBVECTOR of the same size; and
	/// extract of a wide bitwise binop (narrowExtractedVectorBinOp).
	///
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
	  EVT NVT = N->getValueType(0);
	  SDValue V = N->getOperand(0);

	  // Extract from UNDEF is UNDEF.
	  if (V.isUndef())
	    return DAG.getUNDEF(NVT);

	  if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
	    if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
	      return NarrowLoad;

	  // Combine:
	  //    (extract_subvec (concat V1, V2, ...), i)
	  // Into:
	  //    Vi if possible
	  // Only operand 0 is checked as 'concat' assumes all inputs of the same
	  // type.
	  if (V->getOpcode() == ISD::CONCAT_VECTORS &&
	      isa<ConstantSDNode>(N->getOperand(1)) &&
	      V->getOperand(0).getValueType() == NVT) {
	    unsigned Idx = N->getConstantOperandVal(1);
	    unsigned NumElems = NVT.getVectorNumElements();
	    assert((Idx % NumElems) == 0 &&
	           "IDX in concat is not a multiple of the result vector length.");
	    return V->getOperand(Idx / NumElems);
	  }

	  // Skip bitcasting
	  V = peekThroughBitcast(V);

	  // If the input is a build vector. Try to make a smaller build vector.
	  if (V->getOpcode() == ISD::BUILD_VECTOR) {
	    if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
	      EVT InVT = V->getValueType(0);
	      unsigned ExtractSize = NVT.getSizeInBits();
	      unsigned EltSize = InVT.getScalarSizeInBits();
	      // Only do this if we won't split any elements.
	      if (ExtractSize % EltSize == 0) {
	        unsigned NumElems = ExtractSize / EltSize;
	        EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
	                                         InVT.getVectorElementType(), NumElems);
	        if ((!LegalOperations ||
	             TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) &&
	            (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
	          // Rescale the index from NVT elements to BUILD_VECTOR elements.
	          unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
	                            EltSize;

	          // Extract the pieces from the original build_vector.
	          SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
	                                                makeArrayRef(V->op_begin() + IdxVal,
	                                                             NumElems));
	          return DAG.getBitcast(NVT, BuildVec);
	        }
	      }
	    }
	  }

	  if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
	    // Handle only simple case where vector being inserted and vector
	    // being extracted are of same size.
	    EVT SmallVT = V->getOperand(1).getValueType();
	    if (!NVT.bitsEq(SmallVT))
	      return SDValue();

	    // Only handle cases where both indexes are constants.
	    ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
	    ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));

	    if (InsIdx && ExtIdx) {
	      // Combine:
	      //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
	      // Into:
	      //    indices are equal or bit offsets are equal => V2
	      //    otherwise => (extract_subvec V1, ExtIdx)
	      if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() ==
	          ExtIdx->getZExtValue() * NVT.getScalarSizeInBits())
	        return DAG.getBitcast(NVT, V->getOperand(1));
	      return DAG.getNode(
	          ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
	          DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)),
	          N->getOperand(1));
	    }
	  }

	  if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
	    return NarrowBOp;

	  return SDValue();
	}

	/// Simplify a shuffle operand V given which of its elements are actually
	/// read.
	///
	/// \param UsedElements one bit per element of V, set if the element is used.
	///        In the INSERT_SUBVECTOR case the bits covered by the inserted
	///        subvector are cleared in this (caller-owned) bit vector before
	///        recursing on the base vector.
	/// \param V the operand to simplify. Only CONCAT_VECTORS and
	///        INSERT_SUBVECTOR are looked through; any other node is returned
	///        unchanged.
	/// \returns V itself when nothing changed, otherwise a rebuilt node in which
	///          unused pieces are replaced by UNDEF.
	static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
	                                                 SDValue V, SelectionDAG &DAG) {
	  SDLoc DL(V);
	  EVT VT = V.getValueType();

	  switch (V.getOpcode()) {
	  default:
	    return V;

	  case ISD::CONCAT_VECTORS: {
	    EVT OpVT = V->getOperand(0).getValueType();
	    int OpSize = OpVT.getVectorNumElements();
	    SmallBitVector OpUsedElements(OpSize, false);
	    bool FoundSimplification = false;
	    SmallVector<SDValue, 4> NewOps;
	    NewOps.reserve(V->getNumOperands());
	    for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
	      SDValue Op = V->getOperand(i);
	      // Project the used-element bits onto this concat operand.
	      bool OpUsed = false;
	      for (int j = 0; j < OpSize; ++j)
	        if (UsedElements[i * OpSize + j]) {
	          OpUsedElements[j] = true;
	          OpUsed = true;
	        }
	      NewOps.push_back(
	          OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
	                 : DAG.getUNDEF(OpVT));
	      // Rebuild the concat only if some operand actually changed. Comparing
	      // with '==' here would skip the rebuild exactly when every operand
	      // was simplified.
	      FoundSimplification |= Op != NewOps.back();
	      OpUsedElements.reset();
	    }
	    if (FoundSimplification)
	      V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
	    return V;
	  }

	  case ISD::INSERT_SUBVECTOR: {
	    SDValue BaseV = V->getOperand(0);
	    SDValue SubV = V->getOperand(1);
	    auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
	    if (!IdxN)
	      return V;

	    int SubSize = SubV.getValueType().getVectorNumElements();
	    int Idx = IdxN->getZExtValue();
	    bool SubVectorUsed = false;
	    SmallBitVector SubUsedElements(SubSize, false);
	    // Elements covered by the inserted subvector come from SubV, not from
	    // the base, so move their "used" bits over to SubUsedElements.
	    for (int i = 0; i < SubSize; ++i)
	      if (UsedElements[i + Idx]) {
	        SubVectorUsed = true;
	        SubUsedElements[i] = true;
	        UsedElements[i + Idx] = false;
	      }

	    // Now recurse on both the base and sub vectors.
	    SDValue SimplifiedSubV =
	        SubVectorUsed
	            ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
	            : DAG.getUNDEF(SubV.getValueType());
	    SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
	    if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
	      V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	                      SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
	    return V;
	  }
	  }
	}

	/// Simplify both inputs of a shuffle based on which of their elements the
	/// shuffle mask actually reads.
	///
	/// \returns a new shuffle with the same mask over the simplified inputs, or
	/// an empty SDValue if neither input changed.
	static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
	                                       SDValue N1, SelectionDAG &DAG) {
	  EVT VT = SVN->getValueType(0);
	  int NumElts = VT.getVectorNumElements();

	  // Mask values in [0, NumElts) read N0; values >= NumElts read N1; negative
	  // values read nothing.
	  SmallBitVector UsedInN0(NumElts, false);
	  SmallBitVector UsedInN1(NumElts, false);
	  for (int MaskElt : SVN->getMask()) {
	    if (MaskElt < 0)
	      continue;
	    if (MaskElt < NumElts)
	      UsedInN0[MaskElt] = true;
	    else
	      UsedInN1[MaskElt - NumElts] = true;
	  }

	  SDValue NewN0 = simplifyShuffleOperandRecursively(UsedInN0, N0, DAG);
	  SDValue NewN1 = simplifyShuffleOperandRecursively(UsedInN1, N1, DAG);
	  bool Unchanged = (NewN0 == N0) && (NewN1 == N1);
	  if (Unchanged)
	    return SDValue();

	  return DAG.getVectorShuffle(VT, SDLoc(SVN), NewN0, NewN1, SVN->getMask());
	}

	/// Replace shuffle mask entries that select a known-undef source element
	/// with -1 (undef).
	///
	/// An element is known undef when it is an undef operand of a BUILD_VECTOR,
	/// or any lane other than 0 (or an undef lane 0) of a SCALAR_TO_VECTOR.
	///
	/// \returns a new shuffle of N0/N1 with the updated mask, or an empty SDValue
	/// if no mask entry changed.
	static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
	                                   SDValue N1, SelectionDAG &DAG) {
	  auto isUndefElt = [](SDValue V, int Idx) {
	    // TODO - handle more cases as required.
	    if (V.getOpcode() == ISD::BUILD_VECTOR)
	      return V.getOperand(Idx).isUndef();
	    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
	      return (Idx != 0) || V.getOperand(0).isUndef();
	    return false;
	  };

	  EVT VT = SVN->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();

	  bool Changed = false;
	  SmallVector<int, 8> NewMask;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    int Idx = SVN->getMaskElt(i);
	    // Indices [0, NumElts) select from N0 and [NumElts, 2*NumElts) from N1.
	    // The N1 test uses '<=' so that index NumElts (element 0 of N1) is
	    // checked as well.
	    if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) ||
	        ((int)NumElts <= Idx && isUndefElt(N1, Idx - NumElts))) {
	      Changed = true;
	      Idx = -1;
	    }
	    NewMask.push_back(Idx);
	  }
	  if (Changed)
	    return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask);

	  return SDValue();
	}

	// Rewrite a shuffle whose inputs are CONCAT_VECTORS either as a single
	// CONCAT_VECTORS of the original pieces, or (for a shuffle of one two-piece
	// concat that leaves the high half undef) as a narrower shuffle concatenated
	// with UNDEF. Returns an empty SDValue if the mask does not copy whole,
	// aligned pieces.
	static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
	  ArrayRef<int> Mask = SVN->getMask();

	  EVT ConcatVT = N0.getOperand(0).getValueType();
	  unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
	  unsigned NumConcats = NumElts / NumElemsPerConcat;

	  // Special case: shuffle(concat(A,B)) is better expressed as
	  // concat(shuffle(A,B),UNDEF) when no high-half result element is defined.
	  if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
	      std::all_of(Mask.begin() + NumElemsPerConcat, Mask.end(),
	                  [](int MaskElt) { return MaskElt == -1; })) {
	    SDValue LowShuffle = DAG.getVectorShuffle(
	        ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
	        makeArrayRef(Mask.begin(), NumElemsPerConcat));
	    SDValue HighUndef = DAG.getUNDEF(ConcatVT);
	    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, LowShuffle,
	                       HighUndef);
	  }

	  // Walk the result one piece at a time; every piece must be either fully
	  // undef or an exact copy of one piece of an input concat.
	  SmallVector<SDValue, 4> Ops;
	  for (unsigned I = 0; I != NumConcats; ++I) {
	    unsigned Begin = I * NumElemsPerConcat;
	    unsigned End = Begin + NumElemsPerConcat;

	    // Count the defined mask entries of this piece.
	    unsigned NumDefined = 0;
	    for (unsigned J = Begin; J != End; ++J)
	      if (Mask[J] >= 0)
	        ++NumDefined;

	    if (NumDefined == NumElemsPerConcat) {
	      // The copy must start on a piece boundary ...
	      if (Mask[Begin] % NumElemsPerConcat != 0)
	        return SDValue();

	      // ... and run through consecutive source elements.
	      for (unsigned J = Begin; J + 1 != End; ++J)
	        if (Mask[J] + 1 != Mask[J + 1])
	          return SDValue();

	      // Piece numbers beyond N0's operand count refer to N1's operands.
	      unsigned FirstElt = Mask[Begin] / NumElemsPerConcat;
	      unsigned NumN0Ops = N0.getNumOperands();
	      Ops.push_back(FirstElt < NumN0Ops
	                        ? N0.getOperand(FirstElt)
	                        : N1.getOperand(FirstElt - NumN0Ops));
	      continue;
	    }

	    // A mix of defined and undef entries cannot be expressed as a concat.
	    if (NumDefined != 0)
	      return SDValue();

	    Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType()));
	  }

	  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
	}

	// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
	// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
	//
	// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
	// a simplification in some sense, but it isn't appropriate in general: some
	// BUILD_VECTORs are substantially cheaper than others. The general case
	// of a BUILD_VECTOR requires inserting each element individually (or
	// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
	// all constants is a single constant pool load. A BUILD_VECTOR where each
	// element is identical is a splat. A BUILD_VECTOR where most of the operands
	// are undef lowers to a small number of element insertions.
	//
	// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
	// We don't fold shuffles where one side is a non-zero constant, and we don't
	// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
	// non-constant operands. This seems to work out reasonably well in practice.
	//
	// Returns the merged BUILD_VECTOR, or an empty SDValue if the fold is not
	// possible or is rejected by the heuristics above.
	static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
	                                       SelectionDAG &DAG,
	                                       const TargetLowering &TLI) {
	  EVT VT = SVN->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();
	  SDValue N0 = SVN->getOperand(0);
	  SDValue N1 = SVN->getOperand(1);

	  if (!N0->hasOneUse() || !N1->hasOneUse())
	    return SDValue();

	  // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
	  // discussed above.
	  if (!N1.isUndef()) {
	    bool N0AnyConst = isAnyConstantBuildVector(N0.getNode());
	    bool N1AnyConst = isAnyConstantBuildVector(N1.getNode());
	    if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
	      return SDValue();
	    if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
	      return SDValue();
	  }

	  // If both inputs are splats of the same value then we can safely merge this
	  // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
	  bool IsSplat = false;
	  auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
	  if (BV0 && BV1)
	    if (SDValue Splat0 = BV0->getSplatValue())
	      IsSplat = (Splat0 == BV1->getSplatValue());

	  SmallVector<SDValue, 8> Ops;
	  SmallSet<SDValue, 16> DuplicateOps;
	  for (int M : SVN->getMask()) {
	    // Undef mask entries (M < 0) become undef scalars.
	    SDValue Op = DAG.getUNDEF(VT.getScalarType());
	    if (M >= 0) {
	      // Map the mask value to (source vector, element index).
	      int Idx = M < (int)NumElts ? M : M - NumElts;
	      SDValue &S = (M < (int)NumElts ? N0 : N1);
	      if (S.getOpcode() == ISD::BUILD_VECTOR) {
	        Op = S.getOperand(Idx);
	      } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	        assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index.");
	        Op = S.getOperand(0);
	      } else {
	        // Operand can't be combined - bail out.
	        return SDValue();
	      }
	    }

	    // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
	    // generating a splat; semantically, this is fine, but it's likely to
	    // generate low-quality code if the target can't reconstruct an appropriate
	    // shuffle.
	    if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
	      if (!IsSplat && !DuplicateOps.insert(Op).second)
	        return SDValue();

	    Ops.push_back(Op);
	  }

	  // BUILD_VECTOR requires all inputs to be of the same type, find the
	  // maximum type and extend them all.
	  EVT SVT = VT.getScalarType();
	  if (SVT.isInteger())
	    for (SDValue &Op : Ops)
	      SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
	  if (SVT != VT.getScalarType())
	    for (SDValue &Op : Ops)
	      Op = TLI.isZExtFree(Op.getValueType(), SVT)
	               ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
	               : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
	  return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
	}

	// Match shuffles that can be converted to any_vector_extend_in_reg.
	// This is often generated during legalization.
	// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
	// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
	//
	// Only integer vectors on little-endian targets are handled. Returns the
	// extend node bitcast back to the shuffle's type, or an empty SDValue if no
	// power-of-2 scale matches and is allowed by LegalTypes/LegalOperations.
	static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
	                                            SelectionDAG &DAG,
	                                            const TargetLowering &TLI,
	                                            bool LegalOperations,
	                                            bool LegalTypes) {
	  EVT VT = SVN->getValueType(0);
	  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

	  // TODO Add support for big-endian when we have a test case.
	  if (!VT.isInteger() || IsBigEndian)
	    return SDValue();

	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  ArrayRef<int> Mask = SVN->getMask();
	  SDValue N0 = SVN->getOperand(0);

	  // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
	  // Every defined mask entry must sit at a multiple of Scale and select
	  // source element (position / Scale).
	  auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
	    for (unsigned i = 0; i != NumElts; ++i) {
	      if (Mask[i] < 0)
	        continue;
	      if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
	        continue;
	      return false;
	    }
	    return true;
	  };

	  // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
	  // power-of-2 extensions as they are the most likely.
	  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
	    // Check for non power of 2 vector sizes
	    if (NumElts % Scale != 0)
	      continue;
	    if (!isAnyExtend(Scale))
	      continue;

	    // The widened element is Scale times as wide; the vector has
	    // correspondingly fewer elements.
	    EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
	    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
	    if (!LegalTypes || TLI.isTypeLegal(OutVT))
	      if (!LegalOperations ||
	          TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
	        return DAG.getBitcast(VT,
	                              DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
	  }

	  return SDValue();
	}

	// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
	// each source element of a large type into the lowest elements of a smaller
	// destination type. This is often generated during legalization.
	// If the source node itself was a '*_extend_vector_inreg' node then we should
	// then be able to remove it.
	//
	// Only integer vectors on little-endian targets are handled. Returns the
	// extend's source bitcast to the shuffle's type, or an empty SDValue if the
	// pattern does not match.
	static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
	                                        SelectionDAG &DAG) {
	  EVT VT = SVN->getValueType(0);
	  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

	  // TODO Add support for big-endian when we have a test case.
	  if (!VT.isInteger() || IsBigEndian)
	    return SDValue();

	  SDValue N0 = peekThroughBitcast(SVN->getOperand(0));

	  unsigned Opcode = N0.getOpcode();
	  if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
	      Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
	      Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  ArrayRef<int> Mask = SVN->getMask();
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
	  unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();

	  if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
	    return SDValue();
	  unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;

	  // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
	  // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
	  // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
	  // Every defined mask entry i must select source element i * Scale.
	  auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
	    for (unsigned i = 0; i != NumElts; ++i) {
	      if (Mask[i] < 0)
	        continue;
	      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
	        continue;
	      return false;
	    }
	    return true;
	  };

	  // At the moment we just handle the case where we've truncated back to the
	  // same size as before the extension.
	  // TODO: handle more extension/truncation cases as cases arise.
	  if (EltSizeInBits != ExtSrcSizeInBits)
	    return SDValue();

	  // We can remove *extend_vector_inreg only if the truncation happens at
	  // the same scale as the extension.
	  if (isTruncate(ExtScale))
	    return DAG.getBitcast(VT, N00);

	  return SDValue();
	}

	// Fold a shuffle whose only input is a splat-shuffle:
	// shuffle (shuffle V, undef, splat-mask), undef, M
	// Undef elements in the splat mask need care: the folded result must not
	// expose an undef lane that composing the two masks would not produce.
	static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
	                                     ShuffleVectorSDNode *Splat,
	                                     SelectionDAG &DAG) {
	  ArrayRef<int> SplatMask = Splat->getMask();
	  assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");

	  // First choice: reuse the existing splat-shuffle unchanged. That is legal
	  // when every undef lane of the splat mask is either undef in the user mask
	  // too, or would compose to undef anyway.
	  // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
	  // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
	  // Not legal to reuse the splat: lane 1 would become undef although the
	  // composed mask defines it.
	  // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
	  // The composition equals SplatMask, so reusing the splat is fine.
	  // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
	  // The composition has every undef of SplatMask plus an extra undef in
	  // lane 0, so reusing the splat is safe.
	  bool ReuseSplat = true;
	  for (unsigned Lane = 0, NumLanes = UserMask.size(); Lane != NumLanes;
	       ++Lane) {
	    int Src = UserMask[Lane];
	    if (Src == -1 || SplatMask[Lane] != -1)
	      continue;
	    // The splat leaves this lane undef; that is only a problem when the
	    // composed mask would have produced a defined element here.
	    if (SplatMask[Src] != -1) {
	      ReuseSplat = false;
	      break;
	    }
	  }
	  if (ReuseSplat)
	    return SDValue(Splat, 0);

	  // Otherwise build a new shuffle of the splat's operands using the
	  // composition of both masks.
	  SmallVector<int, 32> Composed;
	  for (int Src : UserMask) {
	    if (Src == -1)
	      Composed.push_back(-1);
	    else
	      Composed.push_back(SplatMask[Src]);
	  }

	  return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
	                              Splat->getOperand(0), Splat->getOperand(1),
	                              Composed);
	}

	/// Check whether \p Mask picks exactly one element from the first shuffle
	/// operand while every other lane passes the second operand through in
	/// place. On a match, return the mask position that reads from the first
	/// operand; otherwise return -1.
	static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
	  const int NumLanes = Mask.size();
	  int Op0Lane = -1;
	  // TODO: This does not match if there are undef elements in the shuffle mask.
	  // Should we ignore undefs in the shuffle mask instead? The trade-off is
	  // removing an instruction (a shuffle), but losing the knowledge that some
	  // vector lanes are not needed.
	  for (int Lane = 0; Lane < NumLanes; ++Lane) {
	    const int M = Mask[Lane];
	    const bool FromOp0 = M >= 0 && M < NumLanes;
	    if (!FromOp0) {
	      // Elements of operand 1 must stay in their own lane.
	      if (M != Lane + NumLanes)
	        return -1;
	      continue;
	    }
	    // Only a single element may come from operand 0.
	    if (Op0Lane != -1)
	      return -1;
	    Op0Lane = Lane;
	  }
	  return Op0Lane;
	}

	/// Replace a shuffle that moves exactly one element of one operand into the
	/// other operand by a plain insertelement, provided that element is available
	/// as a scalar (i.e. it was itself produced by an insertelement).
	static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
	                                      SelectionDAG &DAG) {
	  // Step 1: is one element of a vector being shuffled into another vector?
	  ArrayRef<int> Mask = Shuf->getMask();
	  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
	  SDValue InsVec = Shuf->getOperand(0);
	  SDValue PassThru = Shuf->getOperand(1);

	  int Lane = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
	  if (Lane == -1) {
	    // Retry with the operands (and therefore the mask) commuted.
	    ShuffleVectorSDNode::commuteMask(CommutedMask);
	    Lane = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
	    if (Lane == -1)
	      return SDValue();
	    // Keep the operands in sync with the commuted mask.
	    std::swap(InsVec, PassThru);
	    Mask = CommutedMask;
	  }

	  // Step 2: exactly one element of InsVec lands in PassThru. See whether that
	  // element is reachable as a scalar through a real insert element node.
	  // TODO: We can try harder to locate the element as a scalar. Examples: it
	  // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
	  assert(Mask[Lane] >= 0 && Mask[Lane] < (int)Mask.size() &&
	         "Shuffle mask value must be from operand 0");
	  if (InsVec.getOpcode() != ISD::INSERT_VECTOR_ELT)
	    return SDValue();

	  // The insert must use a constant index naming the very element the shuffle
	  // selects.
	  SDValue OldIdx = InsVec.getOperand(2);
	  auto *OldIdxC = dyn_cast<ConstantSDNode>(OldIdx);
	  if (!OldIdxC)
	    return SDValue();
	  if (OldIdxC->getSExtValue() != Mask[Lane])
	    return SDValue();

	  // An insertelement with a constant index already exists, so a replacement
	  // that differs at most in that constant needs no extra legality or
	  // profitability check: the target should lower both alike, and otherwise
	  // legalization expands this to a scalar-to-vector plus shuffle.
	  //
	  // The shuffle may relocate the scalar, so the new insert uses the shuffle's
	  // mask position rather than the original insert index.
	  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
	  SDLoc DL(Shuf);
	  SDValue NewIdx = DAG.getConstant(Lane, DL, OldIdx.getValueType());
	  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsVec.getValueType(),
	                     PassThru, InsVec.getOperand(1), NewIdx);
	}

	/// Combine a VECTOR_SHUFFLE node.
	///
	/// Tries, in order, a series of canonicalizations and folds (undef operands,
	/// identical operands, splats, extend/truncate patterns, concat and scalar
	/// sources, and merging with an inner shuffle) and returns the first
	/// replacement value produced. Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
	EVT VT = N->getValueType(0);
	unsigned NumElts = VT.getVectorNumElements();

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");

	// Canonicalize shuffle undef, undef -> undef
	if (N0.isUndef() && N1.isUndef())
	return DAG.getUNDEF(VT);

	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

	// Canonicalize shuffle v, v -> v, undef
	if (N0 == N1) {
	SmallVector<int, 8> NewMask;
	for (unsigned i = 0; i != NumElts; ++i) {
	int Idx = SVN->getMaskElt(i);
	// Indices >= NumElts select from the second operand; both operands are the
	// same value here, so redirect them to the first operand.
	if (Idx >= (int)NumElts) Idx -= NumElts;
	NewMask.push_back(Idx);
	}
	return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
	}

	// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
	if (N0.isUndef())
	return DAG.getCommutedVectorShuffle(*SVN);

	// Remove references to rhs if it is undef
	if (N1.isUndef()) {
	bool Changed = false;
	SmallVector<int, 8> NewMask;
	for (unsigned i = 0; i != NumElts; ++i) {
	int Idx = SVN->getMaskElt(i);
	if (Idx >= (int)NumElts) {
	Idx = -1;
	Changed = true;
	}
	NewMask.push_back(Idx);
	}
	// Only build a new node when at least one mask element was rewritten.
	if (Changed)
	return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
	}

	// Simplify shuffle mask if a referenced element is UNDEF.
	if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
	return V;

	if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
	return InsElt;

	// A shuffle of a single vector that is a splat can always be folded.
	if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
	if (N1->isUndef() && N0Shuf->isSplat())
	return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);

	// If it is a splat, check if the argument vector is another splat or a
	// build_vector.
	if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
	SDNode *V = N0.getNode();

	// If this is a bit convert that changes the element type of the vector but
	// not the number of vector elements, look through it. Be careful not to
	// look though conversions that change things like v4f32 to v2f64.
	if (V->getOpcode() == ISD::BITCAST) {
	SDValue ConvInput = V->getOperand(0);
	if (ConvInput.getValueType().isVector() &&
	ConvInput.getValueType().getVectorNumElements() == NumElts)
	V = ConvInput.getNode();
	}

	if (V->getOpcode() == ISD::BUILD_VECTOR) {
	assert(V->getNumOperands() == NumElts &&
	"BUILD_VECTOR has wrong number of operands");
	SDValue Base;
	bool AllSame = true;
	// Find the first non-undef operand; it is the reference value for the
	// "all operands identical" test below.
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!V->getOperand(i).isUndef()) {
	Base = V->getOperand(i);
	break;
	}
	}
	// Splat of <u, u, u, u>, return <u, u, u, u>
	if (!Base.getNode())
	return N0;
	for (unsigned i = 0; i != NumElts; ++i) {
	if (V->getOperand(i) != Base) {
	AllSame = false;
	break;
	}
	}
	// Splat of <x, x, x, x>, return <x, x, x, x>
	if (AllSame)
	return N0;

	// Canonicalize any other splat as a build_vector.
	const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
	SmallVector<SDValue, 8> Ops(NumElts, Splatted);
	SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);

	// We may have jumped through bitcasts, so the type of the
	// BUILD_VECTOR may not match the type of the shuffle.
	if (V->getValueType(0) != VT)
	NewBV = DAG.getBitcast(VT, NewBV);
	return NewBV;
	}
	}

	// There are various patterns used to build up a vector from smaller vectors,
	// subvectors, or elements. Scan chains of these and replace unused insertions
	// or components with undef.
	if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
	return S;

	// Match shuffles that can be converted to any_vector_extend_in_reg.
	if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))
	return V;

	// Combine "truncate_vector_in_reg" style shuffles.
	if (SDValue V = combineTruncationShuffle(SVN, DAG))
	return V;

	// Shuffle of one CONCAT_VECTORS (with an undef second operand), or of two
	// CONCAT_VECTORS whose pieces have the same type: hand over to
	// partitionShuffleOfConcats, but only before vector-op legalization.
	if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
	Level < AfterLegalizeVectorOps &&
	(N1.isUndef() \|\|
	(N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
	if (SDValue V = partitionShuffleOfConcats(N, DAG))
	return V;
	}

	// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
	// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
	if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
	if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
	return Res;

	// If this shuffle only has a single input that is a bitcasted shuffle,
	// attempt to merge the 2 shuffles and suitably bitcast the inputs/output
	// back to their original types.
	if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
	N1.isUndef() && Level < AfterLegalizeVectorOps &&
	TLI.isTypeLegal(VT)) {

	// Peek through the bitcast only if there is one user.
	SDValue BC0 = N0;
	while (BC0.getOpcode() == ISD::BITCAST) {
	if (!BC0.hasOneUse())
	break;
	BC0 = BC0.getOperand(0);
	}

	// Rewrite a mask for elements that are Scale times narrower: each mask
	// entry M becomes the Scale consecutive entries Scale*M .. Scale*M+Scale-1,
	// and undef (negative) entries expand to Scale undef entries.
	auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
	if (Scale == 1)
	return SmallVector<int, 8>(Mask.begin(), Mask.end());

	SmallVector<int, 8> NewMask;
	for (int M : Mask)
	for (int s = 0; s != Scale; ++s)
	NewMask.push_back(M < 0 ? -1 : Scale * M + s);
	return NewMask;
	};

	if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
	EVT SVT = VT.getScalarType();
	EVT InnerVT = BC0->getValueType(0);
	EVT InnerSVT = InnerVT.getScalarType();

	// Determine which shuffle works with the smaller scalar type.
	EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
	EVT ScaleSVT = ScaleVT.getScalarType();

	// Both scalar sizes must be whole multiples of the smaller scalar size so
	// that both masks can be expressed in units of it.
	if (TLI.isTypeLegal(ScaleVT) &&
	0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
	0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
	int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
	int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();

	// Scale the shuffle masks to the smaller scalar type.
	ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
	SmallVector<int, 8> InnerMask =
	ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
	SmallVector<int, 8> OuterMask =
	ScaleShuffleMask(SVN->getMask(), OuterScale);

	// Merge the shuffle masks.
	SmallVector<int, 8> NewMask;
	for (int M : OuterMask)
	NewMask.push_back(M < 0 ? -1 : InnerMask[M]);

	// Test for shuffle mask legality over both commutations.
	SDValue SV0 = BC0->getOperand(0);
	SDValue SV1 = BC0->getOperand(1);
	bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
	if (!LegalMask) {
	std::swap(SV0, SV1);
	ShuffleVectorSDNode::commuteMask(NewMask);
	LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
	}

	if (LegalMask) {
	SV0 = DAG.getBitcast(ScaleVT, SV0);
	SV1 = DAG.getBitcast(ScaleVT, SV1);
	return DAG.getBitcast(
	VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
	}
	}
	}
	}

	// Canonicalize shuffles according to rules:
	// shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
	// shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
	// shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
	if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
	N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
	TLI.isTypeLegal(VT)) {
	// The incoming shuffle must be of the same type as the result of the
	// current shuffle.
	assert(N1->getOperand(0).getValueType() == VT &&
	"Shuffle types don't match");

	SDValue SV0 = N1->getOperand(0);
	SDValue SV1 = N1->getOperand(1);
	bool HasSameOp0 = N0 == SV0;
	bool IsSV1Undef = SV1.isUndef();
	if (HasSameOp0 \|\| IsSV1Undef \|\| N0 == SV1)
	// Commute the operands of this shuffle so that next rule
	// will trigger.
	return DAG.getCommutedVectorShuffle(*SVN);
	}

	// Try to fold according to rules:
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
	// Don't try to fold shuffles with illegal type.
	// Only fold if this shuffle is the only user of the other shuffle.
	if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
	Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
	ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);

	// Don't try to fold splats; they're likely to simplify somehow, or they
	// might be free.
	if (OtherSV->isSplat())
	return SDValue();

	// The incoming shuffle must be of the same type as the result of the
	// current shuffle.
	assert(OtherSV->getOperand(0).getValueType() == VT &&
	"Shuffle types don't match");

	// SV0/SV1 start out empty and are assigned lazily, in the order in which
	// distinct non-undef source vectors are encountered while walking the mask.
	SDValue SV0, SV1;
	SmallVector<int, 4> Mask;
	// Compute the combined shuffle mask for a shuffle with SV0 as the first
	// operand, and SV1 as the second operand.
	for (unsigned i = 0; i != NumElts; ++i) {
	int Idx = SVN->getMaskElt(i);
	if (Idx < 0) {
	// Propagate Undef.
	Mask.push_back(Idx);
	continue;
	}

	SDValue CurrentVec;
	if (Idx < (int)NumElts) {
	// This shuffle index refers to the inner shuffle N0. Lookup the inner
	// shuffle mask to identify which vector is actually referenced.
	Idx = OtherSV->getMaskElt(Idx);
	if (Idx < 0) {
	// Propagate Undef.
	Mask.push_back(Idx);
	continue;
	}

	CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
	: OtherSV->getOperand(1);
	} else {
	// This shuffle index references an element within N1.
	CurrentVec = N1;
	}

	// Simple case where 'CurrentVec' is UNDEF.
	if (CurrentVec.isUndef()) {
	Mask.push_back(-1);
	continue;
	}

	// Canonicalize the shuffle index. We don't know yet if CurrentVec
	// will be the first or second operand of the combined shuffle.
	Idx = Idx % NumElts;
	if (!SV0.getNode() \|\| SV0 == CurrentVec) {
	// Ok. CurrentVec is the left hand side.
	// Update the mask accordingly.
	SV0 = CurrentVec;
	Mask.push_back(Idx);
	continue;
	}

	// Bail out if we cannot convert the shuffle pair into a single shuffle.
	if (SV1.getNode() && SV1 != CurrentVec)
	return SDValue();

	// Ok. CurrentVec is the right hand side.
	// Update the mask accordingly.
	SV1 = CurrentVec;
	Mask.push_back(Idx + NumElts);
	}

	// Check if all indices in Mask are Undef. In case, propagate Undef.
	bool isUndefMask = true;
	for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
	isUndefMask &= Mask[i] < 0;

	if (isUndefMask)
	return DAG.getUNDEF(VT);

	// Any operand slot that was never assigned is filled with undef.
	if (!SV0.getNode())
	SV0 = DAG.getUNDEF(VT);
	if (!SV1.getNode())
	SV1 = DAG.getUNDEF(VT);

	// Avoid introducing shuffles with illegal mask.
	if (!TLI.isShuffleMaskLegal(Mask, VT)) {
	ShuffleVectorSDNode::commuteMask(Mask);

	if (!TLI.isShuffleMaskLegal(Mask, VT))
	return SDValue();

	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
	std::swap(SV0, SV1);
	}

	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
	return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
	}

	return SDValue();
	}

	/// Combine a SCALAR_TO_VECTOR node whose scalar operand is an
	/// EXTRACT_VECTOR_ELT with a constant index: either make an implicit
	/// truncate of the scalar explicit, or replace the extract/insert pair by a
	/// shuffle of the source vector (followed by EXTRACT_SUBVECTOR when the
	/// result has fewer elements). Returns an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
	  SDValue InVal = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
	  // with a VECTOR_SHUFFLE and possible truncate.
	  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    SDValue InVec = InVal->getOperand(0);
	    SDValue EltNo = InVal->getOperand(1);
	    auto InVecT = InVec.getValueType();
	    if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
	      // Mask for a shuffle of InVec: lane 0 reads the extracted element,
	      // every other lane is undef.
	      SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
	      int Elt = C0->getZExtValue();
	      NewMask[0] = Elt;
	      SDValue Val;
	      // If the extracted integer scalar has a different type than the
	      // result's element type (an implicit truncate), make the truncate
	      // explicit, as long as the result's scalar type is legal.
	      if (VT.getScalarType() != InVal.getValueType() &&
	          InVal.getValueType().isScalarInteger() &&
	          isTypeLegal(VT.getScalarType())) {
	        Val =
	            DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
	        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
	      }
	      // The shuffle below is created with type InVecT and NewMask has
	      // InVecT's element count, so the legality query must use InVecT as
	      // well rather than the (possibly narrower) result type VT.
	      if (VT.getScalarType() == InVecT.getScalarType() &&
	          VT.getVectorNumElements() <= InVecT.getVectorNumElements() &&
	          TLI.isShuffleMaskLegal(NewMask, InVecT)) {
	        Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec,
	                                   DAG.getUNDEF(InVecT), NewMask);
	        // If the initial vector is the correct size this shuffle is a
	        // valid result.
	        if (VT == InVecT)
	          return Val;
	        // If not we must truncate the vector.
	        if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
	          MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	          SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
	          EVT SubVT =
	              EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
	                               VT.getVectorNumElements());
	          Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val,
	                            ZeroIdx);
	          return Val;
	        }
	      }
	    }
	  }

	  return SDValue();
	}

	/// Combine an INSERT_SUBVECTOR node. Operand 0 (N0) is the vector being
	/// inserted into, operand 1 (N1) is the subvector and operand 2 (N2) is the
	/// insertion index. Returns the replacement value, or an empty SDValue when
	/// no fold applies.
	SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);

	  // If inserting an UNDEF, just return the original vector.
	  if (N1.isUndef())
	    return N0;

	  // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow
	  // us to pull BITCASTs from input to output.
	  if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR)
	    if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode()))
	      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2);

	  // If this is an insert of an extracted vector into an undef vector, we can
	  // just use the input to the extract.
	  if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	      N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
	    return N1.getOperand(0);

	  // If we are inserting a bitcast value into an undef, with the same
	  // number of elements, just use the bitcast input of the extract.
	  // i.e. INSERT_SUBVECTOR UNDEF (BITCAST (EXTRACT_SUBVECTOR X N2)) N2 ->
	  // BITCAST X
	  if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
	      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	      N1.getOperand(0).getOperand(1) == N2) {
	    SDValue ExtSrc = N1.getOperand(0).getOperand(0);
	    EVT ExtSrcVT = ExtSrc.getValueType();
	    // Equal element counts alone do not make X bitcastable to VT: the total
	    // sizes must match too. With both equal the element sizes agree, so the
	    // shared index N2 names the same position in both vectors.
	    if (ExtSrcVT.getVectorNumElements() == VT.getVectorNumElements() &&
	        ExtSrcVT.getSizeInBits() == VT.getSizeInBits())
	      return DAG.getBitcast(VT, ExtSrc);
	  }

	  // If both N0 and N1 are bitcast values on which insert_subvector
	  // would make sense, pull the bitcast through.
	  // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
	  // BITCAST (INSERT_SUBVECTOR N0 N1 N2)
	  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
	    SDValue CN0 = N0.getOperand(0);
	    SDValue CN1 = N1.getOperand(0);
	    EVT CN0VT = CN0.getValueType();
	    EVT CN1VT = CN1.getValueType();
	    // A bitcast source may be a scalar; the element queries below are only
	    // meaningful for vectors.
	    if (CN0VT.isVector() && CN1VT.isVector() &&
	        CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
	        CN0VT.getVectorNumElements() == VT.getVectorNumElements()) {
	      SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
	                                      CN0VT, CN0, CN1, N2);
	      return DAG.getBitcast(VT, NewINSERT);
	    }
	  }

	  // Combine INSERT_SUBVECTORs where we are inserting to the same index.
	  // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
	  // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
	  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
	      N0.getOperand(1).getValueType() == N1.getValueType() &&
	      N0.getOperand(2) == N2)
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
	                       N1, N2);

	  // The remaining folds need a constant insertion index.
	  if (!isa<ConstantSDNode>(N2))
	    return SDValue();

	  unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();

	  // Canonicalize insert_subvector dag nodes.
	  // Example:
	  // (insert_subvector (insert_subvector A, Idx0), Idx1)
	  // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
	  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
	      N1.getValueType() == N0.getOperand(1).getValueType() &&
	      isa<ConstantSDNode>(N0.getOperand(2))) {
	    unsigned OtherIdx = N0.getConstantOperandVal(2);
	    if (InsIdx < OtherIdx) {
	      // Swap nodes.
	      SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
	                                  N0.getOperand(0), N1, N2);
	      AddToWorklist(NewOp.getNode());
	      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
	                         VT, NewOp, N0.getOperand(1), N0.getOperand(2));
	    }
	  }

	  // If the input vector is a concatenation, and the insert replaces
	  // one of the pieces, we can optimize into a single concat_vectors.
	  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
	      N0.getOperand(0).getValueType() == N1.getValueType()) {
	    // Each concat operand covers Factor elements, so the piece being
	    // replaced is the one at position InsIdx / Factor.
	    unsigned Factor = N1.getValueType().getVectorNumElements();

	    SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
	    Ops[InsIdx / Factor] = N1;

	    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
	  }

	  return SDValue();
	}

	/// Combine an FP_TO_FP16 node: a conversion applied directly to the result of
	/// FP16_TO_FP is replaced by that node's operand.
	SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  if (Src->getOpcode() != ISD::FP16_TO_FP)
	    return SDValue();

	  // fold (fp_to_fp16 (fp16_to_fp op)) -> op
	  return Src->getOperand(0);
	}

	/// Combine an FP16_TO_FP node: an AND of the operand with the constant 0xffff
	/// is dropped and the conversion is applied to the AND's first operand.
	SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  if (Src->getOpcode() != ISD::AND)
	    return SDValue();

	  // The AND's second operand must be a non-opaque constant equal to 0xffff.
	  ConstantSDNode *MaskC = getAsNonOpaqueConstant(Src.getOperand(1));
	  if (!MaskC || MaskC->getAPIntValue() != 0xffff)
	    return SDValue();

	  // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
	  return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
	                     Src.getOperand(0));
	}

	/// Try to rewrite an AND with a constant BUILD_VECTOR mask as a
	/// vector_shuffle of the other operand with a zero vector.
	/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
	/// vector_shuffle V, Zero, <0, 4, 2, 4>
	/// Returns an empty SDValue when the transform does not apply.
	SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
	  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

	  EVT VT = N->getValueType(0);
	  SDValue Vec = N->getOperand(0);
	  SDValue MaskVec = peekThroughBitcast(N->getOperand(1));
	  SDLoc DL(N);

	  // Do not run after operation legalization, where vector shuffles may
	  // already have been custom lowered; and the mask must be a BUILD_VECTOR.
	  if (LegalOperations || MaskVec.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  EVT MaskVT = MaskVec.getValueType();
	  unsigned NumElts = MaskVec.getNumOperands();
	  unsigned EltBits = MaskVT.getScalarSizeInBits();
	  bool BigEndian = DAG.getDataLayout().isBigEndian();

	  // Try to build a clear mask after cutting every mask element into 'Split'
	  // equal pieces. Each piece has to be all ones (keep the lane of the input)
	  // or all zeros (take the lane from the zero vector); anything else fails.
	  auto TryClearMask = [&](int Split) {
	    int SubEltCount = NumElts * Split;
	    int SubEltBits = EltBits / Split;

	    SmallVector<int, 8> ShufMask;
	    for (int Sub = 0; Sub != SubEltCount; ++Sub) {
	      SDValue Elt = MaskVec.getOperand(Sub / Split);
	      if (Elt.isUndef()) {
	        ShufMask.push_back(-1);
	        continue;
	      }

	      // Only integer and floating-point constants can be inspected.
	      APInt Bits;
	      if (auto *IntC = dyn_cast<ConstantSDNode>(Elt))
	        Bits = IntC->getAPIntValue();
	      else if (auto *FPC = dyn_cast<ConstantFPSDNode>(Elt))
	        Bits = FPC->getValueAPF().bitcastToAPInt();
	      else
	        return SDValue();

	      // Shift the wanted piece of the constant down to bit 0; which piece
	      // that is depends on the endianness.
	      int Part = Sub % Split;
	      if (BigEndian)
	        Bits.lshrInPlace((Split - Part - 1) * SubEltBits);
	      else
	        Bits.lshrInPlace(Part * SubEltBits);

	      if (Split > 1)
	        Bits = Bits.trunc(SubEltBits);

	      if (Bits.isAllOnesValue()) {
	        ShufMask.push_back(Sub);
	      } else if (Bits == 0) {
	        ShufMask.push_back(Sub + SubEltCount);
	      } else {
	        return SDValue();
	      }
	    }

	    // Ask the target whether it supports this vector_shuffle.
	    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), SubEltBits);
	    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, SubEltCount);
	    if (!TLI.isVectorClearMaskLegal(ShufMask, ClearVT))
	      return SDValue();

	    SDValue Zero = DAG.getConstant(0, DL, ClearVT);
	    SDValue Shuf = DAG.getVectorShuffle(ClearVT, DL,
	                                        DAG.getBitcast(ClearVT, Vec), Zero,
	                                        ShufMask);
	    return DAG.getBitcast(VT, Shuf);
	  };

	  // The finest split considered is byte-level masking.
	  int MaxSplit = 1;
	  if (EltBits % 8 == 0)
	    MaxSplit = EltBits / 8;

	  // Try increasingly fine splits that divide the element size evenly.
	  for (int Split = 1; Split <= MaxSplit; ++Split) {
	    if (EltBits % Split != 0)
	      continue;
	    if (SDValue Shuf = TryClearMask(Split))
	      return Shuf;
	  }

	  return SDValue();
	}

	/// Simplify a binary operation on vectors (e.g. ADD): constant-fold it when
	/// possible, or hoist it above two matching single-input shuffles. Returns
	/// an empty SDValue when nothing applies.
	SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
	  assert(N->getValueType(0).isVector() &&
	         "SimplifyVBinOp only works on vectors!");

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SDValue Ops[] = {Op0, Op1};

	  // First try to constant fold the whole vector operation.
	  if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
	          N->getOpcode(), SDLoc(Op0), Op0.getValueType(), Ops, N->getFlags()))
	    return Fold;

	  // Type legalization might introduce new shuffles in the DAG.
	  // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
	  // -> (shuffle (VBinOp (A, B)), Undef, Mask).
	  if (!LegalTypes)
	    return SDValue();

	  auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Op0);
	  auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(Op1);
	  if (!Shuf0 || !Shuf1)
	    return SDValue();

	  // Both shuffles must have this node as their only use and an undef second
	  // operand.
	  if (!Op0.hasOneUse() || !Op1.hasOneUse())
	    return SDValue();
	  if (!Op0.getOperand(1).isUndef() || !Op1.getOperand(1).isUndef())
	    return SDValue();

	  // Both shuffles must apply the same mask.
	  if (!Shuf0->getMask().equals(Shuf1->getMask()))
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  SDValue UndefVector = Op0.getOperand(1);
	  SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
	                                 Op0.getOperand(0), Op1.getOperand(0),
	                                 N->getFlags());
	  AddUsersToWorklist(N);
	  return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
	                              Shuf0->getMask());
	}

	/// Simplify a SELECT whose condition \p N0 is a SETCC by forwarding the
	/// compare operands and condition code to SimplifySelectCC. A SELECT_CC
	/// result is split back into a SETCC plus SELECT, since the caller started
	/// from a SELECT; any other result is returned unchanged. Returns an empty
	/// SDValue when SimplifySelectCC finds nothing.
	SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
	                                    SDValue N2) {
	  assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");

	  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	  SDValue SCC =
	      SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2, CC);

	  // Nothing was simplified.
	  if (!SCC.getNode())
	    return SDValue();

	  // Anything other than a select_cc (like fabs) is returned as is.
	  if (SCC.getOpcode() != ISD::SELECT_CC)
	    return SCC;

	  // Turn the select_cc into a new SETCC node feeding a new SELECT node.
	  SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(),
	                              SCC.getOperand(0), SCC.getOperand(1),
	                              SCC.getOperand(4));
	  AddToWorklist(SETCC.getNode());
	  return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
	                       SCC.getOperand(2), SCC.getOperand(3));
	}

	/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
	/// being selected between, see if we can simplify the select. Callers of this
	/// should assume that TheSelect is deleted if this returns true. As such, they
	/// should return the appropriate thing (e.g. the node) back to the top-level of
	/// the DAG combiner loop to avoid it being looked at.
	bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
	SDValue RHS) {
	// fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
	// The select + setcc is redundant, because fsqrt returns NaN for X < 0.
	if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
	if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
	// We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
	SDValue Sqrt = RHS;
	ISD::CondCode CC;
	SDValue CmpLHS;
	const ConstantFPSDNode *Zero = nullptr;

	if (TheSelect->getOpcode() == ISD::SELECT_CC) {
	CC = dyn_cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
	CmpLHS = TheSelect->getOperand(0);
	Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
	} else {
	// SELECT or VSELECT
	SDValue Cmp = TheSelect->getOperand(0);
	if (Cmp.getOpcode() == ISD::SETCC) {
	CC = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
	CmpLHS = Cmp.getOperand(0);
	Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
	}
	}
	if (Zero && Zero->isZero() &&
	Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT \|\|
	CC == ISD::SETULT \|\| CC == ISD::SETLT)) {
	// We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
	CombineTo(TheSelect, Sqrt);
	return true;
	}
	}
	}
	// Cannot simplify select with vector condition
	if (TheSelect->getOperand(0).getValueType().isVector()) return false;

	// If this is a select from two identical things, try to pull the operation
	// through the select.
	if (LHS.getOpcode() != RHS.getOpcode() \|\|
	!LHS.hasOneUse() \|\| !RHS.hasOneUse())
	return false;

	// If this is a load and the token chain is identical, replace the select
	// of two loads with a load through a select of the address to load from.
	// This triggers in things like "select bool X, 10.0, 123.0" after the FP
	// constants have been dropped into the constant pool.
	if (LHS.getOpcode() == ISD::LOAD) {
	LoadSDNode *LLD = cast<LoadSDNode>(LHS);
	LoadSDNode *RLD = cast<LoadSDNode>(RHS);

	// Token chains must be identical.
	if (LHS.getOperand(0) != RHS.getOperand(0) \|\|
	// Do not let this transformation reduce the number of volatile loads.
	LLD->isVolatile() \|\| RLD->isVolatile() \|\|
	// FIXME: If either is a pre/post inc/dec load,
	// we'd need to split out the address adjustment.
	LLD->isIndexed() \|\| RLD->isIndexed() \|\|
	// If this is an EXTLOAD, the VT's must match.
	LLD->getMemoryVT() != RLD->getMemoryVT() \|\|
	// If this is an EXTLOAD, the kind of extension must match.
	(LLD->getExtensionType() != RLD->getExtensionType() &&
	// The only exception is if one of the extensions is anyext.
	LLD->getExtensionType() != ISD::EXTLOAD &&
	RLD->getExtensionType() != ISD::EXTLOAD) \|\|
	// FIXME: this discards src value information. This is
	// over-conservative. It would be beneficial to be able to remember
	// both potential memory locations. Since we are discarding
	// src value info, don't do the transformation if the memory
	// locations are not in the default address space.
	LLD->getPointerInfo().getAddrSpace() != 0 \|\|
	RLD->getPointerInfo().getAddrSpace() != 0 \|\|
	!TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
	LLD->getBasePtr().getValueType()))
	return false;

	// Check that the select condition doesn't reach either load. If so,
	// folding this will induce a cycle into the DAG. If not, this is safe to
	// xform, so create a select of the addresses.
	SDValue Addr;
	if (TheSelect->getOpcode() == ISD::SELECT) {
	SDNode *CondNode = TheSelect->getOperand(0).getNode();
	if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) \|\|
	(RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
	return false;
	// The loads must not depend on one another.
	if (LLD->isPredecessorOf(RLD) \|\|
	RLD->isPredecessorOf(LLD))
	return false;
	Addr = DAG.getSelect(SDLoc(TheSelect),
	LLD->getBasePtr().getValueType(),
	TheSelect->getOperand(0), LLD->getBasePtr(),
	RLD->getBasePtr());
	} else { // Otherwise SELECT_CC
	SDNode *CondLHS = TheSelect->getOperand(0).getNode();
	SDNode *CondRHS = TheSelect->getOperand(1).getNode();

	if ((LLD->hasAnyUseOfValue(1) &&
	(LLD->isPredecessorOf(CondLHS) \|\| LLD->isPredecessorOf(CondRHS))) \|\|
	(RLD->hasAnyUseOfValue(1) &&
	(RLD->isPredecessorOf(CondLHS) \|\| RLD->isPredecessorOf(CondRHS))))
	return false;

	Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
	LLD->getBasePtr().getValueType(),
	TheSelect->getOperand(0),
	TheSelect->getOperand(1),
	LLD->getBasePtr(), RLD->getBasePtr(),
	TheSelect->getOperand(4));
	}

	SDValue Load;
	// It is safe to replace the two loads if they have different alignments,
	// but the new load must be the minimum (most restrictive) alignment of the
	// inputs.
	unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
	MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
	if (!RLD->isInvariant())
	MMOFlags &= ~MachineMemOperand::MOInvariant;
	if (!RLD->isDereferenceable())
	MMOFlags &= ~MachineMemOperand::MODereferenceable;
	if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
	// FIXME: Discards pointer and AA info.
	Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
	LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
	MMOFlags);
	} else {
	// FIXME: Discards pointer and AA info.
	Load = DAG.getExtLoad(
	LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
	: LLD->getExtensionType(),
	SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
	MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
	}

	// Users of the select now use the result of the load.
	CombineTo(TheSelect, Load);

	// Users of the old loads now use the new load's chain. We know the
	// old-load value is dead now.
	CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
	CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
	return true;
	}

	return false;
	}

	/// Attempt to rewrite (N0 cond N1) ? N2 : N3 as a shift of N0 combined with
	/// N2 through a bitwise 'and'. An empty SDValue is returned whenever the
	/// pattern does not match.
	SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
	                                            SDValue N1, SDValue N2, SDValue N3,
	                                            ISD::CondCode CC) {
	  // The "gzip trick" needs a zero false operand and a compare that tests the
	  // sign bit:
	  //   select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
	  //   select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
	  const EVT XType = N0.getValueType();
	  const EVT AType = N2.getValueType();
	  if (!(isNullConstant(N3) && XType.bitsGE(AType)))
	    return SDValue();

	  // A "greater than" test requires inverting the sign-bit mask, so that form
	  // is only accepted when the target reports an 'and not' for N2 (the invert
	  // is then free).
	  const bool InvertMask = (CC == ISD::SETGT);
	  if (InvertMask && TLI.hasAndNot(N2)) {
	    // Accept (X > -1) ? A : 0
	    //     or (X > 0) ? X : 0   <-- canonical signed max.
	    if (!isAllOnesConstant(N1) && !(isNullConstant(N1) && N0 == N2))
	      return SDValue();
	  } else if (CC == ISD::SETLT) {
	    // Accept (X < 0) ? A : 0
	    //     or (X < 1) ? X : 0   <-- un-canonicalized signed min.
	    if (!isNullConstant(N1) && !(isOneConstant(N1) && N0 == N2))
	      return SDValue();
	  } else {
	    return SDValue();
	  }

	  // Default form: smear the sign bit across the value with an arithmetic
	  // shift right by size(X)-1.
	  const EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
	  unsigned ShiftOpc = ISD::SRA;
	  unsigned ShiftBy = XType.getSizeInBits() - 1;

	  // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
	  // constant: a logical shift that drops the sign bit onto A's set bit is
	  // enough.
	  auto *AConst = dyn_cast<ConstantSDNode>(N2.getNode());
	  if (AConst &&
	      ((AConst->getAPIntValue() & (AConst->getAPIntValue() - 1)) == 0)) {
	    ShiftOpc = ISD::SRL;
	    ShiftBy = XType.getSizeInBits() - AConst->getAPIntValue().logBase2() - 1;
	  }

	  SDValue ShiftAmt = DAG.getConstant(ShiftBy, DL, ShiftAmtTy);
	  SDValue Mask = DAG.getNode(ShiftOpc, DL, XType, N0, ShiftAmt);
	  AddToWorklist(Mask.getNode());

	  // Narrow the mask to the result type when X is wider than A.
	  if (XType.bitsGT(AType)) {
	    Mask = DAG.getNode(ISD::TRUNCATE, DL, AType, Mask);
	    AddToWorklist(Mask.getNode());
	  }

	  if (InvertMask)
	    Mask = DAG.getNOT(DL, Mask, AType);

	  return DAG.getNode(ISD::AND, DL, AType, Mask, N2);
	}

	/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
	/// where 'cond' is the comparison specified by CC.
	///
	/// The folds below are tried in order and the first one that matches wins.
	/// When NotExtCompare is set, the "select C, 1, 0" case is not turned into a
	/// zero-extended setcc. Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
	SDValue N2, SDValue N3, ISD::CondCode CC,
	bool NotExtCompare) {
	// (x ? y : y) -> y.
	if (N2 == N3) return N2;

	EVT VT = N2.getValueType();
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
	ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());

	// Determine if the condition we're dealing with is constant
	SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
	N0, N1, CC, DL, false);
	if (SCC.getNode()) AddToWorklist(SCC.getNode());

	if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
	// fold select_cc true, x, y -> x
	// fold select_cc false, x, y -> y
	return !SCCC->isNullValue() ? N2 : N3;
	}

	// Check to see if we can simplify the select into an fabs node
	if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
	// Allow either -0.0 or 0.0
	if (CFP->isZero()) {
	// select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
	if ((CC == ISD::SETGE \|\| CC == ISD::SETGT) &&
	N0 == N2 && N3.getOpcode() == ISD::FNEG &&
	N2 == N3.getOperand(0))
	return DAG.getNode(ISD::FABS, DL, VT, N0);

	// select (setl[te] X, +/-0.0), fneg(X), X -> fabs
	if ((CC == ISD::SETLT \|\| CC == ISD::SETLE) &&
	N0 == N3 && N2.getOpcode() == ISD::FNEG &&
	N2.getOperand(0) == N3)
	return DAG.getNode(ISD::FABS, DL, VT, N3);
	}
	}

	// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
	// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
	// in it. This is a win when the constant is not otherwise available because
	// it replaces two constant pool loads with one. We only do this if the FP
	// type is known to be legal, because if it isn't, then we are before legalize
	// types and we want the other legalization to happen first (e.g. to avoid
	// messing with soft float) and if the ConstantFP is not legal, because if
	// it is legal, we may not need to store the FP constant in a constant pool.
	if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
	if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
	if (TLI.isTypeLegal(N2.getValueType()) &&
	(TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
	TargetLowering::Legal &&
	!TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) &&
	!TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) &&
	// If both constants have multiple uses, then we won't need to do an
	// extra load, they are likely around in registers for other users.
	(TV->hasOneUse() \|\| FV->hasOneUse())) {
	// The false value is element 0 and the true value is element 1, matching
	// the offset select built below (true -> EltSize, false -> 0).
	Constant *Elts[] = {
	const_cast<ConstantFP*>(FV->getConstantFPValue()),
	const_cast<ConstantFP*>(TV->getConstantFPValue())
	};
	Type *FPTy = Elts[0]->getType();
	const DataLayout &TD = DAG.getDataLayout();

	// Create a ConstantArray of the two constants.
	Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
	SDValue CPIdx =
	DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
	TD.getPrefTypeAlignment(FPTy));
	unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();

	// Get the offsets to the 0 and 1 element of the array so that we can
	// select between them.
	SDValue Zero = DAG.getIntPtrConstant(0, DL);
	unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
	SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));

	SDValue Cond = DAG.getSetCC(DL,
	getSetCCResultType(N0.getValueType()),
	N0, N1, CC);
	AddToWorklist(Cond.getNode());
	SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
	Cond, One, Zero);
	AddToWorklist(CstOffset.getNode());
	CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
	CstOffset);
	AddToWorklist(CPIdx.getNode());
	return DAG.getLoad(
	TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	Alignment);
	}
	}

	// Sign-bit tests with a zero false operand become a shift plus 'and'.
	if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
	return V;

	// fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
	// where y has a single bit set.
	// A plaintext description would be, we can turn the SELECT_CC into an AND
	// when the condition can be materialized as an all-ones register. Any
	// single bit-test can be materialized as an all-ones register with
	// shift-left and shift-right-arith.
	if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
	N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
	SDValue AndLHS = N0->getOperand(0);
	ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
	// Shift the tested bit over the sign bit.
	const APInt &AndMask = ConstAndRHS->getAPIntValue();
	SDValue ShlAmt =
	DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
	getShiftAmountTy(AndLHS.getValueType()));
	SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);

	// Now arithmetic right shift it all the way over, so the result is either
	// all-ones, or zero.
	SDValue ShrAmt =
	DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl),
	getShiftAmountTy(Shl.getValueType()));
	SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);

	return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
	}
	}

	// fold select C, 16, 0 -> shl C, 4
	// Only done when the target's boolean contents for N0's type are zero-or-one.
	if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() &&
	TLI.getBooleanContents(N0.getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent) {

	// If the caller doesn't want us to simplify this into a zext of a compare,
	// don't do it.
	if (NotExtCompare && N2C->isOne())
	return SDValue();

	// Get a SetCC of the condition
	// NOTE: Don't create a SETCC if it's not legal on this target.
	if (!LegalOperations \|\|
	TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) {
	SDValue Temp, SCC;
	// cast from setcc result type to select result type
	if (LegalTypes) {
	SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()),
	N0, N1, CC);
	if (N2.getValueType().bitsLT(SCC.getValueType()))
	Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2),
	N2.getValueType());
	else
	Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
	N2.getValueType(), SCC);
	} else {
	// Types are not yet required to be legal, so an i1 setcc is used.
	SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
	Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
	N2.getValueType(), SCC);
	}

	AddToWorklist(SCC.getNode());
	AddToWorklist(Temp.getNode());

	// select C, 1, 0 is just the zero-extended compare; no shift needed.
	if (N2C->isOne())
	return Temp;

	// shl setcc result by log2 n2c
	return DAG.getNode(
	ISD::SHL, DL, N2.getValueType(), Temp,
	DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp),
	getShiftAmountTy(Temp.getValueType())));
	}
	}

	// Check to see if this is an integer abs.
	// select_cc setg[te] X, 0, X, -X ->
	// select_cc setgt X, -1, X, -X ->
	// select_cc setl[te] X, 0, -X, X ->
	// select_cc setlt X, 1, -X, X ->
	// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
	if (N1C) {
	// SubC is the left operand of the matched (sub C, X); the fold requires it
	// to be the constant zero so that the SUB is a negation of X.
	ConstantSDNode *SubC = nullptr;
	if (((N1C->isNullValue() && (CC == ISD::SETGT \|\| CC == ISD::SETGE)) \|\|
	(N1C->isAllOnesValue() && CC == ISD::SETGT)) &&
	N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1))
	SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0));
	else if (((N1C->isNullValue() && (CC == ISD::SETLT \|\| CC == ISD::SETLE)) \|\|
	(N1C->isOne() && CC == ISD::SETLT)) &&
	N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
	SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));

	EVT XType = N0.getValueType();
	if (SubC && SubC->isNullValue() && XType.isInteger()) {
	// Note: this DL (located at N0) shadows the DL parameter for this fold.
	SDLoc DL(N0);
	SDValue Shift = DAG.getNode(ISD::SRA, DL, XType,
	N0,
	DAG.getConstant(XType.getSizeInBits() - 1, DL,
	getShiftAmountTy(N0.getValueType())));
	SDValue Add = DAG.getNode(ISD::ADD, DL,
	XType, N0, Shift);
	AddToWorklist(Shift.getNode());
	AddToWorklist(Add.getNode());
	return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
	}
	}

	// select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
	// select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
	// select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
	// select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
	// select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
	// select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
	// select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
	// select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
	if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {
	SDValue ValueOnZero = N2;
	SDValue Count = N3;
	// If the condition is NE instead of E, swap the operands.
	if (CC == ISD::SETNE)
	std::swap(ValueOnZero, Count);
	// Check if the value on zero is a constant equal to the bits in the type.
	if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
	if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
	// If the other operand is cttz/cttz_zero_undef of N0, and cttz is
	// legal, combine to just cttz.
	if ((Count.getOpcode() == ISD::CTTZ \|\|
	Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
	N0 == Count.getOperand(0) &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::CTTZ, VT)))
	return DAG.getNode(ISD::CTTZ, DL, VT, N0);
	// If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
	// legal, combine to just ctlz.
	if ((Count.getOpcode() == ISD::CTLZ \|\|
	Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
	N0 == Count.getOperand(0) &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::CTLZ, VT)))
	return DAG.getNode(ISD::CTLZ, DL, VT, N0);
	}
	}
	}

	// No fold matched.
	return SDValue();
	}

	/// Forwarding stub for TargetLowering::SimplifySetCC: wraps this combiner's
	/// DAG and current level in a DAGCombinerInfo and hands the request to the
	/// target lowering object.
	SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
	                                   ISD::CondCode Cond, const SDLoc &DL,
	                                   bool foldBooleans) {
	  TargetLowering::DAGCombinerInfo CombineInfo(DAG, Level, false, this);
	  SDValue Simplified =
	      TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, CombineInfo, DL);
	  return Simplified;
	}

	/// For an ISD::SDIV node that divides by a constant, ask the target to build
	/// an equivalent DAG expression that multiplies by a magic number instead.
	/// Returns an empty SDValue when the expansion is not attempted.
	/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
	SDValue DAGCombiner::BuildSDIV(SDNode *N) {
	  // A mul-and-shift expansion is not wanted when optimising for minimum
	  // size.
	  const bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	  if (MinSize)
	    return SDValue();

	  // The divisor must be a constant (or constant splat) ...
	  ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
	  if (!Divisor)
	    return SDValue();

	  // ... and must not be zero.
	  if (Divisor->isNullValue())
	    return SDValue();

	  // Let the target build the sequence, then queue every node it created.
	  std::vector<SDNode *> Created;
	  SDValue Result = TLI.BuildSDIV(N, Divisor->getAPIntValue(), DAG,
	                                 LegalOperations, &Created);

	  for (SDNode *NewNode : Created)
	    AddToWorklist(NewNode);
	  return Result;
	}

	/// For an ISD::SDIV node that divides by a constant power of 2, ask the
	/// target to build an equivalent DAG expression based on right shifting.
	/// Returns an empty SDValue when the divisor is not a usable constant.
	SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
	  // The divisor must be a constant (or constant splat) ...
	  ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
	  if (!Divisor)
	    return SDValue();

	  // ... and must not be zero.
	  if (Divisor->isNullValue())
	    return SDValue();

	  // Let the target build the sequence, then queue every node it created.
	  std::vector<SDNode *> Created;
	  SDValue Result =
	      TLI.BuildSDIVPow2(N, Divisor->getAPIntValue(), DAG, &Created);

	  for (SDNode *NewNode : Created)
	    AddToWorklist(NewNode);
	  return Result;
	}

	/// For an ISD::UDIV node that divides by a constant, ask the target to build
	/// an equivalent DAG expression that multiplies by a magic number instead.
	/// Returns an empty SDValue when the expansion is not attempted.
	/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
	SDValue DAGCombiner::BuildUDIV(SDNode *N) {
	  // A mul-and-shift expansion is not wanted when optimising for minimum
	  // size.
	  const bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	  if (MinSize)
	    return SDValue();

	  // The divisor must be a constant (or constant splat) ...
	  ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
	  if (!Divisor)
	    return SDValue();

	  // ... and must not be zero.
	  if (Divisor->isNullValue())
	    return SDValue();

	  // Let the target build the sequence, then queue every node it created.
	  std::vector<SDNode *> Created;
	  SDValue Result = TLI.BuildUDIV(N, Divisor->getAPIntValue(), DAG,
	                                 LegalOperations, &Created);

	  for (SDNode *NewNode : Created)
	    AddToWorklist(NewNode);
	  return Result;
	}

	/// Build the LogBase2 of a non-null input value V using the identity
	/// LogBase2(V) = (EltBits - 1) - ctlz(V), where EltBits is the scalar bit
	/// width of V's type.
	SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
	  const EVT VT = V.getValueType();
	  const unsigned TopBit = VT.getScalarSizeInBits() - 1;

	  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, VT, V);
	  SDValue TopBitIndex = DAG.getConstant(TopBit, DL, VT);
	  return DAG.getNode(ISD::SUB, DL, VT, TopBitIndex, LeadingZeros);
	}

	/// Build a refined estimate of 1/Op.
	///
	/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
	/// For the reciprocal, we need to find the zero of the function:
	/// F(X) = A X - 1 [which has a zero at X = 1/A]
	/// =>
	/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
	/// does not require additional intermediate precision]
	///
	/// Returns an empty SDValue when no estimate is produced.
	SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
	  if (Level >= AfterLegalizeDAG)
	    return SDValue();

	  // TODO: Handle half and/or extended types?
	  const EVT VT = Op.getValueType();
	  const EVT ScalarVT = VT.getScalarType();
	  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f64)
	    return SDValue();

	  // Nothing to do if estimates are explicitly disabled for this function.
	  MachineFunction &MF = DAG.getMachineFunction();
	  int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
	  if (Enabled == TLI.ReciprocalEstimate::Disabled)
	    return SDValue();

	  // Estimates may be explicitly enabled for this type with a custom number
	  // of refinement steps.
	  int Iterations = TLI.getDivRefinementSteps(VT, MF);
	  SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations);
	  if (!Est)
	    return SDValue();
	  AddToWorklist(Est.getNode());

	  // No refinement requested: the raw estimate is the answer.
	  if (!Iterations)
	    return Est;

	  SDLoc DL(Op);
	  SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);

	  // Newton iterations: Est = Est + Est (1 - Arg * Est)
	  for (int Step = 0; Step < Iterations; ++Step) {
	    SDValue ArgTimesEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
	    AddToWorklist(ArgTimesEst.getNode());

	    SDValue Residual =
	        DAG.getNode(ISD::FSUB, DL, VT, FPOne, ArgTimesEst, Flags);
	    AddToWorklist(Residual.getNode());

	    SDValue Correction = DAG.getNode(ISD::FMUL, DL, VT, Est, Residual, Flags);
	    AddToWorklist(Correction.getNode());

	    Est = DAG.getNode(ISD::FADD, DL, VT, Est, Correction, Flags);
	    AddToWorklist(Est.getNode());
	  }
	  return Est;
	}

	/// Refine a reciprocal-square-root estimate using a single FP constant.
	///
	/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
	/// For the reciprocal sqrt, we need to find the zero of the function:
	/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
	/// =>
	/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
	/// A/2 is therefore computed once, ahead of the iteration loop. When
	/// Reciprocal is false the refined estimate is multiplied by Arg at the end.
	SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
	                                         unsigned Iterations,
	                                         SDNodeFlags Flags, bool Reciprocal) {
	  const EVT VT = Arg.getValueType();
	  SDLoc DL(Arg);
	  SDValue OnePointFive = DAG.getConstantFP(1.5, DL, VT);

	  // 0.5 * Arg is written as (1.5 * Arg - Arg) so that the whole sequence
	  // needs only one FP constant.
	  SDValue ScaledArg = DAG.getNode(ISD::FMUL, DL, VT, OnePointFive, Arg, Flags);
	  AddToWorklist(ScaledArg.getNode());

	  SDValue HalfOfArg = DAG.getNode(ISD::FSUB, DL, VT, ScaledArg, Arg, Flags);
	  AddToWorklist(HalfOfArg.getNode());

	  // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
	  for (unsigned Step = 0; Step != Iterations; ++Step) {
	    SDValue EstSquared = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
	    AddToWorklist(EstSquared.getNode());

	    SDValue HalfArgEstSq =
	        DAG.getNode(ISD::FMUL, DL, VT, HalfOfArg, EstSquared, Flags);
	    AddToWorklist(HalfArgEstSq.getNode());

	    SDValue Factor =
	        DAG.getNode(ISD::FSUB, DL, VT, OnePointFive, HalfArgEstSq, Flags);
	    AddToWorklist(Factor.getNode());

	    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Factor, Flags);
	    AddToWorklist(Est.getNode());
	  }

	  if (Reciprocal)
	    return Est;

	  // A non-reciprocal square root was requested: multiply the result by Arg.
	  Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
	  AddToWorklist(Est.getNode());
	  return Est;
	}

	/// Refine a reciprocal-square-root estimate using two FP constants.
	///
	/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
	/// For the reciprocal sqrt, we need to find the zero of the function:
	/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
	/// =>
	/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
	/// When Reciprocal is false, the last iteration folds the multiplication by
	/// A into its left-hand factor.
	SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
	                                         unsigned Iterations,
	                                         SDNodeFlags Flags, bool Reciprocal) {
	  const EVT VT = Arg.getValueType();
	  SDLoc DL(Arg);
	  SDValue NegThree = DAG.getConstantFP(-3.0, DL, VT);
	  SDValue NegHalf = DAG.getConstantFP(-0.5, DL, VT);

	  // The loop below must run at least once for the (Reciprocal == false)
	  // case to produce a correct result.
	  assert(Iterations > 0);

	  // Newton iterations for reciprocal square root:
	  // E = (E * -0.5) * ((A * E) * E + -3.0)
	  for (unsigned Step = 0; Step != Iterations; ++Step) {
	    SDValue ArgEst = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
	    AddToWorklist(ArgEst.getNode());

	    SDValue ArgEstEst = DAG.getNode(ISD::FMUL, DL, VT, ArgEst, Est, Flags);
	    AddToWorklist(ArgEstEst.getNode());

	    SDValue Right = DAG.getNode(ISD::FADD, DL, VT, ArgEstEst, NegThree, Flags);
	    AddToWorklist(Right.getNode());

	    // On the last iteration of a (non-reciprocal) square root, build
	    //   S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
	    // reusing the common subexpression (A * E); otherwise the left factor
	    // is the plain RSQRT form (E * -0.5).
	    const bool FinalSqrtStep = !Reciprocal && (Step + 1) >= Iterations;
	    SDValue Left = DAG.getNode(ISD::FMUL, DL, VT,
	                               FinalSqrtStep ? ArgEst : Est, NegHalf, Flags);
	    AddToWorklist(Left.getNode());

	    Est = DAG.getNode(ISD::FMUL, DL, VT, Left, Right, Flags);
	    AddToWorklist(Est.getNode());
	  }

	  return Est;
	}

	/// Build code that computes either rsqrt(Op) (Reciprocal == true) or sqrt(Op)
	/// (Reciprocal == false). The sqrt form is really Op*rsqrt(Op), so it gets
	/// extra post-processing for the case where Op can be zero. Returns an empty
	/// SDValue when no estimate is produced.
	SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
	                                           bool Reciprocal) {
	  if (Level >= AfterLegalizeDAG)
	    return SDValue();

	  // TODO: Handle half and/or extended types?
	  const EVT VT = Op.getValueType();
	  const EVT ScalarVT = VT.getScalarType();
	  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f64)
	    return SDValue();

	  // Nothing to do if estimates are explicitly disabled for this function.
	  MachineFunction &MF = DAG.getMachineFunction();
	  int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
	  if (Enabled == TLI.ReciprocalEstimate::Disabled)
	    return SDValue();

	  // Estimates may be explicitly enabled for this type with a custom number
	  // of refinement steps.
	  int Iterations = TLI.getSqrtRefinementSteps(VT, MF);

	  bool UseOneConstNR = false;
	  SDValue Est = TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations,
	                                    UseOneConstNR, Reciprocal);
	  if (!Est)
	    return SDValue();
	  AddToWorklist(Est.getNode());

	  // No refinement requested: the raw estimate is the answer.
	  if (!Iterations)
	    return Est;

	  if (UseOneConstNR)
	    Est = buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal);
	  else
	    Est = buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);

	  if (Reciprocal)
	    return Est;

	  // Unfortunately, Est is now NaN if the input was exactly 0.0.
	  // Select out this case and force the answer to 0.0.
	  SDLoc DL(Op);

	  SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
	  EVT CCVT = getSetCCResultType(VT);
	  SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
	  AddToWorklist(IsZero.getNode());

	  const unsigned SelectOpc = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
	  Est = DAG.getNode(SelectOpc, DL, VT, IsZero, FPZero, Est);
	  AddToWorklist(Est.getNode());
	  return Est;
	}

	/// Build an estimate of rsqrt(Op); forwards to buildSqrtEstimateImpl with
	/// the reciprocal form selected.
	SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
	  const bool Reciprocal = true;
	  return buildSqrtEstimateImpl(Op, Flags, Reciprocal);
	}

	/// Build an estimate of sqrt(Op); forwards to buildSqrtEstimateImpl with the
	/// non-reciprocal form selected.
	SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
	  const bool Reciprocal = false;
	  return buildSqrtEstimateImpl(Op, Flags, Reciprocal);
	}

	/// Return true if there is any possibility that the two addresses overlap.
	bool DAGCombiner::isAlias(LSBaseSDNode Op0, LSBaseSDNode Op1) const {
	// If they are the same then they must be aliases.
	if (Op0->getBasePtr() == Op1->getBasePtr()) return true;

	// If they are both volatile then they cannot be reordered.
	if (Op0->isVolatile() && Op1->isVolatile()) return true;

	// If one operation reads from invariant memory, and the other may store, they
	// cannot alias. These should really be checking the equivalent of mayWrite,
	// but it only matters for memory nodes other than load /store.
	if (Op0->isInvariant() && Op1->writeMem())
	return false;

	if (Op1->isInvariant() && Op0->writeMem())
	return false;

	unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize();
	unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();

	// Check for BaseIndexOffset matching.
	- BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
	- BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
	+ BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG);
	+ BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG);
	int64_t PtrDiff;
	- if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
	- return !((NumBytes0 <= PtrDiff) \|\| (PtrDiff + NumBytes1 <= 0));
	+ if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) {
	+ if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
	+ return !((NumBytes0 <= PtrDiff) \|\| (PtrDiff + NumBytes1 <= 0));

	- // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
	- // able to calculate their relative offset if at least one arises
	- // from an alloca. However, these allocas cannot overlap and we
	- // can infer there is no alias.
	- if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
	- if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
	- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	- // If the base are the same frame index but the we couldn't find a
	- // constant offset, (indices are different) be conservative.
	- if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) \|\|
	- !MFI.isFixedObjectIndex(B->getIndex())))
	- return false;
	- }
	+ // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
	+ // able to calculate their relative offset if at least one arises
	+ // from an alloca. However, these allocas cannot overlap and we
	+ // can infer there is no alias.
	+ if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
	+ if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
	+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	+ // If the base are the same frame index but the we couldn't find a
	+ // constant offset, (indices are different) be conservative.
	+ if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) \|\|
	+ !MFI.isFixedObjectIndex(B->getIndex())))
	+ return false;
	+ }

	- bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
	- bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
	- bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
	- bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
	- bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
	- bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
	+ bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
	+ bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
	+ bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
	+ bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
	+ bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
	+ bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());

	- // If of mismatched base types or checkable indices we can check
	- // they do not alias.
	- if ((BasePtr0.getIndex() == BasePtr1.getIndex() \|\| (IsFI0 != IsFI1) \|\|
	- (IsGV0 != IsGV1) \|\| (IsCV0 != IsCV1)) &&
	- (IsFI0 \|\| IsGV0 \|\| IsCV0) && (IsFI1 \|\| IsGV1 \|\| IsCV1))
	- return false;
	+ // If of mismatched base types or checkable indices we can check
	+ // they do not alias.
	+ if ((BasePtr0.getIndex() == BasePtr1.getIndex() \|\| (IsFI0 != IsFI1) \|\|
	+ (IsGV0 != IsGV1) \|\| (IsCV0 != IsCV1)) &&
	+ (IsFI0 \|\| IsGV0 \|\| IsCV0) && (IsFI1 \|\| IsGV1 \|\| IsCV1))
	+ return false;
	+ }

	- // If we know required SrcValue1 and SrcValue2 have relatively large alignment
	- // compared to the size and offset of the access, we may be able to prove they
	- // do not alias. This check is conservative for now to catch cases created by
	- // splitting vector types.
	+ // If we know required SrcValue1 and SrcValue2 have relatively large
	+ // alignment compared to the size and offset of the access, we may be able
	+ // to prove they do not alias. This check is conservative for now to catch
	+ // cases created by splitting vector types.
	int64_t SrcValOffset0 = Op0->getSrcValueOffset();
	int64_t SrcValOffset1 = Op1->getSrcValueOffset();
	unsigned OrigAlignment0 = Op0->getOriginalAlignment();
	unsigned OrigAlignment1 = Op1->getOriginalAlignment();
	if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
	NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) {
	int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
	int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;

	- // There is no overlap between these relatively aligned accesses of similar
	- // size. Return no alias.
	+ // There is no overlap between these relatively aligned accesses of
	+ // similar size. Return no alias.
	if ((OffAlign0 + NumBytes0) <= OffAlign1 \|\|
	(OffAlign1 + NumBytes1) <= OffAlign0)
	return false;
	}

	bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
	? CombinerGlobalAA
	: DAG.getSubtarget().useAA();
	#ifndef NDEBUG
	if (CombinerAAOnlyFunc.getNumOccurrences() &&
	CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
	UseAA = false;
	#endif

	if (UseAA && AA &&
	Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
	// Use alias analysis information.
	int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
	int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
	int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
	AliasResult AAResult =
	AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
	UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
	MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
	UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
	if (AAResult == NoAlias)
	return false;
	}

	// Otherwise we have to assume they alias.
	return true;
	}

	/// Starting from OriginalChain, walk up the chain past memory nodes that do
	/// not alias N, and append every chain value that may alias N (or that
	/// cannot be looked through) to the Aliases vector. If the walk exceeds the
	/// target's depth limit, Aliases is reset to contain just OriginalChain.
	void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
	                                   SmallVectorImpl<SDValue> &Aliases) {
	  SmallVector<SDValue, 8> Pending; // Chains still to be examined (a stack).
	  SmallPtrSet<SDNode *, 16> Seen;  // Nodes that were already examined.

	  // N is treated as a plain load only if it is a non-volatile load.
	  const bool NIsPlainLoad =
	      isa<LoadSDNode>(N) && !cast<LSBaseSDNode>(N)->isVolatile();

	  // Seed the walk.
	  unsigned Depth = 0;
	  Pending.push_back(OriginalChain);

	  // Examine each pending chain: record it if it is (or must be assumed to
	  // be) an alias, otherwise keep walking upwards from it.
	  while (!Pending.empty()) {
	    SDValue Cur = Pending.pop_back_val();

	    // Give up once the depth limit is exceeded and fall back to the
	    // original chain.
	    //
	    // FIXME: The depth check could be made to return the last non-aliasing
	    // chain we found before we hit a tokenfactor rather than the original
	    // chain.
	    if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
	      Aliases.clear();
	      Aliases.push_back(OriginalChain);
	      return;
	    }

	    // Skip nodes that were visited before.
	    if (!Seen.insert(Cur.getNode()).second)
	      continue;

	    switch (Cur.getOpcode()) {
	    case ISD::EntryToken:
	      // Entry token is ideal chain operand, but handled in FindBetterChain.
	      break;

	    case ISD::LOAD:
	    case ISD::STORE: {
	      SDNode *CurNode = Cur.getNode();
	      const bool CurIsPlainLoad =
	          isa<LoadSDNode>(CurNode) && !cast<LSBaseSDNode>(CurNode)->isVolatile();

	      // Two non-volatile loads are never treated as a dependency; anything
	      // else stops the walk here if it may alias N.
	      const bool BothPlainLoads = NIsPlainLoad && CurIsPlainLoad;
	      if (!BothPlainLoads &&
	          isAlias(cast<LSBaseSDNode>(N), cast<LSBaseSDNode>(CurNode))) {
	        Aliases.push_back(Cur);
	        break;
	      }

	      // No alias: continue with this node's own chain operand.
	      Pending.push_back(Cur.getOperand(0));
	      ++Depth;
	      break;
	    }

	    case ISD::TokenFactor:
	      // Only "small" token factors are expanded; larger ones are taken as
	      // they are.
	      if (Cur.getNumOperands() > 16) {
	        Aliases.push_back(Cur);
	        break;
	      }
	      // Pushing the operands in reverse keeps their original order when
	      // popped and increases the likelihood that getNode will find a
	      // matching token factor (CSE.)
	      for (unsigned Idx = Cur.getNumOperands(); Idx != 0; --Idx)
	        Pending.push_back(Cur.getOperand(Idx - 1));
	      ++Depth;
	      break;

	    case ISD::CopyFromReg:
	      // Forward past CopyFromReg.
	      Pending.push_back(Cur.getOperand(0));
	      ++Depth;
	      break;

	    default:
	      // For all other instructions we will just have to take what we can
	      // get.
	      Aliases.push_back(Cur);
	      break;
	    }
	  }
	}

	/// Compute a replacement chain for memory node N by collecting the nodes
	/// that GatherAllAliases reports for OldChain. With no aliases the entry
	/// node is returned, with exactly one alias that alias is returned, and
	/// otherwise a TokenFactor joining all of them is built. At
	/// CodeGenOpt::None the old chain is returned unchanged.
	SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
	  // No chain rewriting when optimization is disabled.
	  if (OptLevel == CodeGenOpt::None)
	    return OldChain;

	  // Operands of the prospective replacement token factor.
	  SmallVector<SDValue, 8> AliasingChains;
	  GatherAllAliases(N, OldChain, AliasingChains);

	  switch (AliasingChains.size()) {
	  case 0:
	    // Nothing aliases: hang the node directly off the entry token.
	    return DAG.getEntryNode();
	  case 1:
	    // A single alias can be used as the chain as-is.
	    return AliasingChains[0];
	  default:
	    // Several aliases: merge them with a tailored token factor.
	    return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other,
	                       AliasingChains);
	  }
	}

	// This function tries to collect a bunch of potentially interesting
	// nodes to improve the chains of, all at once. This might seem
	// redundant, as this function gets called when visiting every store
	// node, so why not let the work be done on each store as it's visited?
	//
	// I believe this is mainly important because MergeConsecutiveStores
	// is unable to deal with merging stores of different sizes, so unless
	// we improve the chains of all the potential candidates up-front
	// before running MergeConsecutiveStores, it might only see some of
	// the nodes that will eventually be candidates, and then not be able
	// to go from a partially-merged state to the desired final
	// fully-merged state.
	//
	// Returns true only when the chain of St itself was replaced; chains of
	// the other collected stores may be replaced without affecting the
	// return value. Always returns false at CodeGenOpt::None.
	bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
	  if (OptLevel == CodeGenOpt::None)
	    return false;

	  // This holds the base pointer, index, and the offset in bytes from the base
	  // pointer.
	- BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
	+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

	  // We must have a base and an offset.
	  if (!BasePtr.getBase().getNode())
	    return false;

	  // Do not handle stores to undef base pointers.
	  if (BasePtr.getBase().isUndef())
	    return false;

	  // St is always the first entry; earlier stores on the chain follow.
	  SmallVector<StoreSDNode *, 8> ChainedStores;
	  ChainedStores.push_back(St);

	  // Walk up the chain and look for nodes with offsets from the same
	  // base pointer. Stop when reaching an instruction with a different kind
	  // or instruction which has a different base pointer.
	  StoreSDNode *Index = St;
	  while (Index) {
	    // If the chain has more than one use, then we can't reorder the mem ops.
	    if (Index != St && !SDValue(Index, 0)->hasOneUse())
	      break;

	    if (Index->isVolatile() || Index->isIndexed())
	      break;

	    // Find the base pointer and offset for this memory node.
	-   BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
	+   BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG);

	    // Check that the base pointer is the same as the original one.
	    if (!BasePtr.equalBaseIndex(Ptr, DAG))
	      break;

	    // Walk up the chain to find the next store node, ignoring any
	    // intermediate loads. Any other kind of node will halt the loop.
	    SDNode *NextInChain = Index->getChain().getNode();
	    while (true) {
	      if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
	        // We found a store node. Use it for the next iteration.
	        if (STn->isVolatile() || STn->isIndexed()) {
	          // Setting Index to null ends the outer loop as well.
	          Index = nullptr;
	          break;
	        }
	        ChainedStores.push_back(STn);
	        Index = STn;
	        break;
	      } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
	        // Skip over the load and keep looking further up its chain.
	        NextInChain = Ldn->getChain().getNode();
	        continue;
	      } else {
	        Index = nullptr;
	        break;
	      }
	    } // end while
	  }

	  // At this point, ChainedStores lists all of the Store nodes
	  // reachable by iterating up through chain nodes matching the above
	  // conditions. For each such store identified, try to find an
	  // earlier chain to attach the store to which won't violate the
	  // required ordering.
	  bool MadeChangeToSt = false;
	  SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;

	  for (StoreSDNode *ChainedStore : ChainedStores) {
	    SDValue Chain = ChainedStore->getChain();
	    SDValue BetterChain = FindBetterChain(ChainedStore, Chain);

	    if (Chain != BetterChain) {
	      if (ChainedStore == St)
	        MadeChangeToSt = true;
	      BetterChains.push_back(std::make_pair(ChainedStore, BetterChain));
	    }
	  }

	  // Do all replacements after finding the replacements to make to avoid making
	  // the chains more complicated by introducing new TokenFactors.
	  for (auto Replacement : BetterChains)
	    replaceStoreChain(Replacement.first, Replacement.second);

	  return MadeChangeToSt;
	}

	/// Public entry point of the DAG combiner: builds a DAGCombiner for this
	/// SelectionDAG with the given alias analysis and optimization level and
	/// runs it at the requested combine level.
	void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
	                           CodeGenOpt::Level OptLevel) {
	  // All of the work is delegated to DAGCombiner::Run.
	  DAGCombiner Combiner(*this, AA, OptLevel);
	  Combiner.Run(Level);
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (revision 328362)
	@@ -1,4708 +1,4708 @@
	//===- LegalizeDAG.cpp - Implement SelectionDAG::Legalize -----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the SelectionDAG::Legalize method.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <tuple>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "legalizedag"

	namespace {

	/// Keeps track of state when getting the sign of a floating-point value as an
	/// integer.
	///
	/// SelectionDAGLegalize::getSignAsIntValue takes this by mutable reference
	/// and SelectionDAGLegalize::modifySignAsInt takes it by const reference.
	/// NOTE(review): the per-field notes below are inferred from the field names
	/// only; the code that fills them in is not part of this excerpt -- confirm
	/// against getSignAsIntValue before relying on them.
	struct FloatSignAsInt {
	// Presumably the type of the original floating-point value.
	EVT FloatVT;
	// Presumably the chain of any memory operations used for the conversion.
	SDValue Chain;
	// Presumably the address the value is accessed at as a float.
	SDValue FloatPtr;
	// Presumably the address the value is accessed at as an integer.
	SDValue IntPtr;
	// Pointer info presumably paired with IntPtr.
	MachinePointerInfo IntPointerInfo;
	// Pointer info presumably paired with FloatPtr.
	MachinePointerInfo FloatPointerInfo;
	// Presumably the integer value that carries the sign bit.
	SDValue IntValue;
	// Presumably a mask selecting the sign bit within IntValue.
	APInt SignMask;
	// Presumably the bit position of the sign within IntValue.
	uint8_t SignBit;
	};

	//===----------------------------------------------------------------------===//
	/// This takes an arbitrary SelectionDAG as input and
	/// hacks on it until the target machine can handle it. This involves
	/// eliminating value sizes the machine cannot handle (promoting small sizes to
	/// large sizes or splitting up large values into small values) as well as
	/// eliminating operations the machine cannot handle.
	///
	/// This code also does a small amount of optimization and recognition of idioms
	/// as part of its processing. For example, if a target does not support a
	/// 'setcc' instruction efficiently, but does support 'brcc' instruction, this
	/// will attempt merge setcc and brc instructions into brcc's.
	class SelectionDAGLegalize {
	const TargetMachine &TM;
	const TargetLowering &TLI;
	SelectionDAG &DAG;

	/// \brief The set of nodes which have already been legalized. We hold a
	/// reference to it in order to update as necessary on node deletion.
	SmallPtrSetImpl<SDNode *> &LegalizedNodes;

	/// \brief A set of all the nodes updated during legalization.
	SmallSetVector<SDNode , 16> UpdatedNodes;

	EVT getSetCCResultType(EVT VT) const {
	return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	}

	// Libcall insertion helpers.

	public:
	SelectionDAGLegalize(SelectionDAG &DAG,
	SmallPtrSetImpl<SDNode *> &LegalizedNodes,
	SmallSetVector<SDNode , 16> UpdatedNodes = nullptr)
	: TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG),
	LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {}

	/// \brief Legalizes the given operation.
	void LegalizeOp(SDNode *Node);

	private:
	SDValue OptimizeFloatStore(StoreSDNode *ST);

	void LegalizeLoadOps(SDNode *Node);
	void LegalizeStoreOps(SDNode *Node);

	/// Some targets cannot handle a variable
	/// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
	/// is necessary to spill the vector being inserted into to memory, perform
	/// the insert there, and then read the result back.
	SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
	const SDLoc &dl);
	SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx,
	const SDLoc &dl);

	/// Return a vector shuffle operation which
	/// performs the same shuffe in terms of order or result bytes, but on a type
	/// whose vector element type is narrower than the original shuffle type.
	/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
	SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, const SDLoc &dl,
	SDValue N1, SDValue N2,
	ArrayRef<int> Mask) const;

	bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
	bool &NeedInvert, const SDLoc &dl);

	SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
	SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
	unsigned NumOps, bool isSigned, const SDLoc &dl);

	std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
	SDNode *Node, bool isSigned);
	SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
	RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
	RTLIB::Libcall Call_F128,
	RTLIB::Libcall Call_PPCF128);
	SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
	RTLIB::Libcall Call_I8,
	RTLIB::Libcall Call_I16,
	RTLIB::Libcall Call_I32,
	RTLIB::Libcall Call_I64,
	RTLIB::Libcall Call_I128);
	void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
	void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);

	SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
	const SDLoc &dl);
	SDValue ExpandBUILD_VECTOR(SDNode *Node);
	SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
	void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
	SmallVectorImpl<SDValue> &Results);
	void getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL,
	SDValue Value) const;
	SDValue modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL,
	SDValue NewIntValue) const;
	SDValue ExpandFCOPYSIGN(SDNode *Node) const;
	SDValue ExpandFABS(SDNode *Node) const;
	SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
	const SDLoc &dl);
	SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
	const SDLoc &dl);
	SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
	const SDLoc &dl);

	SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
	SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
	SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl);

	SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
	SDValue ExpandInsertToVectorThroughStack(SDValue Op);
	SDValue ExpandVectorBuildThroughStack(SDNode* Node);

	SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
	SDValue ExpandConstant(ConstantSDNode *CP);

	// if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall
	bool ExpandNode(SDNode *Node);
	void ConvertNodeToLibcall(SDNode *Node);
	void PromoteNode(SDNode *Node);

	public:
	// Node replacement helpers

	void ReplacedNode(SDNode *N) {
	LegalizedNodes.erase(N);
	if (UpdatedNodes)
	UpdatedNodes->insert(N);
	}

	void ReplaceNode(SDNode Old, SDNode New) {
	DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
	dbgs() << " with: "; New->dump(&DAG));

	assert(Old->getNumValues() == New->getNumValues() &&
	"Replacing one node with another that produces a different number "
	"of values!");
	DAG.ReplaceAllUsesWith(Old, New);
	if (UpdatedNodes)
	UpdatedNodes->insert(New);
	ReplacedNode(Old);
	}

	void ReplaceNode(SDValue Old, SDValue New) {
	DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
	dbgs() << " with: "; New->dump(&DAG));

	DAG.ReplaceAllUsesWith(Old, New);
	if (UpdatedNodes)
	UpdatedNodes->insert(New.getNode());
	ReplacedNode(Old.getNode());
	}

	void ReplaceNode(SDNode Old, const SDValue New) {
	DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));

	DAG.ReplaceAllUsesWith(Old, New);
	for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
	DEBUG(dbgs() << (i == 0 ? " with: "
	: " and: ");
	New[i]->dump(&DAG));
	if (UpdatedNodes)
	UpdatedNodes->insert(New[i].getNode());
	}
	ReplacedNode(Old);
	}
	};

	} // end anonymous namespace

	/// Build a vector shuffle on NVT that is equivalent to shuffling N1/N2 with
	/// Mask on VT, where NVT has more (narrower) elements than VT. Each mask
	/// entry is expanded into NumDestElts / NumMaskElts consecutive entries.
	/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
	SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
	    EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
	    ArrayRef<int> Mask) const {
	  const unsigned SrcElts = VT.getVectorNumElements();
	  const unsigned DstElts = NVT.getVectorNumElements();
	  // Number of destination lanes that cover one source lane.
	  const unsigned Scale = DstElts / SrcElts;

	  assert(Scale && "Cannot promote to vector type with fewer elts!");

	  // Same element count: the mask can be reused unchanged.
	  if (Scale == 1)
	    return DAG.getVectorShuffle(NVT, dl, N1, N2, Mask);

	  SmallVector<int, 8> WideMask;
	  for (unsigned SrcIdx = 0; SrcIdx != SrcElts; ++SrcIdx) {
	    const int Lane = Mask[SrcIdx];
	    for (unsigned Sub = 0; Sub != Scale; ++Sub) {
	      // Negative entries are emitted as -1 for every sub-lane; all other
	      // entries are scaled and offset by the sub-lane index.
	      const int WideLane =
	          Lane < 0 ? -1 : static_cast<int>(Lane * Scale + Sub);
	      WideMask.push_back(WideLane);
	    }
	  }
	  assert(WideMask.size() == DstElts && "Non-integer NumEltsGrowth?");
	  assert(TLI.isShuffleMaskLegal(WideMask, NVT) && "Shuffle not legal?");
	  return DAG.getVectorShuffle(NVT, dl, N1, N2, WideMask);
	}

	/// Expands the ConstantFP node to an integer constant or
	/// a load from the constant pool.
	///
	/// \param CFP   the floating-point constant node to expand.
	/// \param UseCP when false, the constant's bit pattern is returned as an
	///              i32/i64 integer constant (only f32 and f64 are accepted);
	///              when true, a load (possibly an extending load from a
	///              narrower FP type) from the constant pool is returned.
	SDValue
	SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
	  bool Extend = false;
	  SDLoc dl(CFP);

	  // If a FP immediate is precise when represented as a float and if the
	  // target can do an extending load from float to double, we put it into
	  // the constant pool as a float, even if it is statically typed as a
	  // double. This shrinks FP constants and canonicalizes them for targets where
	  // an FP extending load is the same cost as a normal load (such as on the x87
	  // fp stack or PPC FP unit).
	  EVT VT = CFP->getValueType(0);
	  ConstantFP *LLVMC = const_cast<ConstantFP *>(CFP->getConstantFPValue());
	  if (!UseCP) {
	    assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
	    return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(), dl,
	                           (VT == MVT::f64) ? MVT::i64 : MVT::i32);
	  }

	  APFloat APF = CFP->getValueAPF();
	  EVT OrigVT = VT;
	  EVT SVT = VT;

	  // We don't want to shrink SNaNs. Converting the SNaN back to its real type
	  // can cause it to be changed into a QNaN on some platforms (e.g. on SystemZ).
	  if (!APF.isSignaling()) {
	    // Step down through the simple value types one at a time; the loop
	    // stops after f32 or f16 has been tried.
	    while (SVT != MVT::f32 && SVT != MVT::f16) {
	      SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
	      if (ConstantFPSDNode::isValueValidForType(SVT, APF) &&
	          // Only do this if the target has a native EXTLOAD instruction from
	          // smaller type.
	          TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) &&
	          TLI.ShouldShrinkFPConstant(OrigVT)) {
	        Type *SType = SVT.getTypeForEVT(*DAG.getContext());
	        LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
	        VT = SVT;
	        Extend = true;
	      }
	    }
	  }

	  SDValue CPIdx =
	      DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout()));
	  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
	  if (Extend) {
	    // The pool entry has the narrower type VT; extend it back to OrigVT.
	    SDValue Result = DAG.getExtLoad(
	        ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT,
	        Alignment);
	    return Result;
	  }
	  SDValue Result = DAG.getLoad(
	      OrigVT, dl, DAG.getEntryNode(), CPIdx,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
	  return Result;
	}

	/// Materialize the integer constant CP as a load from the constant pool:
	/// the value is placed in the pool and loaded with the pool entry's
	/// alignment, chained to the entry node.
	SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
	  SDLoc dl(CP);
	  EVT ResultVT = CP->getValueType(0);

	  // Address of the constant-pool entry, as a pointer-typed value.
	  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	  SDValue PoolAddr = DAG.getConstantPool(CP->getConstantIntValue(), PtrVT);
	  unsigned PoolAlign = cast<ConstantPoolSDNode>(PoolAddr)->getAlignment();

	  return DAG.getLoad(
	      ResultVT, dl, DAG.getEntryNode(), PoolAddr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	      PoolAlign);
	}

	/// Some targets cannot handle a variable insertion index for the
	/// INSERT_VECTOR_ELT instruction. In that case the vector is spilled to a
	/// stack temporary, the element is overwritten in memory, and the whole
	/// vector is loaded back.
	SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
	                                                             SDValue Val,
	                                                             SDValue Idx,
	                                                             const SDLoc &dl) {
	  // Going through memory is the slow path. A register-based alternative
	  // (move/extload the value into a vector register and permute it into
	  // place) would be possible when the index is a constant that the target
	  // supports.
	  EVT VecVT = Vec.getValueType();
	  EVT ScalarVT = VecVT.getVectorElementType();
	  SDValue Slot = DAG.CreateStackTemporary(VecVT);

	  int FrameIdx = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();

	  // Spill the whole vector to the stack slot.
	  SDValue MemChain = DAG.getStore(
	      DAG.getEntryNode(), dl, Vec, Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));

	  // Address of the element selected by Idx within the slot.
	  SDValue EltAddr = TLI.getVectorElementPointer(DAG, Slot, VecVT, Idx);

	  // Overwrite that element with the scalar (truncating to the element type).
	  MemChain = DAG.getTruncStore(MemChain, dl, Val, EltAddr,
	                               MachinePointerInfo(), ScalarVT);

	  // Reload the updated vector.
	  return DAG.getLoad(
	      VecVT, dl, MemChain, Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
	}

	/// Expand an INSERT_VECTOR_ELT of Val into Vec at position Idx.
	///
	/// When Idx is a constant and Val's type is compatible with the vector's
	/// element type, the insert becomes a SCALAR_TO_VECTOR followed by a
	/// shuffle that picks element 0 of the new vector for the inserted lane.
	/// In every other case the insert is performed through a stack temporary
	/// via PerformInsertVectorEltInMemory.
	SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
	                                                      SDValue Idx,
	                                                      const SDLoc &dl) {
	  if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
	    // SCALAR_TO_VECTOR requires that the type of the value being inserted
	    // match the element type of the vector being created, except for
	    // integers in which case the inserted value can be over width.
	    EVT EltVT = Vec.getValueType().getVectorElementType();
	    if (Val.getValueType() == EltVT ||
	        (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) {
	      SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	                                  Vec.getValueType(), Val);

	      unsigned NumElts = Vec.getValueType().getVectorNumElements();
	      // The insert position does not change while the mask is built.
	      const uint64_t InsertIdx = InsertPos->getZExtValue();
	      // We generate a shuffle of InVec and ScVec, so the shuffle mask
	      // should be 0,1,2,3,4,5... with the appropriate element replaced with
	      // elt 0 of the RHS.
	      SmallVector<int, 8> ShufOps;
	      for (unsigned i = 0; i != NumElts; ++i)
	        ShufOps.push_back(i != InsertIdx ? i : NumElts);

	      return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps);
	    }
	  }
	  return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
	}

	/// Try to turn a store of a floating-point constant into integer store(s)
	/// of the same bit pattern, e.g.
	///   'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
	/// Handles f32 (one i32 store) and f64 (one i64 store, or two i32 stores
	/// joined by a TokenFactor when only i32 is legal and the store is not
	/// volatile). Returns a null SDValue when no rewrite applies.
	SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
	  DEBUG(dbgs() << "Optimizing float store operations\n");
	  // FIXME: We shouldn't do this for TargetConstantFP's.
	  // FIXME: move this to the DAG Combiner! Note that we can't regress due
	  // to phase ordering between legalized code and the dag combiner. This
	  // probably means that we need to integrate dag combiner and legalizer
	  // together.
	  // We generally can't do this one for long doubles.
	  SDValue StoreChain = ST->getChain();
	  SDValue Addr = ST->getBasePtr();
	  unsigned Alignment = ST->getAlignment();
	  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	  AAMDNodes AAInfo = ST->getAAInfo();
	  SDLoc dl(ST);

	  // Only stores of FP constants are of interest.
	  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue());
	  if (!CFP)
	    return SDValue(nullptr, 0);

	  // f32 constant with a legal i32: one 32-bit integer store.
	  if (CFP->getValueType(0) == MVT::f32 &&
	      TLI.isTypeLegal(MVT::i32)) {
	    SDValue Bits32 = DAG.getConstant(
	        CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(32), SDLoc(CFP),
	        MVT::i32);
	    return DAG.getStore(StoreChain, dl, Bits32, Addr, ST->getPointerInfo(),
	                        Alignment, MMOFlags, AAInfo);
	  }

	  // Everything below deals with f64 only.
	  if (!(CFP->getValueType(0) == MVT::f64))
	    return SDValue(nullptr, 0);

	  // If this target supports 64-bit registers, do a single 64-bit store.
	  if (TLI.isTypeLegal(MVT::i64)) {
	    SDValue Bits64 = DAG.getConstant(
	        CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(64), SDLoc(CFP),
	        MVT::i64);
	    return DAG.getStore(StoreChain, dl, Bits64, Addr, ST->getPointerInfo(),
	                        Alignment, MMOFlags, AAInfo);
	  }

	  // Otherwise, if the target supports 32-bit registers, use 2 32-bit
	  // stores. If the target supports neither 32- nor 64-bits, this
	  // xform is certainly not worth it. Volatile stores are never split.
	  if (!TLI.isTypeLegal(MVT::i32) || ST->isVolatile())
	    return SDValue(nullptr, 0);

	  const APInt &AllBits = CFP->getValueAPF().bitcastToAPInt();
	  SDValue FirstHalf = DAG.getConstant(AllBits.trunc(32), dl, MVT::i32);
	  SDValue SecondHalf =
	      DAG.getConstant(AllBits.lshr(32).trunc(32), dl, MVT::i32);
	  // On big-endian targets the high word is stored at the lower address.
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(FirstHalf, SecondHalf);

	  SDValue FirstStore =
	      DAG.getStore(StoreChain, dl, FirstHalf, Addr, ST->getPointerInfo(),
	                   Alignment, MMOFlags, AAInfo);
	  // The second word lives four bytes further on.
	  Addr = DAG.getNode(ISD::ADD, dl, Addr.getValueType(), Addr,
	                     DAG.getConstant(4, dl, Addr.getValueType()));
	  SDValue SecondStore =
	      DAG.getStore(StoreChain, dl, SecondHalf, Addr,
	                   ST->getPointerInfo().getWithOffset(4),
	                   MinAlign(Alignment, 4U), MMOFlags, AAInfo);

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, FirstStore,
	                     SecondStore);
	}

	/// Legalize the STORE node Node.
	///
	/// Non-truncating stores are first offered to OptimizeFloatStore and are
	/// otherwise dispatched on TLI.getOperationAction(ISD::STORE, VT).
	/// Truncating stores are handled in three ways: a memory type that does not
	/// fill whole bytes is widened to a byte-sized truncating store, a width
	/// that is not a power of two is split into two truncating stores, and
	/// everything else is dispatched on TLI.getTruncStoreAction. Replacement
	/// nodes are installed through ReplaceNode.
	void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
	StoreSDNode *ST = cast<StoreSDNode>(Node);
	SDValue Chain = ST->getChain();
	SDValue Ptr = ST->getBasePtr();
	SDLoc dl(Node);

	unsigned Alignment = ST->getAlignment();
	MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	AAMDNodes AAInfo = ST->getAAInfo();

	// ---- Plain (non-truncating) stores; this branch always returns. ----
	if (!ST->isTruncatingStore()) {
	DEBUG(dbgs() << "Legalizing store operation\n");
	// A store of an FP constant may be rewritten as integer store(s).
	if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
	ReplaceNode(ST, OptStore);
	return;
	}

	SDValue Value = ST->getValue();
	MVT VT = Value.getSimpleValueType();
	switch (TLI.getOperationAction(ISD::STORE, VT)) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Legal: {
	// If this is an unaligned store and the target doesn't support it,
	// expand it.
	EVT MemVT = ST->getMemoryVT();
	unsigned AS = ST->getAddressSpace();
	unsigned Align = ST->getAlignment();
	const DataLayout &DL = DAG.getDataLayout();
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	DEBUG(dbgs() << "Expanding unsupported unaligned store\n");
	SDValue Result = TLI.expandUnalignedStore(ST, DAG);
	ReplaceNode(SDValue(ST, 0), Result);
	} else
	DEBUG(dbgs() << "Legal store\n");
	break;
	}
	case TargetLowering::Custom: {
	DEBUG(dbgs() << "Trying custom lowering\n");
	SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
	// Nothing is replaced when the target returns an empty value or the
	// original node itself.
	if (Res && Res != SDValue(Node, 0))
	ReplaceNode(SDValue(Node, 0), Res);
	return;
	}
	case TargetLowering::Promote: {
	// Promotion here is a same-size bitcast of the stored value to NVT.
	MVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
	assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
	"Can only promote stores to same size type");
	Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
	SDValue Result =
	DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	Alignment, MMOFlags, AAInfo);
	ReplaceNode(SDValue(Node, 0), Result);
	break;
	}
	}
	return;
	}

	// ---- Truncating stores. ----
	DEBUG(dbgs() << "Legalizing truncating store operations\n");
	SDValue Value = ST->getValue();
	EVT StVT = ST->getMemoryVT();
	unsigned StWidth = StVT.getSizeInBits();
	auto &DL = DAG.getDataLayout();

	if (StWidth != StVT.getStoreSizeInBits()) {
	// Promote to a byte-sized store with upper bits zero if not
	// storing an integral number of bytes. For example, promote
	// TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
	EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
	StVT.getStoreSizeInBits());
	Value = DAG.getZeroExtendInReg(Value, dl, StVT);
	SDValue Result =
	DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT,
	Alignment, MMOFlags, AAInfo);
	ReplaceNode(SDValue(Node, 0), Result);
	// StWidth & (StWidth - 1) is non-zero when more than one bit of StWidth is
	// set, i.e. when the width is not a power of two.
	} else if (StWidth & (StWidth - 1)) {
	// If not storing a power-of-2 number of bits, expand as two stores.
	assert(!StVT.isVector() && "Unsupported truncstore!");
	// RoundWidth is a power of two below StWidth (asserted next); ExtraWidth
	// is the remainder. Both must be whole numbers of bytes.
	unsigned RoundWidth = 1 << Log2_32(StWidth);
	assert(RoundWidth < StWidth);
	unsigned ExtraWidth = StWidth - RoundWidth;
	assert(ExtraWidth < RoundWidth);
	assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
	"Store size not an integral number of bytes!");
	EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
	EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
	SDValue Lo, Hi;
	unsigned IncrementSize;

	if (DL.isLittleEndian()) {
	// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
	// Store the bottom RoundWidth bits.
	Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	RoundVT, Alignment, MMOFlags, AAInfo);

	// Store the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Hi = DAG.getNode(
	ISD::SRL, dl, Value.getValueType(), Value,
	DAG.getConstant(RoundWidth, dl,
	TLI.getShiftAmountTy(Value.getValueType(), DL)));
	Hi = DAG.getTruncStore(
	Chain, dl, Hi, Ptr,
	ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
	MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
	} else {
	// Big endian - avoid unaligned stores.
	// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
	// Store the top RoundWidth bits.
	Hi = DAG.getNode(
	ISD::SRL, dl, Value.getValueType(), Value,
	DAG.getConstant(ExtraWidth, dl,
	TLI.getShiftAmountTy(Value.getValueType(), DL)));
	Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
	RoundVT, Alignment, MMOFlags, AAInfo);

	// Store the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Lo = DAG.getTruncStore(
	Chain, dl, Value, Ptr,
	ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
	MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
	}

	// The order of the stores doesn't matter.
	SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
	ReplaceNode(SDValue(Node, 0), Result);
	} else {
	// Byte-sized, power-of-two width: let the target's truncstore action decide.
	switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Legal: {
	EVT MemVT = ST->getMemoryVT();
	unsigned AS = ST->getAddressSpace();
	unsigned Align = ST->getAlignment();
	// If this is an unaligned store and the target doesn't support it,
	// expand it.
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	SDValue Result = TLI.expandUnalignedStore(ST, DAG);
	ReplaceNode(SDValue(ST, 0), Result);
	}
	break;
	}
	case TargetLowering::Custom: {
	SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
	// As above: only a genuinely different result replaces the node.
	if (Res && Res != SDValue(Node, 0))
	ReplaceNode(SDValue(Node, 0), Res);
	return;
	}
	case TargetLowering::Expand:
	assert(!StVT.isVector() &&
	"Vector Stores are handled in LegalizeVectorOps");

	SDValue Result;

	// TRUNCSTORE:i16 i32 -> STORE i16
	if (TLI.isTypeLegal(StVT)) {
	Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
	Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	Alignment, MMOFlags, AAInfo);
	} else {
	// The in-memory type isn't legal. Truncate to the type it would promote
	// to, and then do a truncstore.
	Value = DAG.getNode(ISD::TRUNCATE, dl,
	TLI.getTypeToTransformTo(*DAG.getContext(), StVT),
	Value);
	Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	StVT, Alignment, MMOFlags, AAInfo);
	}

	ReplaceNode(SDValue(Node, 0), Result);
	break;
	}
	}
	}

	void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
	LoadSDNode *LD = cast<LoadSDNode>(Node);
	SDValue Chain = LD->getChain(); // The chain.
	SDValue Ptr = LD->getBasePtr(); // The base pointer.
	SDValue Value; // The value returned by the load op.
	SDLoc dl(Node);

	ISD::LoadExtType ExtType = LD->getExtensionType();
	if (ExtType == ISD::NON_EXTLOAD) {
	DEBUG(dbgs() << "Legalizing non-extending load operation\n");
	MVT VT = Node->getSimpleValueType(0);
	SDValue RVal = SDValue(Node, 0);
	SDValue RChain = SDValue(Node, 1);

	switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Legal: {
	EVT MemVT = LD->getMemoryVT();
	unsigned AS = LD->getAddressSpace();
	unsigned Align = LD->getAlignment();
	const DataLayout &DL = DAG.getDataLayout();
	// If this is an unaligned load and the target doesn't support it,
	// expand it.
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG);
	}
	break;
	}
	case TargetLowering::Custom:
	if (SDValue Res = TLI.LowerOperation(RVal, DAG)) {
	RVal = Res;
	RChain = Res.getValue(1);
	}
	break;

	case TargetLowering::Promote: {
	MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
	assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
	"Can only promote loads to same size type");

	SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
	RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
	RChain = Res.getValue(1);
	break;
	}
	}
	if (RChain.getNode() != Node) {
	assert(RVal.getNode() != Node && "Load must be completely replaced");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
	if (UpdatedNodes) {
	UpdatedNodes->insert(RVal.getNode());
	UpdatedNodes->insert(RChain.getNode());
	}
	ReplacedNode(Node);
	}
	return;
	}

	DEBUG(dbgs() << "Legalizing extending load operation\n");
	EVT SrcVT = LD->getMemoryVT();
	unsigned SrcWidth = SrcVT.getSizeInBits();
	unsigned Alignment = LD->getAlignment();
	MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
	AAMDNodes AAInfo = LD->getAAInfo();

	if (SrcWidth != SrcVT.getStoreSizeInBits() &&
	// Some targets pretend to have an i1 loading operation, and actually
	// load an i8. This trick is correct for ZEXTLOAD because the top 7
	// bits are guaranteed to be zero; it helps the optimizers understand
	// that these bits are zero. It is also useful for EXTLOAD, since it
	// tells the optimizers that those bits are undefined. It would be
	// nice to have an effective generic way of getting these benefits...
	// Until such a way is found, don't insist on promoting i1 here.
	(SrcVT != MVT::i1 \|\|
	TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) ==
	TargetLowering::Promote)) {
	// Promote to a byte-sized load if not loading an integral number of
	// bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
	unsigned NewWidth = SrcVT.getStoreSizeInBits();
	EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
	SDValue Ch;

	// The extra bits are guaranteed to be zero, since we stored them that
	// way. A zext load from NVT thus automatically gives zext from SrcVT.

	ISD::LoadExtType NewExtType =
	ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;

	SDValue Result =
	DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo);

	Ch = Result.getValue(1); // The chain.

	if (ExtType == ISD::SEXTLOAD)
	// Having the top bits zero doesn't help when sign extending.
	Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
	Result.getValueType(),
	Result, DAG.getValueType(SrcVT));
	else if (ExtType == ISD::ZEXTLOAD \|\| NVT == Result.getValueType())
	// All the top bits are guaranteed to be zero - inform the optimizers.
	Result = DAG.getNode(ISD::AssertZext, dl,
	Result.getValueType(), Result,
	DAG.getValueType(SrcVT));

	Value = Result;
	Chain = Ch;
	} else if (SrcWidth & (SrcWidth - 1)) {
	// If not loading a power-of-2 number of bits, expand as two loads.
	assert(!SrcVT.isVector() && "Unsupported extload!");
	unsigned RoundWidth = 1 << Log2_32(SrcWidth);
	assert(RoundWidth < SrcWidth);
	unsigned ExtraWidth = SrcWidth - RoundWidth;
	assert(ExtraWidth < RoundWidth);
	assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
	"Load size not an integral number of bytes!");
	EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
	EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
	SDValue Lo, Hi, Ch;
	unsigned IncrementSize;
	auto &DL = DAG.getDataLayout();

	if (DL.isLittleEndian()) {
	// EXTLOAD:i24 -> ZEXTLOAD:i16 \| (shl EXTLOAD@+2:i8, 16)
	// Load the bottom RoundWidth bits.
	Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
	AAInfo);

	// Load the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo().getWithOffset(IncrementSize),
	ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
	AAInfo);

	// Build a factor node to remember that this load is independent of
	// the other one.
	Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
	Hi.getValue(1));

	// Move the top bits to the right place.
	Hi = DAG.getNode(
	ISD::SHL, dl, Hi.getValueType(), Hi,
	DAG.getConstant(RoundWidth, dl,
	TLI.getShiftAmountTy(Hi.getValueType(), DL)));

	// Join the hi and lo parts.
	Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
	} else {
	// Big endian - avoid unaligned loads.
	// EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) \| ZEXTLOAD@+2:i8
	// Load the top RoundWidth bits.
	Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
	AAInfo);

	// Load the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo().getWithOffset(IncrementSize),
	ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
	AAInfo);

	// Build a factor node to remember that this load is independent of
	// the other one.
	Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
	Hi.getValue(1));

	// Move the top bits to the right place.
	Hi = DAG.getNode(
	ISD::SHL, dl, Hi.getValueType(), Hi,
	DAG.getConstant(ExtraWidth, dl,
	TLI.getShiftAmountTy(Hi.getValueType(), DL)));

	// Join the hi and lo parts.
	Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
	}

	Chain = Ch;
	} else {
	bool isCustom = false;
	switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0),
	SrcVT.getSimpleVT())) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Custom:
	isCustom = true;
	LLVM_FALLTHROUGH;
	case TargetLowering::Legal:
	Value = SDValue(Node, 0);
	Chain = SDValue(Node, 1);

	if (isCustom) {
	if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
	Value = Res;
	Chain = Res.getValue(1);
	}
	} else {
	// If this is an unaligned load and the target doesn't support it,
	// expand it.
	EVT MemVT = LD->getMemoryVT();
	unsigned AS = LD->getAddressSpace();
	unsigned Align = LD->getAlignment();
	const DataLayout &DL = DAG.getDataLayout();
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	std::tie(Value, Chain) = TLI.expandUnalignedLoad(LD, DAG);
	}
	}
	break;

	case TargetLowering::Expand: {
	EVT DestVT = Node->getValueType(0);
	if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) {
	// If the source type is not legal, see if there is a legal extload to
	// an intermediate type that we can then extend further.
	EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
	if (TLI.isTypeLegal(SrcVT) \|\| // Same as SrcVT == LoadVT?
	TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) {
	// If we are loading a legal type, this is a non-extload followed by a
	// full extend.
	ISD::LoadExtType MidExtType =
	(LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType;

	SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr,
	SrcVT, LD->getMemOperand());
	unsigned ExtendOp =
	ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType);
	Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
	Chain = Load.getValue(1);
	break;
	}

	// Handle the special case of fp16 extloads. EXTLOAD doesn't have the
	// normal undefined upper bits behavior to allow using an in-reg extend
	// with the illegal FP type, so load as an integer and do the
	// from-integer conversion.
	if (SrcVT.getScalarType() == MVT::f16) {
	EVT ISrcVT = SrcVT.changeTypeToInteger();
	EVT IDestVT = DestVT.changeTypeToInteger();
	EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());

	SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT,
	Chain, Ptr, ISrcVT,
	LD->getMemOperand());
	Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result);
	Chain = Result.getValue(1);
	break;
	}
	}

	assert(!SrcVT.isVector() &&
	"Vector Loads are handled in LegalizeVectorOps");

	// FIXME: This does not work for vectors on most targets. Sign-
	// and zero-extend operations are currently folded into extending
	// loads, whether they are legal or not, and then we end up here
	// without any support for legalizing them.
	assert(ExtType != ISD::EXTLOAD &&
	"EXTLOAD should always be supported!");
	// Turn the unsupported load into an EXTLOAD followed by an
	// explicit zero/sign extend inreg.
	SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl,
	Node->getValueType(0),
	Chain, Ptr, SrcVT,
	LD->getMemOperand());
	SDValue ValRes;
	if (ExtType == ISD::SEXTLOAD)
	ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
	Result.getValueType(),
	Result, DAG.getValueType(SrcVT));
	else
	ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
	Value = ValRes;
	Chain = Result.getValue(1);
	break;
	}
	}
	}

	// Since loads produce two values, make sure to remember that we legalized
	// both of them.
	if (Chain.getNode() != Node) {
	assert(Value.getNode() != Node && "Load must be completely replaced");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
	if (UpdatedNodes) {
	UpdatedNodes->insert(Value.getNode());
	UpdatedNodes->insert(Chain.getNode());
	}
	ReplacedNode(Node);
	}
	}

	/// Decide how a strict (constrained) floating-point pseudo-opcode is legalized
	/// by asking the target about the equivalent non-strict opcode.
	///
	/// \param TLI    Target lowering info that is queried for the action.
	/// \param Opcode One of the ISD::STRICT_* opcodes listed in the switch below;
	///               anything else hits llvm_unreachable.
	/// \param VT     Value type the action is queried for.
	/// \returns TargetLowering::Legal if the non-strict equivalent is legal for
	///          \p VT, TargetLowering::Expand in every other case.
	static TargetLowering::LegalizeAction
	getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) {
	  // Map the strict pseudo-op onto its ordinary counterpart.
	  unsigned NonStrictOpc;
	  switch (Opcode) {
	  default:
	    llvm_unreachable("Unexpected FP pseudo-opcode");
	  case ISD::STRICT_FSQRT:
	    NonStrictOpc = ISD::FSQRT;
	    break;
	  case ISD::STRICT_FPOW:
	    NonStrictOpc = ISD::FPOW;
	    break;
	  case ISD::STRICT_FPOWI:
	    NonStrictOpc = ISD::FPOWI;
	    break;
	  case ISD::STRICT_FMA:
	    NonStrictOpc = ISD::FMA;
	    break;
	  case ISD::STRICT_FSIN:
	    NonStrictOpc = ISD::FSIN;
	    break;
	  case ISD::STRICT_FCOS:
	    NonStrictOpc = ISD::FCOS;
	    break;
	  case ISD::STRICT_FEXP:
	    NonStrictOpc = ISD::FEXP;
	    break;
	  case ISD::STRICT_FEXP2:
	    NonStrictOpc = ISD::FEXP2;
	    break;
	  case ISD::STRICT_FLOG:
	    NonStrictOpc = ISD::FLOG;
	    break;
	  case ISD::STRICT_FLOG10:
	    NonStrictOpc = ISD::FLOG10;
	    break;
	  case ISD::STRICT_FLOG2:
	    NonStrictOpc = ISD::FLOG2;
	    break;
	  case ISD::STRICT_FRINT:
	    NonStrictOpc = ISD::FRINT;
	    break;
	  case ISD::STRICT_FNEARBYINT:
	    NonStrictOpc = ISD::FNEARBYINT;
	    break;
	  }

	  // Custom and Promote are not handled for strict FP pseudo-ops yet, so
	  // everything that is not outright Legal is expanded for now.
	  if (TLI.getOperationAction(NonStrictOpc, VT) == TargetLowering::Legal)
	    return TargetLowering::Legal;
	  return TargetLowering::Expand;
	}

	/// Return a legal replacement for the given operation, with all legal operands.
	///
	/// The function works in two steps. First it looks up the LegalizeAction for
	/// \p Node; which value type that query uses depends on the opcode. Then,
	/// unless the opcode opted out of the generic path (LOAD, STORE, CALLSEQ_START
	/// and CALLSEQ_END clear SimpleFinishLegalizing), it acts on that action: the
	/// node is left alone, custom lowered, expanded, converted to a libcall or
	/// promoted. LOAD and STORE are forwarded to LegalizeLoadOps and
	/// LegalizeStoreOps; CALLSEQ_START and CALLSEQ_END are left untouched.
	void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
	  DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));

	  // Allow illegal target nodes and illegal registers.
	  if (Node->getOpcode() == ISD::TargetConstant ||
	      Node->getOpcode() == ISD::Register)
	    return;

	#ifndef NDEBUG
	  // Every result type must already be legal at this point.
	  for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
	    assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
	                TargetLowering::TypeLegal ||
	            TLI.isTypeLegal(Node->getValueType(i))) &&
	           "Unexpected illegal type!");

	  // The same holds for every operand, except target constants and registers.
	  for (const SDValue &Op : Node->op_values())
	    assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
	                TargetLowering::TypeLegal ||
	            TLI.isTypeLegal(Op.getValueType()) ||
	            Op.getOpcode() == ISD::TargetConstant ||
	            Op.getOpcode() == ISD::Register) &&
	           "Unexpected illegal type!");
	#endif

	  // Figure out the correct action; the way to query this varies by opcode
	  TargetLowering::LegalizeAction Action = TargetLowering::Legal;
	  bool SimpleFinishLegalizing = true;
	  switch (Node->getOpcode()) {
	  case ISD::INTRINSIC_W_CHAIN:
	  case ISD::INTRINSIC_WO_CHAIN:
	  case ISD::INTRINSIC_VOID:
	  case ISD::STACKSAVE:
	    Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
	    break;
	  case ISD::GET_DYNAMIC_AREA_OFFSET:
	    Action = TLI.getOperationAction(Node->getOpcode(),
	                                    Node->getValueType(0));
	    break;
	  case ISD::VAARG:
	    Action = TLI.getOperationAction(Node->getOpcode(),
	                                    Node->getValueType(0));
	    if (Action != TargetLowering::Promote)
	      Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
	    break;
	  case ISD::FP_TO_FP16:
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:
	  case ISD::EXTRACT_VECTOR_ELT:
	    // These are keyed on the type of the first operand, not the result.
	    Action = TLI.getOperationAction(Node->getOpcode(),
	                                    Node->getOperand(0).getValueType());
	    break;
	  case ISD::FP_ROUND_INREG:
	  case ISD::SIGN_EXTEND_INREG: {
	    // Keyed on the type carried by the VTSDNode operand.
	    EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
	    Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
	    break;
	  }
	  case ISD::ATOMIC_STORE:
	    Action = TLI.getOperationAction(Node->getOpcode(),
	                                    Node->getOperand(2).getValueType());
	    break;
	  case ISD::SELECT_CC:
	  case ISD::SETCC:
	  case ISD::BR_CC: {
	    // Only SELECT_CC, SETCC and BR_CC can reach this point, so these are the
	    // only operand layouts that need to be distinguished.
	    unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
	                         Node->getOpcode() == ISD::SETCC ? 2 : 1;
	    unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
	    MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType();
	    ISD::CondCode CCCode =
	        cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
	    // The condition code is checked first; only if it is legal does the
	    // action of the operation itself matter.
	    Action = TLI.getCondCodeAction(CCCode, OpVT);
	    if (Action == TargetLowering::Legal) {
	      if (Node->getOpcode() == ISD::SELECT_CC)
	        Action = TLI.getOperationAction(Node->getOpcode(),
	                                        Node->getValueType(0));
	      else
	        Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
	    }
	    break;
	  }
	  case ISD::LOAD:
	  case ISD::STORE:
	    // FIXME: Model these properly. LOAD and STORE are complicated, and
	    // STORE expects the unlegalized operand in some cases.
	    SimpleFinishLegalizing = false;
	    break;
	  case ISD::CALLSEQ_START:
	  case ISD::CALLSEQ_END:
	    // FIXME: This shouldn't be necessary. These nodes have special properties
	    // dealing with the recursive nature of legalization. Removing this
	    // special case should be done as part of making LegalizeDAG non-recursive.
	    SimpleFinishLegalizing = false;
	    break;
	  case ISD::EXTRACT_ELEMENT:
	  case ISD::FLT_ROUNDS_:
	  case ISD::MERGE_VALUES:
	  case ISD::EH_RETURN:
	  case ISD::FRAME_TO_ARGS_OFFSET:
	  case ISD::EH_DWARF_CFA:
	  case ISD::EH_SJLJ_SETJMP:
	  case ISD::EH_SJLJ_LONGJMP:
	  case ISD::EH_SJLJ_SETUP_DISPATCH:
	    // These operations lie about being legal: when they claim to be legal,
	    // they should actually be expanded.
	    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	    if (Action == TargetLowering::Legal)
	      Action = TargetLowering::Expand;
	    break;
	  case ISD::INIT_TRAMPOLINE:
	  case ISD::ADJUST_TRAMPOLINE:
	  case ISD::FRAMEADDR:
	  case ISD::RETURNADDR:
	  case ISD::ADDROFRETURNADDR:
	    // These operations lie about being legal: when they claim to be legal,
	    // they should actually be custom-lowered.
	    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	    if (Action == TargetLowering::Legal)
	      Action = TargetLowering::Custom;
	    break;
	  case ISD::READCYCLECOUNTER:
	    // READCYCLECOUNTER returns an i64, even if type legalization might have
	    // expanded that to several smaller types.
	    Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64);
	    break;
	  case ISD::READ_REGISTER:
	  case ISD::WRITE_REGISTER:
	    // Named register is legal in the DAG, but blocked by register name
	    // selection if not implemented by target (to choose the correct register)
	    // They'll be converted to Copy(To/From)Reg.
	    Action = TargetLowering::Legal;
	    break;
	  case ISD::DEBUGTRAP:
	    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	    if (Action == TargetLowering::Expand) {
	      // Replace ISD::DEBUGTRAP with ISD::TRAP and legalize the new node.
	      SDValue NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(),
	                                   Node->getOperand(0));
	      ReplaceNode(Node, NewVal.getNode());
	      LegalizeOp(NewVal.getNode());
	      return;
	    }
	    break;
	  case ISD::STRICT_FSQRT:
	  case ISD::STRICT_FMA:
	  case ISD::STRICT_FPOW:
	  case ISD::STRICT_FPOWI:
	  case ISD::STRICT_FSIN:
	  case ISD::STRICT_FCOS:
	  case ISD::STRICT_FEXP:
	  case ISD::STRICT_FEXP2:
	  case ISD::STRICT_FLOG:
	  case ISD::STRICT_FLOG10:
	  case ISD::STRICT_FLOG2:
	  case ISD::STRICT_FRINT:
	  case ISD::STRICT_FNEARBYINT:
	    // These pseudo-ops get legalized as if they were their non-strict
	    // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
	    // is also legal, but if ISD::FSQRT requires expansion then so does
	    // ISD::STRICT_FSQRT.
	    Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(),
	                                     Node->getValueType(0));
	    break;
	  default:
	    // Opcodes at or above BUILTIN_OP_END are treated as legal without asking
	    // the target.
	    if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
	      Action = TargetLowering::Legal;
	    } else {
	      Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	    }
	    break;
	  }

	  if (SimpleFinishLegalizing) {
	    SDNode *NewNode = Node;
	    switch (Node->getOpcode()) {
	    default: break;
	    case ISD::SHL:
	    case ISD::SRL:
	    case ISD::SRA:
	    case ISD::ROTL:
	    case ISD::ROTR: {
	      // Legalizing shifts/rotates requires adjusting the shift amount
	      // to the appropriate width.
	      SDValue Op0 = Node->getOperand(0);
	      SDValue Op1 = Node->getOperand(1);
	      if (!Op1.getValueType().isVector()) {
	        SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op1);
	        // The getShiftAmountOperand() may create a new operand node or
	        // return the existing one. If new operand is created we need
	        // to update the parent node.
	        // Do not try to legalize SAO here! It will be automatically legalized
	        // in the next round.
	        if (SAO != Op1)
	          NewNode = DAG.UpdateNodeOperands(Node, Op0, SAO);
	      }
	      break;
	    }
	    case ISD::SRL_PARTS:
	    case ISD::SRA_PARTS:
	    case ISD::SHL_PARTS: {
	      // Legalizing shifts/rotates requires adjusting the shift amount
	      // to the appropriate width.
	      SDValue Op0 = Node->getOperand(0);
	      SDValue Op1 = Node->getOperand(1);
	      SDValue Op2 = Node->getOperand(2);
	      if (!Op2.getValueType().isVector()) {
	        SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op2);
	        // The getShiftAmountOperand() may create a new operand node or
	        // return the existing one. If new operand is created we need
	        // to update the parent node.
	        if (SAO != Op2)
	          NewNode = DAG.UpdateNodeOperands(Node, Op0, Op1, SAO);
	      }
	      break;
	    }
	    }

	    // If updating the operands produced a different node, continue with it.
	    if (NewNode != Node) {
	      ReplaceNode(Node, NewNode);
	      Node = NewNode;
	    }
	    switch (Action) {
	    case TargetLowering::Legal:
	      DEBUG(dbgs() << "Legal node: nothing to do\n");
	      return;
	    case TargetLowering::Custom:
	      DEBUG(dbgs() << "Trying custom legalization\n");
	      // FIXME: The handling for custom lowering with multiple results is
	      // a complete mess.
	      if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
	        // The target handed back the very same value: nothing to replace.
	        if (Res.getNode() == Node && Res.getResNo() == 0)
	          return;

	        if (Node->getNumValues() == 1) {
	          DEBUG(dbgs() << "Successfully custom legalized node\n");
	          // We can just directly replace this node with the lowered value.
	          ReplaceNode(SDValue(Node, 0), Res);
	          return;
	        }

	        // Multiple results: take result i of the lowered node for value i.
	        SmallVector<SDValue, 8> ResultVals;
	        for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
	          ResultVals.push_back(Res.getValue(i));
	        DEBUG(dbgs() << "Successfully custom legalized node\n");
	        ReplaceNode(Node, ResultVals.data());
	        return;
	      }
	      DEBUG(dbgs() << "Could not custom legalize node\n");
	      LLVM_FALLTHROUGH;
	    case TargetLowering::Expand:
	      if (ExpandNode(Node))
	        return;
	      LLVM_FALLTHROUGH;
	    case TargetLowering::LibCall:
	      ConvertNodeToLibcall(Node);
	      return;
	    case TargetLowering::Promote:
	      PromoteNode(Node);
	      return;
	    }
	  }

	  // Only the opcodes that cleared SimpleFinishLegalizing get here.
	  switch (Node->getOpcode()) {
	  default:
	#ifndef NDEBUG
	    dbgs() << "NODE: ";
	    Node->dump( &DAG);
	    dbgs() << "\n";
	#endif
	    llvm_unreachable("Do not know how to legalize this operator!");

	  case ISD::CALLSEQ_START:
	  case ISD::CALLSEQ_END:
	    break;
	  case ISD::LOAD:
	    return LegalizeLoadOps(Node);
	  case ISD::STORE:
	    return LegalizeStoreOps(Node);
	  }
	}

	/// Expand an extract from a vector by going through memory: the vector
	/// (operand 0 of \p Op) is stored to a stack slot and the requested part,
	/// selected by the index in operand 1, is loaded back.
	///
	/// If the vector is already stored somewhere by a suitable existing store,
	/// that store is reused instead of creating a new stack temporary.
	///
	/// \returns the value of the newly created load.
	SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);
	  SDLoc dl(Op);

	  // Before we generate a new store to a temporary stack slot, see if there is
	  // already one that we can use. There often is because when we scalarize
	  // vector operations (using SelectionDAG::UnrollVectorOp for example) a whole
	  // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in
	  // the vector. If all are expanded here, we don't want one store per vector
	  // element.

	  // Caches for hasPredecessorHelper
	  SmallPtrSet<const SDNode *, 32> Visited;
	  SmallVector<const SDNode *, 16> Worklist;
	  Worklist.push_back(Idx.getNode());
	  SDValue StackPtr, Ch;
	  for (SDNode::use_iterator UI = Vec.getNode()->use_begin(),
	                            UE = Vec.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing the use iterator yields the user node.
	    SDNode *User = *UI;
	    if (StoreSDNode *ST = dyn_cast<StoreSDNode>(User)) {
	      // Only a plain store of the whole vector value is usable.
	      if (ST->isIndexed() || ST->isTruncatingStore() ||
	          ST->getValue() != Vec)
	        continue;

	      // Make sure that nothing else could have stored into the destination of
	      // this store.
	      if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode()))
	        continue;

	      // If the index is dependent on the store we will introduce a cycle when
	      // creating the load (the load uses the index, and by replacing the chain
	      // we will make the index dependent on the load). Also, the store might be
	      // dependent on the extractelement and introduce a cycle when creating
	      // the load.
	      if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
	          ST->hasPredecessor(Op.getNode()))
	        continue;

	      StackPtr = ST->getBasePtr();
	      Ch = SDValue(ST, 0);
	      break;
	    }
	  }

	  EVT VecVT = Vec.getValueType();

	  if (!Ch.getNode()) {
	    // No reusable store was found: store the value to a temporary stack slot,
	    // then LOAD the returned part.
	    StackPtr = DAG.CreateStackTemporary(VecVT);
	    Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
	                      MachinePointerInfo());
	  }

	  // Turn the base pointer into the address of the requested element.
	  StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);

	  SDValue NewLoad;

	  // A vector result is loaded as is; a scalar result is produced by an
	  // any-extending load from the vector's element type.
	  if (Op.getValueType().isVector())
	    NewLoad =
	        DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
	  else
	    NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
	                             MachinePointerInfo(),
	                             VecVT.getVectorElementType());

	  // Replace the chain going out of the store, by the one out of the load.
	  DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));

	  // We introduced a cycle though, so update the loads operands, making sure
	  // to use the original store's chain as an incoming chain.
	  SmallVector<SDValue, 6> NewLoadOperands(NewLoad->op_begin(),
	                                          NewLoad->op_end());
	  NewLoadOperands[0] = Ch;
	  NewLoad =
	      SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0);
	  return NewLoad;
	}

	/// Expand an insertion into a vector by going through memory: the whole
	/// vector (operand 0) is spilled to a fresh stack slot, the inserted part
	/// (operand 1) is stored over it at the position given by operand 2, and the
	/// combined vector is loaded back.
	///
	/// \returns the load of the updated vector.
	SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
	  assert(Op.getValueType().isVector() && "Non-vector insert subvector!");

	  SDLoc dl(Op);
	  SDValue BaseVec = Op.getOperand(0);
	  SDValue Inserted = Op.getOperand(1);
	  SDValue Position = Op.getOperand(2);

	  // Create the stack temporary that holds the vector while it is patched.
	  EVT BaseVT = BaseVec.getValueType();
	  SDValue Slot = DAG.CreateStackTemporary(BaseVT);
	  int SlotIndex = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();
	  MachinePointerInfo SlotInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotIndex);

	  // Step 1: spill the complete vector.
	  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, BaseVec, Slot, SlotInfo);

	  // Step 2: overwrite the selected part, chained after the full store.
	  SDValue PartAddr = TLI.getVectorElementPointer(DAG, Slot, BaseVT, Position);
	  Chain = DAG.getStore(Chain, dl, Inserted, PartAddr, MachinePointerInfo());

	  // Step 3: reload the whole slot as the result.
	  return DAG.getLoad(Op.getValueType(), dl, Chain, Slot, SlotInfo);
	}

	/// Build a vector through memory: every non-undef operand of \p Node is
	/// stored into a stack temporary of the result type, and the result is a
	/// load of that temporary as a whole vector.
	///
	/// This is the fallback for cases that cannot be handled efficiently.
	SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();

	  // Create the stack frame object.
	  SDValue FIPtr = DAG.CreateStackTemporary(VT);
	  int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
	  MachinePointerInfo PtrInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
	  EVT PtrVT = FIPtr.getValueType();

	  // Operand i is stored at byte offset i * (element size in bits / 8).
	  const unsigned EltBytes = EltVT.getSizeInBits() / 8;

	  // Emit one store per defined element; all of them hang off the entry node.
	  SmallVector<SDValue, 8> Stores;
	  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
	    SDValue Elt = Node->getOperand(i);
	    // Ignore undef elements.
	    if (Elt.isUndef())
	      continue;

	    const unsigned Offset = EltBytes * i;
	    SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, FIPtr,
	                               DAG.getConstant(Offset, dl, PtrVT));

	    // If the destination vector element type is narrower than the source
	    // element type, only store the bits necessary.
	    const bool NeedsTrunc = EltVT.bitsLT(Elt.getValueType().getScalarType());
	    SDValue St =
	        NeedsTrunc
	            ? DAG.getTruncStore(DAG.getEntryNode(), dl, Elt, Addr,
	                                PtrInfo.getWithOffset(Offset), EltVT)
	            : DAG.getStore(DAG.getEntryNode(), dl, Elt, Addr,
	                           PtrInfo.getWithOffset(Offset));
	    Stores.push_back(St);
	  }

	  // Join the stores with a token factor; if every element was undef there is
	  // nothing to wait for and the entry node serves as the chain.
	  SDValue StoreChain =
	      Stores.empty() ? DAG.getEntryNode()
	                     : DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

	  // Result is a load from the stack slot.
	  return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo);
	}

	/// Make the sign bit of a floating-point value available as an integer.
	///
	/// If an integer type of the same width as \p Value is legal, the value is
	/// simply bitcast. Otherwise the value is spilled to the stack and only the
	/// byte containing the sign bit is loaded back.
	///
	/// \param State Receives the integer value, the sign mask and sign bit index
	///              and, on the stack path, the chain, pointers and pointer infos
	///              describing the spilled value.
	/// \param DL    Debug location used for the created nodes.
	/// \param Value The floating-point value to inspect.
	void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
	                                             const SDLoc &DL,
	                                             SDValue Value) const {
	  EVT FloatVT = Value.getValueType();
	  const unsigned NumBits = FloatVT.getSizeInBits();
	  State.FloatVT = FloatVT;

	  // Preferred path: reinterpret the bits as an integer of the same size.
	  EVT SameSizeIntVT = EVT::getIntegerVT(*DAG.getContext(), NumBits);
	  if (TLI.isTypeLegal(SameSizeIntVT)) {
	    State.IntValue = DAG.getNode(ISD::BITCAST, DL, SameSizeIntVT, Value);
	    State.SignMask = APInt::getSignMask(NumBits);
	    State.SignBit = NumBits - 1;
	    return;
	  }

	  // Fallback: store the float to memory, then load the sign part out as an
	  // integer.
	  MVT LoadTy = TLI.getRegisterType(*DAG.getContext(), MVT::i8);
	  // The temporary is aligned for both the float store and the integer load.
	  SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy);
	  int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
	  MachineFunction &MF = DAG.getMachineFunction();

	  State.FloatPtr = StackPtr;
	  State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI);
	  State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr,
	                             State.FloatPointerInfo);

	  // Work out the address of the byte that holds the sign bit.
	  if (DAG.getDataLayout().isLittleEndian()) {
	    // Little endian: the sign bit is in the last byte, so advance the pointer.
	    unsigned ByteOffset = (NumBits / 8) - 1;
	    State.IntPtr =
	        DAG.getNode(ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
	                    DAG.getConstant(ByteOffset, DL, StackPtr.getValueType()));
	    State.IntPointerInfo =
	        MachinePointerInfo::getFixedStack(MF, FI, ByteOffset);
	  } else {
	    // Big endian: the first byte already carries the sign bit.
	    assert(FloatVT.isByteSized() && "Unsupported floating point type!");
	    State.IntPtr = StackPtr;
	    State.IntPointerInfo = State.FloatPointerInfo;
	  }

	  // Only one byte is loaded, so the sign is bit 7 of the loaded value.
	  State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain,
	                                  State.IntPtr, State.IntPointerInfo, MVT::i8);
	  State.SignMask = APInt::getOneBitSet(LoadTy.getSizeInBits(), 7);
	  State.SignBit = 7;
	}

	/// Turn a modified integer produced from a getSignAsIntValue() state back
	/// into a floating-point value of type State.FloatVT.
	///
	/// When \p State carries a chain, the float lives on the stack:
	/// \p NewIntValue is written over the byte at State.IntPtr and the float is
	/// reloaded. Without a chain, \p NewIntValue is bitcast to the float type.
	SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State,
	                                              const SDLoc &DL,
	                                              SDValue NewIntValue) const {
	  if (State.Chain) {
	    // Override the part containing the sign bit in the value stored on the
	    // stack, then read the complete float back.
	    SDValue PatchChain =
	        DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr,
	                          State.IntPointerInfo, MVT::i8);
	    return DAG.getLoad(State.FloatVT, DL, PatchChain, State.FloatPtr,
	                       State.FloatPointerInfo);
	  }

	  // No stack slot involved: the integer has the same bits as the float.
	  return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue);
	}

	/// Expand FCOPYSIGN(Mag, Sign), where Mag is operand 0 and Sign is operand 1
	/// of \p Node.
	///
	/// If FABS and FNEG are legal or custom for the magnitude type, the result is
	/// a select between FABS(Mag) and its negation, keyed on the sign bit of
	/// Sign. Otherwise the sign bit is moved into place with integer operations
	/// and merged into the magnitude's integer representation.
	SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const {
	  SDLoc DL(Node);
	  SDValue Mag = Node->getOperand(0);
	  SDValue Sign = Node->getOperand(1);

	  // Get sign bit into an integer value.
	  FloatSignAsInt SignAsInt;
	  getSignAsIntValue(SignAsInt, DL, Sign);

	  EVT IntVT = SignAsInt.IntValue.getValueType();
	  SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT);
	  SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue,
	                                SignMask);

	  // If FABS and FNEG are legal or custom, transform
	  // FCOPYSIGN(x, y) => sign(y) ? -FABS(x) : FABS(x)
	  EVT FloatVT = Mag.getValueType();
	  if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) &&
	      TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) {
	    SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag);
	    SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue);
	    SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit,
	                                DAG.getConstant(0, DL, IntVT), ISD::SETNE);
	    return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue);
	  }

	  // Transform Mag value to integer, and clear the sign bit.
	  FloatSignAsInt MagAsInt;
	  getSignAsIntValue(MagAsInt, DL, Mag);
	  EVT MagVT = MagAsInt.IntValue.getValueType();
	  SDValue ClearSignMask = DAG.getConstant(~MagAsInt.SignMask, DL, MagVT);
	  SDValue ClearedSign = DAG.getNode(ISD::AND, DL, MagVT, MagAsInt.IntValue,
	                                    ClearSignMask);

	  // Get the signbit at the right position for MagAsInt.
	  //
	  // The shift must happen whenever the two sign-bit positions differ, even
	  // when both integers have the same width: one side may come from a bitcast
	  // (sign in the top bit) and the other from the one-byte stack load (sign in
	  // bit 7). The shift is done in the wider of the two types so the bit is
	  // never shifted out: extend first when the sign integer is narrower,
	  // truncate last when it is wider.
	  int ShiftAmount = SignAsInt.SignBit - MagAsInt.SignBit;
	  const bool SignIsNarrower =
	      SignBit.getValueSizeInBits() < ClearedSign.getValueSizeInBits();
	  const bool SignIsWider =
	      SignBit.getValueSizeInBits() > ClearedSign.getValueSizeInBits();

	  EVT ShiftVT = IntVT;
	  if (SignIsNarrower) {
	    SignBit = DAG.getNode(ISD::ZERO_EXTEND, DL, MagVT, SignBit);
	    ShiftVT = MagVT;
	  }
	  if (ShiftAmount > 0) {
	    SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, ShiftVT);
	    SignBit = DAG.getNode(ISD::SRL, DL, ShiftVT, SignBit, ShiftCnst);
	  } else if (ShiftAmount < 0) {
	    SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, ShiftVT);
	    SignBit = DAG.getNode(ISD::SHL, DL, ShiftVT, SignBit, ShiftCnst);
	  }
	  if (SignIsWider)
	    SignBit = DAG.getNode(ISD::TRUNCATE, DL, MagVT, SignBit);

	  // Store the part with the modified sign and convert back to float.
	  SDValue CopiedSign = DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit);
	  return modifySignAsInt(MagAsInt, DL, CopiedSign);
	}

	/// Expand FABS of operand 0 of \p Node.
	///
	/// If FCOPYSIGN is legal or custom for the operand type, the result is
	/// FCOPYSIGN(x, 0.0). Otherwise the sign bit is cleared on the integer form
	/// of the value and the result is converted back to floating point.
	SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const {
	  SDLoc DL(Node);
	  SDValue Operand = Node->getOperand(0);
	  EVT FloatVT = Operand.getValueType();

	  // Transform FABS(x) => FCOPYSIGN(x, 0.0) if FCOPYSIGN is legal.
	  if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT))
	    return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Operand,
	                       DAG.getConstantFP(0.0, DL, FloatVT));

	  // Integer route: fetch the part holding the sign bit, mask the bit out and
	  // convert the result back.
	  FloatSignAsInt AsInt;
	  getSignAsIntValue(AsInt, DL, Operand);
	  EVT IntVT = AsInt.IntValue.getValueType();
	  SDValue KeepAllButSign = DAG.getConstant(~AsInt.SignMask, DL, IntVT);
	  SDValue Magnitude =
	      DAG.getNode(ISD::AND, DL, IntVT, AsInt.IntValue, KeepAllButSign);
	  return modifySignAsInt(AsInt, DL, Magnitude);
	}

	/// Expand a DYNAMIC_STACKALLOC node into explicit stack pointer arithmetic.
	///
	/// The operands of \p Node are read as (chain, size, alignment constant). The
	/// stack pointer register is read, the size is subtracted, the result is
	/// masked down if the requested alignment exceeds the stack alignment, and
	/// the new value is written back. The sequence is bracketed by
	/// CALLSEQ_START / CALLSEQ_END.
	///
	/// \param Node    The node to expand.
	/// \param Results Receives two values: the new stack pointer value and the
	///                CALLSEQ_END node that ends the sequence.
	void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
	                                       SmallVectorImpl<SDValue> &Results) {
	  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	  assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	         " not tell us which reg is the stack pointer!");
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);

	  // Chain the dynamic stack allocation so that it doesn't modify the stack
	  // pointer when other instructions are using the stack.
	  SDValue Chain = DAG.getCALLSEQ_START(Node->getOperand(0), 0, 0, dl);

	  // Read the current stack pointer.
	  SDValue AllocSize = Node->getOperand(1);
	  SDValue OldSP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	  Chain = OldSP.getValue(1);

	  const unsigned RequestedAlign =
	      cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue();
	  const unsigned StackAlign =
	      DAG.getSubtarget().getFrameLowering()->getStackAlignment();

	  // New stack pointer = old stack pointer - size, masked with -alignment when
	  // the request is stricter than the stack alignment.
	  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, OldSP, AllocSize);
	  if (RequestedAlign > StackAlign)
	    NewSP = DAG.getNode(ISD::AND, dl, VT, NewSP,
	                        DAG.getConstant(-(uint64_t)RequestedAlign, dl, VT));

	  // Write the new stack pointer back; this yields the output chain.
	  Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);

	  SDValue SeqEnd =
	      DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                         DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  Results.push_back(NewSP);
	  Results.push_back(SeqEnd);
	}

	/// Legalize a SETCC with given LHS and RHS and condition code CC on the current
	/// target.
	///
	/// If the SETCC has been legalized using AND / OR, then the legalized node
	/// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
	/// will be set to false.
	///
	/// If the SETCC has been legalized by using getSetCCSwappedOperands(),
	/// then the values of LHS and RHS will be swapped, CC will be set to the
	/// new condition, and NeedInvert will be set to false.
	///
	/// If the SETCC has been legalized using the inverse condcode, then LHS and
	/// RHS will be unchanged, CC will be set to the inverted condcode, and
	/// NeedInvert will be set to true. The caller must invert the result of the
	/// SETCC with SelectionDAG::getLogicalNOT() or take equivalent action to swap
	/// the effect of a true/false result.
	///
	/// \returns true if the SetCC has been legalized, false if it hasn't.
	bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
	                                                 SDValue &RHS, SDValue &CC,
	                                                 bool &NeedInvert,
	                                                 const SDLoc &dl) {
	  MVT OpVT = LHS.getSimpleValueType();
	  ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
	  NeedInvert = false;
	  switch (TLI.getCondCodeAction(CCCode, OpVT)) {
	  default: llvm_unreachable("Unknown condition code action!");
	  case TargetLowering::Legal:
	    // Nothing to do.
	    break;
	  case TargetLowering::Expand: {
	    // First choice: swap the operands if the swapped condition is legal.
	    ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
	    if (TLI.isCondCodeLegal(InvCC, OpVT)) {
	      std::swap(LHS, RHS);
	      CC = DAG.getCondCode(InvCC);
	      return true;
	    }
	    ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
	    unsigned Opc = 0;
	    switch (CCCode) {
	    default: llvm_unreachable("Don't know how to expand this condition!");
	    case ISD::SETO:
	      assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
	             == TargetLowering::Legal
	             && "If SETO is expanded, SETOEQ must be legal!");
	      CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break;
	    case ISD::SETUO:
	      assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
	             == TargetLowering::Legal
	             && "If SETUO is expanded, SETUNE must be legal!");
	      CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR; break;
	    case ISD::SETOEQ:
	    case ISD::SETOGT:
	    case ISD::SETOGE:
	    case ISD::SETOLT:
	    case ISD::SETOLE:
	    case ISD::SETONE:
	    case ISD::SETUEQ:
	    case ISD::SETUNE:
	    case ISD::SETUGT:
	    case ISD::SETUGE:
	    case ISD::SETULT:
	    case ISD::SETULE:
	      // If we are floating point, assign and break, otherwise fall through.
	      if (!OpVT.isInteger()) {
	        // We can use the 4th bit to tell if we are the unordered
	        // or ordered version of the opcode.
	        CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
	        Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND;
	        // Keep the low three bits of the code and set bit 0x10 to pick the
	        // comparison that is combined with the SETO / SETUO test.
	        CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
	        break;
	      }
	      // Fallthrough if we are unsigned integer.
	      LLVM_FALLTHROUGH;
	    case ISD::SETLE:
	    case ISD::SETGT:
	    case ISD::SETGE:
	    case ISD::SETLT:
	      // We only support using the inverted operation, which is computed above
	      // and not a different manner of supporting expanding these cases.
	      llvm_unreachable("Don't know how to expand this condition!");
	    case ISD::SETNE:
	    case ISD::SETEQ:
	      // Try inverting the result of the inverse condition.
	      InvCC = CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
	      if (TLI.isCondCodeLegal(InvCC, OpVT)) {
	        CC = DAG.getCondCode(InvCC);
	        NeedInvert = true;
	        return true;
	      }
	      // If inverting the condition didn't work then we have no means to expand
	      // the condition.
	      llvm_unreachable("Don't know how to expand this condition!");
	    }

	    SDValue SetCC1, SetCC2;
	    if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
	      // If we aren't the ordered or unordered operation,
	      // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
	      SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
	      SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
	    } else {
	      // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
	      SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1);
	      SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2);
	    }
	    // The combined node replaces LHS; RHS and CC are cleared to signal that
	    // the comparison has already been materialized.
	    LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
	    RHS = SDValue();
	    CC = SDValue();
	    return true;
	  }
	  }
	  return false;
	}

	/// Emit a store/load combination to the stack. This stores
	/// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
	/// a load from the stack slot to DestVT, extending it if needed.
	/// The resultant code need not be legal.
	SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
	                                               EVT DestVT, const SDLoc &dl) {
	  // Create the stack frame object, aligned for the source type.
	  unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
	      SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
	  SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);

	  FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
	  int SPFI = StackPtrFI->getIndex();
	  MachinePointerInfo PtrInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

	  unsigned SrcSize = SrcOp.getValueSizeInBits();
	  unsigned SlotSize = SlotVT.getSizeInBits();
	  unsigned DestSize = DestVT.getSizeInBits();
	  Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
	  unsigned DestAlign = DAG.getDataLayout().getPrefTypeAlignment(DestType);

	  // Emit a store to the stack slot. Use a truncstore if the input value is
	  // larger than SlotVT.
	  SDValue Store;

	  if (SrcSize > SlotSize)
	    Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo,
	                              SlotVT, SrcAlign);
	  else {
	    assert(SrcSize == SlotSize && "Invalid store");
	    Store =
	        DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
	  }

	  // Result is a load from the stack slot.
	  if (SlotSize == DestSize)
	    return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);

	  // The slot is narrower than the destination: use an extending load.
	  assert(SlotSize < DestSize && "Unknown extension!");
	  return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
	                        DestAlign);
	}

	/// Expand SCALAR_TO_VECTOR through memory: store the scalar operand into a
	/// stack slot sized for the result vector, then load the whole vector back.
	SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
	  SDLoc Loc(Node);
	  EVT VecVT = Node->getValueType(0);

	  // Create a vector sized/aligned stack slot, store the value to element #0,
	  // then load the whole vector back out.
	  SDValue Slot = DAG.CreateStackTemporary(VecVT);
	  int FrameIdx = cast<FrameIndexSDNode>(Slot)->getIndex();
	  MachinePointerInfo SlotInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);

	  // The store truncates the scalar to the vector's element type.
	  SDValue StoreChain =
	      DAG.getTruncStore(DAG.getEntryNode(), Loc, Node->getOperand(0), Slot,
	                        SlotInfo, VecVT.getVectorElementType());
	  return DAG.getLoad(VecVT, Loc, StoreChain, Slot, SlotInfo);
	}

	/// Try to expand the BUILD_VECTOR \p Node into a tree of vector shuffles.
	///
	/// \param Node the BUILD_VECTOR node to expand.
	/// \param DAG  the DAG in which new nodes are created.
	/// \param TLI  queried for the legality of each shuffle mask.
	/// \param Res  receives the final shuffle when the expansion succeeds.
	/// \returns true if every required shuffle mask is legal (and \p Res has been
	/// set), false otherwise; no nodes are created in the failing case.
	static bool
	ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
	                     const TargetLowering &TLI, SDValue &Res) {
	  unsigned NumElems = Node->getNumOperands();
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);

	  // Try to group the scalars into pairs, shuffle the pairs together, then
	  // shuffle the pairs of pairs together, etc. until the vector has
	  // been built. This will work only if all of the necessary shuffle masks
	  // are legal.

	  // We do this in two phases; first to check the legality of the shuffles,
	  // and next, assuming that all shuffles are legal, to create the new nodes.
	  for (int Phase = 0; Phase < 2; ++Phase) {
	    // Each entry pairs a partial vector with the list of final element
	    // positions that its leading lanes correspond to.
	    SmallVector<std::pair<SDValue, SmallVector<int, 16>>, 16> IntermedVals,
	                                                              NewIntermedVals;
	    for (unsigned i = 0; i < NumElems; ++i) {
	      SDValue V = Node->getOperand(i);
	      if (V.isUndef())
	        continue;

	      // Nodes are only materialized in the second phase; the first phase
	      // tracks indices with an empty SDValue placeholder.
	      SDValue Vec;
	      if (Phase)
	        Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, V);
	      IntermedVals.push_back(std::make_pair(Vec, SmallVector<int, 16>(1, i)));
	    }

	    while (IntermedVals.size() > 2) {
	      NewIntermedVals.clear();
	      for (unsigned i = 0, e = (IntermedVals.size() & ~1u); i < e; i += 2) {
	        // This vector and the next vector are shuffled together (simply to
	        // append the one to the other).
	        SmallVector<int, 16> ShuffleVec(NumElems, -1);

	        SmallVector<int, 16> FinalIndices;
	        FinalIndices.reserve(IntermedVals[i].second.size() +
	                             IntermedVals[i+1].second.size());

	        int k = 0;
	        for (unsigned j = 0, f = IntermedVals[i].second.size(); j != f;
	             ++j, ++k) {
	          ShuffleVec[k] = j;
	          FinalIndices.push_back(IntermedVals[i].second[j]);
	        }
	        for (unsigned j = 0, f = IntermedVals[i+1].second.size(); j != f;
	             ++j, ++k) {
	          ShuffleVec[k] = NumElems + j;
	          FinalIndices.push_back(IntermedVals[i+1].second[j]);
	        }

	        SDValue Shuffle;
	        if (Phase)
	          Shuffle = DAG.getVectorShuffle(VT, dl, IntermedVals[i].first,
	                                         IntermedVals[i+1].first,
	                                         ShuffleVec);
	        else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
	          return false;
	        NewIntermedVals.push_back(
	            std::make_pair(Shuffle, std::move(FinalIndices)));
	      }

	      // If we had an odd number of defined values, then append the last
	      // element to the array of new vectors.
	      if ((IntermedVals.size() & 1) != 0)
	        NewIntermedVals.push_back(IntermedVals.back());

	      IntermedVals.swap(NewIntermedVals);
	    }

	    assert(IntermedVals.size() <= 2 && IntermedVals.size() > 0 &&
	           "Invalid number of intermediate vectors");
	    const bool HasSecondVec = IntermedVals.size() > 1;
	    SDValue Vec1 = IntermedVals[0].first;
	    SDValue Vec2;
	    if (HasSecondVec)
	      Vec2 = IntermedVals[1].first;
	    else if (Phase)
	      Vec2 = DAG.getUNDEF(VT);

	    // Build the final mask that moves every lane to its destination index.
	    SmallVector<int, 16> ShuffleVec(NumElems, -1);
	    for (unsigned i = 0, e = IntermedVals[0].second.size(); i != e; ++i)
	      ShuffleVec[IntermedVals[0].second[i]] = i;
	    // Only touch the second entry when it exists; with a single intermediate
	    // vector, indexing IntermedVals[1] would read past the end.
	    if (HasSecondVec)
	      for (unsigned i = 0, e = IntermedVals[1].second.size(); i != e; ++i)
	        ShuffleVec[IntermedVals[1].second[i]] = NumElems + i;

	    if (Phase)
	      Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
	    else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
	      return false;
	  }

	  return true;
	}

	/// Expand a BUILD_VECTOR node on targets that don't
	/// support the operation, but do support the resultant vector type.
	///
	/// Strategies are tried in this order: all-undef -> UNDEF; only the low
	/// element defined -> SCALAR_TO_VECTOR; all constants -> constant-pool load;
	/// shuffle-based expansion when the target asks for it; otherwise the vector
	/// is built through the stack.
	SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
	  unsigned NumElems = Node->getNumOperands();
	  SDValue Value1, Value2;
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  EVT OpVT = Node->getOperand(0).getValueType();
	  EVT EltVT = VT.getVectorElementType();

	  // If the only non-undef value is the low element, turn this into a
	  // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X.
	  bool isOnlyLowElement = true;
	  bool MoreThanTwoValues = false;
	  bool isConstant = true;
	  for (unsigned i = 0; i < NumElems; ++i) {
	    SDValue V = Node->getOperand(i);
	    if (V.isUndef())
	      continue;
	    if (i > 0)
	      isOnlyLowElement = false;
	    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	      isConstant = false;

	    // Track up to two distinct defined values; note when a third appears.
	    if (!Value1.getNode()) {
	      Value1 = V;
	    } else if (!Value2.getNode()) {
	      if (V != Value1)
	        Value2 = V;
	    } else if (V != Value1 && V != Value2) {
	      MoreThanTwoValues = true;
	    }
	  }

	  // No defined operand at all.
	  if (!Value1.getNode())
	    return DAG.getUNDEF(VT);

	  if (isOnlyLowElement)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));

	  // If all elements are constants, create a load from the constant pool.
	  if (isConstant) {
	    SmallVector<Constant*, 16> CV;
	    for (unsigned i = 0, e = NumElems; i != e; ++i) {
	      if (ConstantFPSDNode *V =
	              dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
	        CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
	      } else if (ConstantSDNode *V =
	                     dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
	        if (OpVT==EltVT)
	          CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
	        else {
	          // If OpVT and EltVT don't match, EltVT is not legal and the
	          // element values have been promoted/truncated earlier. Undo this;
	          // we don't want a v16i8 to become a v16i32 for example.
	          const ConstantInt *CI = V->getConstantIntValue();
	          CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()),
	                                        CI->getZExtValue()));
	        }
	      } else {
	        assert(Node->getOperand(i).isUndef());
	        Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
	        CV.push_back(UndefValue::get(OpNTy));
	      }
	    }
	    Constant *CP = ConstantVector::get(CV);
	    SDValue CPIdx =
	        DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
	    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
	    return DAG.getLoad(
	        VT, dl, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	        Alignment);
	  }

	  // Count the distinct defined operands for the target's cost query.
	  SmallSet<SDValue, 16> DefinedValues;
	  for (unsigned i = 0; i < NumElems; ++i) {
	    if (Node->getOperand(i).isUndef())
	      continue;
	    DefinedValues.insert(Node->getOperand(i));
	  }

	  if (TLI.shouldExpandBuildVectorWithShuffles(VT, DefinedValues.size())) {
	    if (!MoreThanTwoValues) {
	      // At most two distinct values: each lane picks element 0 of the first
	      // input (Value1) or element 0 of the second input (Value2).
	      SmallVector<int, 8> ShuffleVec(NumElems, -1);
	      for (unsigned i = 0; i < NumElems; ++i) {
	        SDValue V = Node->getOperand(i);
	        if (V.isUndef())
	          continue;
	        ShuffleVec[i] = V == Value1 ? 0 : NumElems;
	      }
	      if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
	        // Get the splatted value into the low element of a vector register.
	        SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
	        SDValue Vec2;
	        if (Value2.getNode())
	          Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
	        else
	          Vec2 = DAG.getUNDEF(VT);

	        // Return shuffle(LowValVec, undef, <0,0,0,0>)
	        return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
	      }
	    } else {
	      SDValue Res;
	      if (ExpandBVWithShuffles(Node, DAG, TLI, Res))
	        return Res;
	    }
	  }

	  // Otherwise, we can't handle this case efficiently.
	  return ExpandVectorBuildThroughStack(Node);
	}

	// Expand a node into a call to a libcall. Every operand of the node is passed
	// as an argument, sign- or zero-extended according to isSigned, and the
	// call's return type is the node's first result type.
	//
	// Returns the call's result value, or the DAG root when the call was lowered
	// as a tail call (LowerCallTo returned no output chain).
	SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
	                                            bool isSigned) {
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (const SDValue &Op : Node->op_values()) {
	    EVT ArgVT = Op.getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Node = Op;
	    Entry.Ty = ArgTy;
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }
	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());

	  // By default, the input chain to this libcall is the entry node of the
	  // function. If the libcall is going to be emitted as a tail call then
	  // TLI.isUsedByReturnOnly will change it to the right chain if the return
	  // node which is being folded has a non-entry input chain.
	  SDValue InChain = DAG.getEntryNode();

	  // isTailCall may be true since the callee does not reference caller stack
	  // frame. Check if it's in the right position and that the return types match.
	  SDValue TCChain = InChain;
	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool isTailCall =
	      TLI.isInTailCallPosition(DAG, Node, TCChain) &&
	      (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
	  if (isTailCall)
	    InChain = TCChain;

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(SDLoc(Node))
	      .setChain(InChain)
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setTailCall(isTailCall)
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned)
	      .setIsPostTypeLegalization(true);

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  if (!CallInfo.second.getNode()) {
	    DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump());
	    // It's a tailcall, return the chain (which is the DAG root).
	    return DAG.getRoot();
	  }

	  DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump());
	  return CallInfo.first;
	}

	/// Generate a libcall taking the given operands as arguments
	/// and returning a result of type RetVT.
	///
	/// Each of the \p NumOps values in \p Ops is passed sign- or zero-extended
	/// according to \p isSigned. The call is chained to the entry node and only
	/// its result value is returned; the output chain is dropped.
	SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
	                                            const SDValue *Ops, unsigned NumOps,
	                                            bool isSigned, const SDLoc &dl) {
	  TargetLowering::ArgListTy Args;
	  Args.reserve(NumOps);

	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0; i != NumOps; ++i) {
	    Entry.Node = Ops[i];
	    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }
	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned)
	      .setIsPostTypeLegalization(true);

	  std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  return CallInfo.first;
	}

	// Expand a node into a call to a libcall. Similar to
	// ExpandLibCall except that the first operand is the in-chain: operand 0 is
	// used as the call's input chain and operands 1..N become the arguments.
	//
	// Returns the pair produced by LowerCallTo (result value, output chain).
	std::pair<SDValue, SDValue>
	SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
	                                         SDNode *Node,
	                                         bool isSigned) {
	  SDValue InChain = Node->getOperand(0);

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  // Start at 1 to skip the chain operand.
	  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Node->getOperand(i).getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Node = Node->getOperand(i);
	    Entry.Ty = ArgTy;
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }
	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(SDLoc(Node))
	      .setChain(InChain)
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  return CallInfo;
	}

	/// Expand \p Node into the libcall that matches its floating-point result
	/// type, chosen from the five candidates supplied by the caller.
	///
	/// A strict FP node is first rewritten through DAG.mutateStrictFPToFP().
	SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
	                                              RTLIB::Libcall Call_F32,
	                                              RTLIB::Libcall Call_F64,
	                                              RTLIB::Libcall Call_F80,
	                                              RTLIB::Libcall Call_F128,
	                                              RTLIB::Libcall Call_PPCF128) {
	  if (Node->isStrictFPOpcode())
	    Node = DAG.mutateStrictFPToFP(Node);

	  // Dispatch on the simple type of result #0.
	  const MVT ResultVT = Node->getSimpleValueType(0);
	  RTLIB::Libcall Chosen;
	  switch (ResultVT.SimpleTy) {
	  case MVT::f32:
	    Chosen = Call_F32;
	    break;
	  case MVT::f64:
	    Chosen = Call_F64;
	    break;
	  case MVT::f80:
	    Chosen = Call_F80;
	    break;
	  case MVT::f128:
	    Chosen = Call_F128;
	    break;
	  case MVT::ppcf128:
	    Chosen = Call_PPCF128;
	    break;
	  default:
	    llvm_unreachable("Unexpected request for libcall!");
	  }

	  // The call is always emitted with isSigned == false.
	  return ExpandLibCall(Chosen, Node, false);
	}

	/// Expand \p Node into the libcall that matches its integer result type,
	/// chosen from the five candidates supplied by the caller. \p isSigned is
	/// forwarded to ExpandLibCall unchanged.
	SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
	                                               RTLIB::Libcall Call_I8,
	                                               RTLIB::Libcall Call_I16,
	                                               RTLIB::Libcall Call_I32,
	                                               RTLIB::Libcall Call_I64,
	                                               RTLIB::Libcall Call_I128) {
	  // Dispatch on the simple type of result #0.
	  const MVT ResultVT = Node->getSimpleValueType(0);
	  RTLIB::Libcall Chosen;
	  switch (ResultVT.SimpleTy) {
	  case MVT::i8:
	    Chosen = Call_I8;
	    break;
	  case MVT::i16:
	    Chosen = Call_I16;
	    break;
	  case MVT::i32:
	    Chosen = Call_I32;
	    break;
	  case MVT::i64:
	    Chosen = Call_I64;
	    break;
	  case MVT::i128:
	    Chosen = Call_I128;
	    break;
	  default:
	    llvm_unreachable("Unexpected request for libcall!");
	  }

	  return ExpandLibCall(Chosen, Node, isSigned);
	}

	/// Issue libcalls to __{u}divmod to compute div / rem pairs.
	///
	/// The node's operands are passed as arguments, followed by the address of a
	/// stack temporary that receives the remainder. The quotient (the call's
	/// return value) and the remainder (loaded back from the temporary) are
	/// appended to \p Results in that order.
	void
	SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
	                                          SmallVectorImpl<SDValue> &Results) {
	  unsigned Opcode = Node->getOpcode();
	  bool isSigned = Opcode == ISD::SDIVREM;

	  RTLIB::Libcall LC;
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
	  case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
	  case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
	  case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
	  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
	  }

	  // The input chain to this libcall is the entry node of the function.
	  // Legalizing the call will automatically add the previous call to the
	  // dependence.
	  SDValue InChain = DAG.getEntryNode();

	  EVT RetVT = Node->getValueType(0);
	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (const SDValue &Op : Node->op_values()) {
	    EVT ArgVT = Op.getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Node = Op;
	    Entry.Ty = ArgTy;
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }

	  // Also pass the return address of the remainder.
	  SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
	  Entry.Node = FIPtr;
	  Entry.Ty = RetTy->getPointerTo();
	  Entry.IsSExt = isSigned;
	  Entry.IsZExt = !isSigned;
	  Args.push_back(Entry);

	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  SDLoc dl(Node);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  // Remainder is loaded back from the stack frame, chained after the call.
	  SDValue Rem =
	      DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
	  Results.push_back(CallInfo.first);
	  Results.push_back(Rem);
	}

	/// Report whether the target provides a name for the sincos libcall that
	/// matches the floating-point result type of \p Node.
	static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
	  const MVT ResultVT = Node->getSimpleValueType(0);
	  RTLIB::Libcall SinCosLC;
	  switch (ResultVT.SimpleTy) {
	  case MVT::f32:
	    SinCosLC = RTLIB::SINCOS_F32;
	    break;
	  case MVT::f64:
	    SinCosLC = RTLIB::SINCOS_F64;
	    break;
	  case MVT::f80:
	    SinCosLC = RTLIB::SINCOS_F80;
	    break;
	  case MVT::f128:
	    SinCosLC = RTLIB::SINCOS_F128;
	    break;
	  case MVT::ppcf128:
	    SinCosLC = RTLIB::SINCOS_PPCF128;
	    break;
	  default:
	    llvm_unreachable("Unexpected request for libcall!");
	  }

	  // A null name means the libcall is not available.
	  const char *LibcallName = TLI.getLibcallName(SinCosLC);
	  return LibcallName != nullptr;
	}

	/// Only issue sincos libcall if both sin and cos are needed.
	///
	/// \returns true if some other user of \p Node's operand computes the
	/// complementary function (FCOS for an FSIN node, FSIN otherwise) or is
	/// already an FSINCOS node.
	static bool useSinCos(SDNode *Node) {
	  unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
	    ? ISD::FCOS : ISD::FSIN;

	  SDValue Op0 = Node->getOperand(0);
	  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
	       UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
	    SDNode *User = *UI;
	    if (User == Node)
	      continue;
	    // The other user might have been turned into sincos already.
	    if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS)
	      return true;
	  }
	  return false;
	}

	/// Issue libcalls to sincos to compute sin / cos pairs.
	///
	/// The call receives the node's operand plus the addresses of two stack
	/// temporaries and returns void. The sin and cos values are loaded back from
	/// those temporaries and appended to \p Results in that order.
	void
	SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
	                                          SmallVectorImpl<SDValue> &Results) {
	  RTLIB::Libcall LC;
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case MVT::f32: LC = RTLIB::SINCOS_F32; break;
	  case MVT::f64: LC = RTLIB::SINCOS_F64; break;
	  case MVT::f80: LC = RTLIB::SINCOS_F80; break;
	  case MVT::f128: LC = RTLIB::SINCOS_F128; break;
	  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
	  }

	  // The input chain to this libcall is the entry node of the function.
	  // Legalizing the call will automatically add the previous call to the
	  // dependence.
	  SDValue InChain = DAG.getEntryNode();

	  EVT RetVT = Node->getValueType(0);
	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  // Pass the argument.
	  Entry.Node = Node->getOperand(0);
	  Entry.Ty = RetTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  // Pass the return address of sin.
	  SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
	  Entry.Node = SinPtr;
	  Entry.Ty = RetTy->getPointerTo();
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  // Also pass the return address of the cos.
	  SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
	  Entry.Node = CosPtr;
	  Entry.Ty = RetTy->getPointerTo();
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  SDLoc dl(Node);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
	      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
	      std::move(Args));

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  // Both loads are chained after the call so they observe its stores.
	  Results.push_back(
	      DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
	  Results.push_back(
	      DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
	}

	/// This function is responsible for legalizing a
	/// INT_TO_FP operation of the specified operand when the target requests that
	/// we expand it. At this point, we know that the result and operand types are
	/// legal for the target.
	///
	/// \param isSigned true for SINT_TO_FP, false for UINT_TO_FP.
	/// \param Op0      the integer value to convert.
	/// \param DestVT   the floating-point result type.
	/// \param dl       debug location used for every node created here.
	/// \returns the value computing the converted result.
	SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
	                                                   EVT DestVT,
	                                                   const SDLoc &dl) {
	  // TODO: Should any fast-math-flags be set for the created nodes?
	  DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
	  if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
	    DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double "
	                    "expansion\n");

	    // Get the stack frame index of a 8 byte buffer.
	    SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);

	    // word offset constant for Hi/Lo address computation
	    SDValue WordOff = DAG.getConstant(sizeof(int), dl,
	                                      StackSlot.getValueType());
	    // set up Hi and Lo (into buffer) address based on endian
	    SDValue Hi = StackSlot;
	    SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
	                             StackSlot, WordOff);
	    if (DAG.getDataLayout().isLittleEndian())
	      std::swap(Hi, Lo);

	    // if signed map to unsigned space
	    SDValue Op0Mapped;
	    if (isSigned) {
	      // constant used to invert sign bit (signed to unsigned mapping)
	      SDValue SignBit = DAG.getConstant(0x80000000u, dl, MVT::i32);
	      Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
	    } else {
	      Op0Mapped = Op0;
	    }
	    // store the lo of the constructed double - based on integer input
	    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op0Mapped, Lo,
	                                  MachinePointerInfo());
	    // initial hi portion of constructed double
	    SDValue InitialHi = DAG.getConstant(0x43300000u, dl, MVT::i32);
	    // store the hi of the constructed double - biased exponent
	    SDValue Store2 =
	        DAG.getStore(Store1, dl, InitialHi, Hi, MachinePointerInfo());
	    // load the constructed double
	    SDValue Load =
	        DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo());
	    // FP constant to bias correct the final result
	    SDValue Bias = DAG.getConstantFP(isSigned ?
	                                     BitsToDouble(0x4330000080000000ULL) :
	                                     BitsToDouble(0x4330000000000000ULL),
	                                     dl, MVT::f64);
	    // subtract the bias
	    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
	    // final result
	    SDValue Result;
	    // handle final rounding
	    if (DestVT == MVT::f64) {
	      // do nothing
	      Result = Sub;
	    } else if (DestVT.bitsLT(MVT::f64)) {
	      Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
	                           DAG.getIntPtrConstant(0, dl));
	    } else if (DestVT.bitsGT(MVT::f64)) {
	      Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
	    }
	    return Result;
	  }
	  assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
	  // Code below here assumes !isSigned without checking again.

	  // Implementation of unsigned i64 to f64 following the algorithm in
	  // __floatundidf in compiler_rt. This implementation has the advantage
	  // of performing rounding correctly, both in the default rounding mode
	  // and in all alternate rounding modes.
	  // TODO: Generalize this for use with other types.
	  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) {
	    DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
	    SDValue TwoP52 =
	        DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64);
	    SDValue TwoP84PlusTwoP52 =
	        DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl,
	                          MVT::f64);
	    SDValue TwoP84 =
	        DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64);

	    // Split the input into 32-bit halves and OR each half into the bit
	    // pattern of a double constant before reinterpreting it as f64.
	    SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
	    SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
	                             DAG.getConstant(32, dl, MVT::i64));
	    SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
	    SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
	    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr);
	    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr);
	    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
	                                TwoP84PlusTwoP52);
	    return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
	  }

	  // TODO: Generalize this for use with other types.
	  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) {
	    DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
	    // For unsigned conversions, convert them to signed conversions using the
	    // algorithm from the x86_64 __floatundidf in compiler_rt.
	    if (!isSigned) {
	      // Fast path: the value is used directly as a signed integer.
	      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);

	      // Slow path: halve the value (OR-ing the shifted-out low bit back in),
	      // convert as signed, then double the result.
	      SDValue ShiftConst = DAG.getConstant(
	          1, dl, TLI.getShiftAmountTy(Op0.getValueType(), DAG.getDataLayout()));
	      SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
	      SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
	      SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
	      SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr);

	      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or);
	      SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt);

	      // TODO: This really should be implemented using a branch rather than a
	      // select. We happen to get lucky and machinesink does the right
	      // thing most of the time. This would be a good candidate for a
	      // pseudo-op, or, even better, for whole-function isel.
	      SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
	          Op0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
	      return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast);
	    }

	    // Otherwise, implement the fully general conversion.

	    SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
	        DAG.getConstant(UINT64_C(0xfffffffffffff800), dl, MVT::i64));
	    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
	        DAG.getConstant(UINT64_C(0x800), dl, MVT::i64));
	    SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
	        DAG.getConstant(UINT64_C(0x7ff), dl, MVT::i64));
	    SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), And2,
	                              DAG.getConstant(UINT64_C(0), dl, MVT::i64),
	                              ISD::SETNE);
	    SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0);
	    SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0,
	                              DAG.getConstant(UINT64_C(0x0020000000000000), dl,
	                                              MVT::i64),
	                              ISD::SETUGE);
	    SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
	    EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout());

	    // Convert the two 32-bit halves separately in f64 and recombine as
	    // hi * 2^32 + lo before rounding to f32.
	    SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
	                             DAG.getConstant(32, dl, SHVT));
	    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
	    SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
	    SDValue TwoP32 =
	        DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), dl,
	                          MVT::f64);
	    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
	    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
	    SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
	    SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
	    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  // Generic fallback: convert as signed, then add a correction constant that
	  // is selected by the sign bit of the input.
	  SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);

	  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Op0.getValueType()),
	                                 Op0,
	                                 DAG.getConstant(0, dl, Op0.getValueType()),
	                                 ISD::SETLT);
	  SDValue Zero = DAG.getIntPtrConstant(0, dl),
	          Four = DAG.getIntPtrConstant(4, dl);
	  // Byte offset into the 8-byte constant: 4 when the sign bit is set, else 0.
	  SDValue CstOffset = DAG.getSelect(dl, Zero.getValueType(),
	                                    SignSet, Four, Zero);

	  // If the sign bit of the integer is set, the large number will be treated
	  // as a negative number. To counteract this, the dynamic code adds an
	  // offset depending on the data type.
	  uint64_t FF;
	  switch (Op0.getSimpleValueType().SimpleTy) {
	  default: llvm_unreachable("Unsupported integer type!");
	  case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float)
	  case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float)
	  case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float)
	  case MVT::i64: FF = 0x5F800000ULL; break; // 2^64 (as a float)
	  }
	  // On little-endian targets, move the fudge factor into the upper 32 bits
	  // of the 64-bit constant.
	  if (DAG.getDataLayout().isLittleEndian())
	    FF <<= 32;
	  Constant *FudgeFactor = ConstantInt::get(
	      Type::getInt64Ty(*DAG.getContext()), FF);

	  SDValue CPIdx =
	      DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout()));
	  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
	  CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
	  // The load address may be offset by 4, so cap the alignment at 4.
	  Alignment = std::min(Alignment, 4u);
	  SDValue FudgeInReg;
	  if (DestVT == MVT::f32)
	    FudgeInReg = DAG.getLoad(
	        MVT::f32, dl, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	        Alignment);
	  else {
	    SDValue Load = DAG.getExtLoad(
	        ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	        Alignment);
	    // Legalize the extending load now and pick up its value through the
	    // handle afterwards.
	    HandleSDNode Handle(Load);
	    LegalizeOp(Load.getNode());
	    FudgeInReg = Handle.getValue();
	  }

	  return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
	}

	/// Legalize a *INT_TO_FP node whose legalize action is "promote".
	///
	/// The operand \p LegalOp is widened to the first larger simple integer type
	/// for which the target reports a legal or custom SINT_TO_FP (or, for unsigned
	/// inputs only, UINT_TO_FP), and the conversion to \p DestVT is emitted on the
	/// widened value.
	///
	/// \param LegalOp  integer value to convert.
	/// \param DestVT   floating-point result type.
	/// \param isSigned true when the original node was SINT_TO_FP.
	/// \param dl       debug location attached to the new nodes.
	/// \returns the conversion node operating on the widened integer.
	SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT,
	                                                    bool isSigned,
	                                                    const SDLoc &dl) {
	  EVT WideTy = LegalOp.getValueType();
	  unsigned ConvOpc = 0;

	  // Step through successively larger simple value types until the target
	  // offers a usable conversion.
	  for (;;) {
	    WideTy = (MVT::SimpleValueType)(WideTy.getSimpleVT().SimpleTy+1);
	    assert(WideTy.isInteger() && "Ran out of possibilities!");

	    // A signed conversion from the wider type is always tried first.
	    if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, WideTy)) {
	      ConvOpc = ISD::SINT_TO_FP;
	      break;
	    }

	    // The unsigned conversion is only considered for unsigned inputs.
	    if (!isSigned && TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, WideTy)) {
	      ConvOpc = ISD::UINT_TO_FP;
	      break;
	    }

	    // Nothing usable at this width; move on to the next larger type.
	  }

	  // Widen the input (sign-extend for signed sources, zero-extend otherwise)
	  // and convert the widened value.
	  unsigned ExtOpc = isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	  SDValue Widened = DAG.getNode(ExtOpc, dl, WideTy, LegalOp);
	  return DAG.getNode(ConvOpc, dl, DestVT, Widened);
	}

	/// Legalize an FP_TO_*INT node whose legalize action is "promote".
	///
	/// The conversion is performed into the first larger simple integer type for
	/// which the target reports a legal or custom FP_TO_SINT (or, for unsigned
	/// results only, FP_TO_UINT); the wide result is then truncated to \p DestVT.
	///
	/// \param LegalOp  floating-point value to convert.
	/// \param DestVT   integer result type requested by the original node.
	/// \param isSigned true when the original node was FP_TO_SINT.
	/// \param dl       debug location attached to the new nodes.
	/// \returns a TRUNCATE of the wide conversion down to \p DestVT.
	SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT,
	                                                    bool isSigned,
	                                                    const SDLoc &dl) {
	  EVT WideTy = DestVT;
	  unsigned ConvOpc = 0;

	  // Step through successively larger simple value types until the target
	  // offers a usable conversion.
	  for (;;) {
	    WideTy = (MVT::SimpleValueType)(WideTy.getSimpleVT().SimpleTy+1);
	    assert(WideTy.isInteger() && "Ran out of possibilities!");

	    // A larger signed type can hold all unsigned values of the requested
	    // type, so FP_TO_SINT is acceptable for both signed and unsigned results.
	    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, WideTy)) {
	      ConvOpc = ISD::FP_TO_SINT;
	      break;
	    }

	    // FP_TO_UINT is only an option when the requested result is unsigned.
	    if (isSigned)
	      continue;
	    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, WideTy)) {
	      ConvOpc = ISD::FP_TO_UINT;
	      break;
	    }

	    // Nothing usable at this width; move on to the next larger type.
	  }

	  // Convert into the wide type, then narrow to the requested width.
	  SDValue WideResult = DAG.getNode(ConvOpc, dl, WideTy, LegalOp);
	  return DAG.getNode(ISD::TRUNCATE, dl, DestVT, WideResult);
	}

	/// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts.
	///
	/// When the scalar width is a power of two of at least 8 bits, the value is
	/// byte-swapped (if wider than one byte) and then the nibbles, bit pairs and
	/// single bits inside every byte are swapped using splatted byte masks.
	/// Otherwise every bit is moved to its mirrored position individually.
	///
	/// \param Op value whose bits are to be reversed.
	/// \param dl debug location attached to the new nodes.
	/// \returns a value of the same type as \p Op holding the reversed bits.
	SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
	  EVT VT = Op.getValueType();
	  EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	  unsigned Sz = VT.getScalarSizeInBits();

	  SDValue Tmp, Tmp2, Tmp3;

	  // If we can, perform BSWAP first and then the mask+swap the i4, then i2
	  // and finally the i1 pairs.
	  // TODO: We can easily support i4/i2 legal types if any target ever does.
	  if (Sz >= 8 && isPowerOf2_32(Sz)) {
	    // Create the masks - repeating the pattern every byte. The masks are
	    // built by splatting an 8-bit pattern instead of OR-ing in
	    // (pattern << J): shifting a 64-bit literal by 64 or more is undefined
	    // behaviour, which the old loop hit for scalar widths above 64 bits.
	    APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
	    APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
	    APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
	    APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
	    APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
	    APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));

	    // BSWAP if the type is wider than a single byte.
	    Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);

	    // Shift amounts are created in the shift-amount type (SHVT), matching
	    // the bit-by-bit fallback below.

	    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
	    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
	    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
	    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
	    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
	    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);

	    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
	    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
	    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
	    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
	    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
	    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);

	    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
	    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
	    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
	    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
	    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
	    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
	    return Tmp;
	  }

	  // Generic fallback: source bit I is moved to destination bit J = Sz-1-I,
	  // isolated with a single-bit mask and OR-ed into the accumulated result.
	  Tmp = DAG.getConstant(0, dl, VT);
	  for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
	    if (I < J)
	      Tmp2 =
	          DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT));
	    else
	      Tmp2 =
	          DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));

	    // Keep only the destination bit J of the shifted value.
	    APInt Shift(Sz, 1);
	    Shift <<= J;
	    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
	    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
	  }

	  return Tmp;
	}

	/// Open code a BSWAP of \p Op using shifts, masks and ORs.
	///
	/// Scalar element types i16, i32 and i64 are handled; any other element type
	/// reaches llvm_unreachable.
	///
	/// \param Op value whose bytes are to be swapped.
	/// \param dl debug location attached to the new nodes.
	/// \returns a value of the same type as \p Op with its bytes reversed.
	SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
	  EVT VT = Op.getValueType();
	  EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());

	  // Shift the original operand left / right by a constant bit count.
	  auto ShlBy = [&](unsigned Bits) {
	    return DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(Bits, dl, SHVT));
	  };
	  auto SrlBy = [&](unsigned Bits) {
	    return DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(Bits, dl, SHVT));
	  };
	  // Keep only the bits of V selected by Mask.
	  auto MaskWith = [&](SDValue V, uint64_t Mask) {
	    return DAG.getNode(ISD::AND, dl, VT, V, DAG.getConstant(Mask, dl, VT));
	  };
	  // Combine two partial results.
	  auto Combine = [&](SDValue A, SDValue B) {
	    return DAG.getNode(ISD::OR, dl, VT, A, B);
	  };

	  switch (VT.getSimpleVT().getScalarType().SimpleTy) {
	  default: llvm_unreachable("Unhandled Expand type in BSWAP!");
	  case MVT::i16: {
	    // Two bytes: the shifts alone place each byte, no masking needed.
	    SDValue Up = ShlBy(8);
	    SDValue Down = SrlBy(8);
	    return Combine(Down, Up);
	  }
	  case MVT::i32: {
	    // Byte 0 -> 3 and byte 3 -> 0 need no mask; the two middle bytes do.
	    SDValue B3 = ShlBy(24);
	    SDValue B2 = ShlBy(8);
	    SDValue B1 = SrlBy(8);
	    SDValue B0 = SrlBy(24);
	    B2 = MaskWith(B2, 0xFF0000);
	    B1 = MaskWith(B1, 0xFF00);
	    B3 = Combine(B3, B2);
	    B1 = Combine(B1, B0);
	    return Combine(B3, B1);
	  }
	  case MVT::i64: {
	    // Same scheme with eight bytes: the outermost shifts (by 56) need no
	    // mask, the six inner ones are masked to a single byte each.
	    SDValue B7 = ShlBy(56);
	    SDValue B6 = ShlBy(40);
	    SDValue B5 = ShlBy(24);
	    SDValue B4 = ShlBy(8);
	    SDValue B3 = SrlBy(8);
	    SDValue B2 = SrlBy(24);
	    SDValue B1 = SrlBy(40);
	    SDValue B0 = SrlBy(56);
	    B6 = MaskWith(B6, 255ULL<<48);
	    B5 = MaskWith(B5, 255ULL<<40);
	    B4 = MaskWith(B4, 255ULL<<32);
	    B3 = MaskWith(B3, 255ULL<<24);
	    B2 = MaskWith(B2, 255ULL<<16);
	    B1 = MaskWith(B1, 255ULL<<8);
	    // Pairwise OR reduction of the eight byte-positioned values.
	    B7 = Combine(B7, B6);
	    B5 = Combine(B5, B4);
	    B3 = Combine(B3, B2);
	    B1 = Combine(B1, B0);
	    B7 = Combine(B7, B5);
	    B3 = Combine(B3, B1);
	    return Combine(B7, B3);
	  }
	  }
	}

	/// Expand a bit-counting node into simpler operations.
	///
	/// Handles CTPOP, CTLZ, CTLZ_ZERO_UNDEF, CTTZ and CTTZ_ZERO_UNDEF; any other
	/// opcode reaches llvm_unreachable.
	///
	/// \param Opc the ISD opcode being expanded.
	/// \param Op  the operand of the bit-counting node.
	/// \param dl  debug location attached to the new nodes.
	/// \returns a value of the operand's type holding the expanded computation.
	SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
	                                             const SDLoc &dl) {
	  switch (Opc) {
	  default: llvm_unreachable("Cannot expand this yet!");
	  case ISD::CTPOP: {
	    EVT VT = Op.getValueType();
	    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	    unsigned Len = VT.getSizeInBits();

	    assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
	           "CTPOP not implemented for this type.");

	    // This is the "best" algorithm from
	    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel

	    // Constant of type VT with the given byte repeated across all Len bits.
	    auto SplatByte = [&](uint64_t Byte) {
	      return DAG.getConstant(APInt::getSplat(Len, APInt(8, Byte)), dl, VT);
	    };
	    // Logical right shift of V by a constant amount.
	    auto Srl = [&](SDValue V, unsigned Bits) {
	      return DAG.getNode(ISD::SRL, dl, VT, V, DAG.getConstant(Bits, dl, ShVT));
	    };

	    SDValue Mask55 = SplatByte(0x55);
	    SDValue Mask33 = SplatByte(0x33);
	    SDValue Mask0F = SplatByte(0x0F);
	    SDValue Mask01 = SplatByte(0x01);

	    // v = v - ((v >> 1) & 0x55555555...)
	    SDValue HighOfPair = DAG.getNode(ISD::AND, dl, VT, Srl(Op, 1), Mask55);
	    Op = DAG.getNode(ISD::SUB, dl, VT, Op, HighOfPair);

	    // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	    SDValue LowPairs = DAG.getNode(ISD::AND, dl, VT, Op, Mask33);
	    SDValue HighPairs = DAG.getNode(ISD::AND, dl, VT, Srl(Op, 2), Mask33);
	    Op = DAG.getNode(ISD::ADD, dl, VT, LowPairs, HighPairs);

	    // v = (v + (v >> 4)) & 0x0F0F0F0F...
	    SDValue NibbleSum = DAG.getNode(ISD::ADD, dl, VT, Op, Srl(Op, 4));
	    Op = DAG.getNode(ISD::AND, dl, VT, NibbleSum, Mask0F);

	    // v = (v * 0x01010101...) >> (Len - 8)
	    SDValue ByteSum = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01);
	    Op = Srl(ByteSum, Len - 8);

	    return Op;
	  }
	  case ISD::CTLZ_ZERO_UNDEF:
	    // This trivially expands to CTLZ.
	    return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
	  case ISD::CTLZ: {
	    EVT VT = Op.getValueType();
	    unsigned Len = VT.getSizeInBits();

	    // Prefer the zero-undef form when the target has it, selecting Len
	    // explicitly for a zero input.
	    if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
	      EVT SetCCVT = getSetCCResultType(VT);
	      SDValue Count = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
	      SDValue Zero = DAG.getConstant(0, dl, VT);
	      SDValue IsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
	      SDValue WhenZero = DAG.getConstant(Len, dl, VT);
	      return DAG.getNode(ISD::SELECT, dl, VT, IsZero, WhenZero, Count);
	    }

	    // for now, we do this:
	    // x = x | (x >> 1);
	    // x = x | (x >> 2);
	    // ...
	    // x = x | (x >>16);
	    // x = x | (x >>32); // for 64-bit input
	    // return popcount(~x);
	    //
	    // Ref: "Hacker's Delight" by Henry Warren
	    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	    for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) {
	      SDValue Amount = DAG.getConstant(1ULL << i, dl, ShVT);
	      SDValue Shifted = DAG.getNode(ISD::SRL, dl, VT, Op, Amount);
	      Op = DAG.getNode(ISD::OR, dl, VT, Op, Shifted);
	    }
	    Op = DAG.getNOT(dl, Op, VT);
	    return DAG.getNode(ISD::CTPOP, dl, VT, Op);
	  }
	  case ISD::CTTZ_ZERO_UNDEF:
	    // This trivially expands to CTTZ.
	    return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
	  case ISD::CTTZ: {
	    EVT VT = Op.getValueType();
	    unsigned Len = VT.getSizeInBits();

	    // Prefer the zero-undef form when the target has it, selecting Len
	    // explicitly for a zero input.
	    if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
	      EVT SetCCVT = getSetCCResultType(VT);
	      SDValue Count = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
	      SDValue Zero = DAG.getConstant(0, dl, VT);
	      SDValue IsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
	      SDValue WhenZero = DAG.getConstant(Len, dl, VT);
	      return DAG.getNode(ISD::SELECT, dl, VT, IsZero, WhenZero, Count);
	    }

	    // for now, we use: { return popcount(~x & (x - 1)); }
	    // unless the target has ctlz but not ctpop, in which case we use:
	    // { return 32 - nlz(~x & (x-1)); }
	    // Ref: "Hacker's Delight" by Henry Warren
	    SDValue NotOp = DAG.getNOT(dl, Op, VT);
	    SDValue OpMinusOne =
	        DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT));
	    SDValue TrailingMask = DAG.getNode(ISD::AND, dl, VT, NotOp, OpMinusOne);

	    // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
	    bool HasCTPOP = TLI.isOperationLegalOrCustom(ISD::CTPOP, VT);
	    if (!HasCTPOP && TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) {
	      SDValue Width = DAG.getConstant(VT.getSizeInBits(), dl, VT);
	      SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, TrailingMask);
	      return DAG.getNode(ISD::SUB, dl, VT, Width, LeadingZeros);
	    }
	    return DAG.getNode(ISD::CTPOP, dl, VT, TrailingMask);
	  }
	  }
	}

	bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
	DEBUG(dbgs() << "Trying to expand node\n");
	SmallVector<SDValue, 8> Results;
	SDLoc dl(Node);
	SDValue Tmp1, Tmp2, Tmp3, Tmp4;
	bool NeedInvert;
	switch (Node->getOpcode()) {
	case ISD::CTPOP:
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::BITREVERSE:
	Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl));
	break;
	case ISD::BSWAP:
	Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
	break;
	case ISD::FRAMEADDR:
	case ISD::RETURNADDR:
	case ISD::FRAME_TO_ARGS_OFFSET:
	Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
	break;
	case ISD::EH_DWARF_CFA: {
	SDValue CfaArg = DAG.getSExtOrTrunc(Node->getOperand(0), dl,
	TLI.getPointerTy(DAG.getDataLayout()));
	SDValue Offset = DAG.getNode(ISD::ADD, dl,
	CfaArg.getValueType(),
	DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl,
	CfaArg.getValueType()),
	CfaArg);
	SDValue FA = DAG.getNode(
	ISD::FRAMEADDR, dl, TLI.getPointerTy(DAG.getDataLayout()),
	DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout())));
	Results.push_back(DAG.getNode(ISD::ADD, dl, FA.getValueType(),
	FA, Offset));
	break;
	}
	case ISD::FLT_ROUNDS_:
	Results.push_back(DAG.getConstant(1, dl, Node->getValueType(0)));
	break;
	case ISD::EH_RETURN:
	case ISD::EH_LABEL:
	case ISD::PREFETCH:
	case ISD::VAEND:
	case ISD::EH_SJLJ_LONGJMP:
	// If the target didn't expand these, there's nothing to do, so just
	// preserve the chain and be done.
	Results.push_back(Node->getOperand(0));
	break;
	case ISD::READCYCLECOUNTER:
	// If the target didn't expand this, just return 'zero' and preserve the
	// chain.
	Results.append(Node->getNumValues() - 1,
	DAG.getConstant(0, dl, Node->getValueType(0)));
	Results.push_back(Node->getOperand(0));
	break;
	case ISD::EH_SJLJ_SETJMP:
	// If the target didn't expand this, just return 'zero' and preserve the
	// chain.
	Results.push_back(DAG.getConstant(0, dl, MVT::i32));
	Results.push_back(Node->getOperand(0));
	break;
	case ISD::ATOMIC_LOAD: {
	// There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
	SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0));
	SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
	SDValue Swap = DAG.getAtomicCmpSwap(
	ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
	Node->getOperand(0), Node->getOperand(1), Zero, Zero,
	cast<AtomicSDNode>(Node)->getMemOperand());
	Results.push_back(Swap.getValue(0));
	Results.push_back(Swap.getValue(1));
	break;
	}
	case ISD::ATOMIC_STORE: {
	// There is no libcall for atomic store; fake it with ATOMIC_SWAP.
	SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
	cast<AtomicSDNode>(Node)->getMemoryVT(),
	Node->getOperand(0),
	Node->getOperand(1), Node->getOperand(2),
	cast<AtomicSDNode>(Node)->getMemOperand());
	Results.push_back(Swap.getValue(1));
	break;
	}
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	// Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and
	// splits out the success value as a comparison. Expanding the resulting
	// ATOMIC_CMP_SWAP will produce a libcall.
	SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
	SDValue Res = DAG.getAtomicCmpSwap(
	ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
	Node->getOperand(0), Node->getOperand(1), Node->getOperand(2),
	Node->getOperand(3), cast<MemSDNode>(Node)->getMemOperand());

	SDValue ExtRes = Res;
	SDValue LHS = Res;
	SDValue RHS = Node->getOperand(1);

	EVT AtomicType = cast<AtomicSDNode>(Node)->getMemoryVT();
	EVT OuterType = Node->getValueType(0);
	switch (TLI.getExtendForAtomicOps()) {
	case ISD::SIGN_EXTEND:
	LHS = DAG.getNode(ISD::AssertSext, dl, OuterType, Res,
	DAG.getValueType(AtomicType));
	RHS = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OuterType,
	Node->getOperand(2), DAG.getValueType(AtomicType));
	ExtRes = LHS;
	break;
	case ISD::ZERO_EXTEND:
	LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
	DAG.getValueType(AtomicType));
	- RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
	+ RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
	ExtRes = LHS;
	break;
	case ISD::ANY_EXTEND:
	LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
	- RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
	+ RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
	break;
	default:
	llvm_unreachable("Invalid atomic op extension");
	}

	SDValue Success =
	DAG.getSetCC(dl, Node->getValueType(1), LHS, RHS, ISD::SETEQ);

	Results.push_back(ExtRes.getValue(0));
	Results.push_back(Success);
	Results.push_back(Res.getValue(1));
	break;
	}
	case ISD::DYNAMIC_STACKALLOC:
	ExpandDYNAMIC_STACKALLOC(Node, Results);
	break;
	case ISD::MERGE_VALUES:
	for (unsigned i = 0; i < Node->getNumValues(); i++)
	Results.push_back(Node->getOperand(i));
	break;
	case ISD::UNDEF: {
	EVT VT = Node->getValueType(0);
	if (VT.isInteger())
	Results.push_back(DAG.getConstant(0, dl, VT));
	else {
	assert(VT.isFloatingPoint() && "Unknown value type!");
	Results.push_back(DAG.getConstantFP(0, dl, VT));
	}
	break;
	}
	case ISD::FP_ROUND:
	case ISD::BITCAST:
	Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
	Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::FP_EXTEND:
	Tmp1 = EmitStackConvert(Node->getOperand(0),
	Node->getOperand(0).getValueType(),
	Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::SIGN_EXTEND_INREG: {
	EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
	EVT VT = Node->getValueType(0);

	// An in-register sign-extend of a boolean is a negation:
	// 'true' (1) sign-extended is -1.
	// 'false' (0) sign-extended is 0.
	// However, we must mask the high bits of the source operand because the
	// SIGN_EXTEND_INREG does not guarantee that the high bits are already zero.

	// TODO: Do this for vectors too?
	if (ExtraVT.getSizeInBits() == 1) {
	SDValue One = DAG.getConstant(1, dl, VT);
	SDValue And = DAG.getNode(ISD::AND, dl, VT, Node->getOperand(0), One);
	SDValue Zero = DAG.getConstant(0, dl, VT);
	SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, Zero, And);
	Results.push_back(Neg);
	break;
	}

	// NOTE: we could fall back on load/store here too for targets without
	// SRA. However, it is doubtful that any exist.
	EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	unsigned BitsDiff = VT.getScalarSizeInBits() -
	ExtraVT.getScalarSizeInBits();
	SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy);
	Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
	Node->getOperand(0), ShiftCst);
	Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::FP_ROUND_INREG: {
	// The only way we can lower this is to turn it into a TRUNCSTORE,
	// EXTLOAD pair, targeting a temporary location (a stack slot).

	// NOTE: there is a choice here between constantly creating new stack
	// slots and always reusing the same one. We currently always create
	// new ones, as reuse may inhibit scheduling.
	EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
	Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
	Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
	Node->getOperand(0), Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::FP_TO_SINT:
	if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
	Results.push_back(Tmp1);
	break;
	case ISD::FP_TO_UINT: {
	SDValue True, False;
	EVT VT = Node->getOperand(0).getValueType();
	EVT NVT = Node->getValueType(0);
	APFloat apf(DAG.EVTToAPFloatSemantics(VT),
	APInt::getNullValue(VT.getSizeInBits()));
	APInt x = APInt::getSignMask(NVT.getSizeInBits());
	(void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
	Tmp1 = DAG.getConstantFP(apf, dl, VT);
	Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
	Node->getOperand(0),
	Tmp1, ISD::SETLT);
	True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
	// TODO: Should any fast-math-flags be set for the FSUB?
	False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
	DAG.getNode(ISD::FSUB, dl, VT,
	Node->getOperand(0), Tmp1));
	False = DAG.getNode(ISD::XOR, dl, NVT, False,
	DAG.getConstant(x, dl, NVT));
	Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::VAARG:
	Results.push_back(DAG.expandVAArg(Node));
	Results.push_back(Results[0].getValue(1));
	break;
	case ISD::VACOPY:
	Results.push_back(DAG.expandVACopy(Node));
	break;
	case ISD::EXTRACT_VECTOR_ELT:
	if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
	// This must be an access of the only element. Return it.
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0),
	Node->getOperand(0));
	else
	Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
	Results.push_back(Tmp1);
	break;
	case ISD::EXTRACT_SUBVECTOR:
	Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
	break;
	case ISD::INSERT_SUBVECTOR:
	Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
	break;
	case ISD::CONCAT_VECTORS:
	Results.push_back(ExpandVectorBuildThroughStack(Node));
	break;
	case ISD::SCALAR_TO_VECTOR:
	Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
	break;
	case ISD::INSERT_VECTOR_ELT:
	Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
	Node->getOperand(1),
	Node->getOperand(2), dl));
	break;
	case ISD::VECTOR_SHUFFLE: {
	SmallVector<int, 32> NewMask;
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();

	EVT VT = Node->getValueType(0);
	EVT EltVT = VT.getVectorElementType();
	SDValue Op0 = Node->getOperand(0);
	SDValue Op1 = Node->getOperand(1);
	if (!TLI.isTypeLegal(EltVT)) {
	EVT NewEltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT);

	// BUILD_VECTOR operands are allowed to be wider than the element type.
	// But if NewEltVT is smaller that EltVT the BUILD_VECTOR does not accept
	// it.
	if (NewEltVT.bitsLT(EltVT)) {
	// Convert shuffle node.
	// If original node was v4i64 and the new EltVT is i32,
	// cast operands to v8i32 and re-build the mask.

	// Calculate new VT, the size of the new VT should be equal to original.
	EVT NewVT =
	EVT::getVectorVT(*DAG.getContext(), NewEltVT,
	VT.getSizeInBits() / NewEltVT.getSizeInBits());
	assert(NewVT.bitsEq(VT));

	// cast operands to new VT
	Op0 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op0);
	Op1 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op1);

	// Convert the shuffle mask
	unsigned int factor =
	NewVT.getVectorNumElements()/VT.getVectorNumElements();

	// EltVT gets smaller
	assert(factor > 0);

	for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	if (Mask[i] < 0) {
	for (unsigned fi = 0; fi < factor; ++fi)
	NewMask.push_back(Mask[i]);
	}
	else {
	for (unsigned fi = 0; fi < factor; ++fi)
	NewMask.push_back(Mask[i]*factor+fi);
	}
	}
	Mask = NewMask;
	VT = NewVT;
	}
	EltVT = NewEltVT;
	}
	unsigned NumElems = VT.getVectorNumElements();
	SmallVector<SDValue, 16> Ops;
	for (unsigned i = 0; i != NumElems; ++i) {
	if (Mask[i] < 0) {
	Ops.push_back(DAG.getUNDEF(EltVT));
	continue;
	}
	unsigned Idx = Mask[i];
	if (Idx < NumElems)
	Ops.push_back(DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
	DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
	else
	Ops.push_back(DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
	DAG.getConstant(Idx - NumElems, dl,
	TLI.getVectorIdxTy(DAG.getDataLayout()))));
	}

	Tmp1 = DAG.getBuildVector(VT, dl, Ops);
	// We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::EXTRACT_ELEMENT: {
	EVT OpTy = Node->getOperand(0).getValueType();
	if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
	// 1 -> Hi
	Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
	DAG.getConstant(OpTy.getSizeInBits() / 2, dl,
	TLI.getShiftAmountTy(
	Node->getOperand(0).getValueType(),
	DAG.getDataLayout())));
	Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
	} else {
	// 0 -> Lo
	Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
	Node->getOperand(0));
	}
	Results.push_back(Tmp1);
	break;
	}
	case ISD::STACKSAVE:
	// Expand to CopyFromReg if the target set
	// StackPointerRegisterToSaveRestore.
	if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
	Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
	Node->getValueType(0)));
	Results.push_back(Results[0].getValue(1));
	} else {
	Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
	Results.push_back(Node->getOperand(0));
	}
	break;
	case ISD::STACKRESTORE:
	// Expand to CopyToReg if the target set
	// StackPointerRegisterToSaveRestore.
	if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
	Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
	Node->getOperand(1)));
	} else {
	Results.push_back(Node->getOperand(0));
	}
	break;
	case ISD::GET_DYNAMIC_AREA_OFFSET:
	Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
	Results.push_back(Results[0].getValue(0));
	break;
	case ISD::FCOPYSIGN:
	Results.push_back(ExpandFCOPYSIGN(Node));
	break;
	case ISD::FNEG:
	// Expand Y = FNEG(X) -> Y = SUB -0.0, X
	Tmp1 = DAG.getConstantFP(-0.0, dl, Node->getValueType(0));
	// TODO: If FNEG has fast-math-flags, propagate them to the FSUB.
	Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1,
	Node->getOperand(0));
	Results.push_back(Tmp1);
	break;
	case ISD::FABS:
	Results.push_back(ExpandFABS(Node));
	break;
	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX: {
	// Expand Y = MAX(A, B) -> Y = (A > B) ? A : B
	ISD::CondCode Pred;
	switch (Node->getOpcode()) {
	default: llvm_unreachable("How did we get here?");
	case ISD::SMAX: Pred = ISD::SETGT; break;
	case ISD::SMIN: Pred = ISD::SETLT; break;
	case ISD::UMAX: Pred = ISD::SETUGT; break;
	case ISD::UMIN: Pred = ISD::SETULT; break;
	}
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp1, Tmp2, Pred);
	Results.push_back(Tmp1);
	break;
	}

	case ISD::FSIN:
	case ISD::FCOS: {
	EVT VT = Node->getValueType(0);
	// Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin /
	// fcos which share the same operand and both are used.
	if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) \|\|
	isSinCosLibcallAvailable(Node, TLI))
	&& useSinCos(Node)) {
	SDVTList VTs = DAG.getVTList(VT, VT);
	Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
	if (Node->getOpcode() == ISD::FCOS)
	Tmp1 = Tmp1.getValue(1);
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::FMAD:
	llvm_unreachable("Illegal fmad should never be formed");

	case ISD::FP16_TO_FP:
	if (Node->getValueType(0) != MVT::f32) {
	// We can extend to types bigger than f32 in two steps without changing
	// the result. Since "f16 -> f32" is much more commonly available, give
	// CodeGen the option of emitting that before resorting to a libcall.
	SDValue Res =
	DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0));
	Results.push_back(
	DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
	}
	break;
	case ISD::FP_TO_FP16:
	DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
	if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
	SDValue Op = Node->getOperand(0);
	MVT SVT = Op.getSimpleValueType();
	if ((SVT == MVT::f64 \|\| SVT == MVT::f80) &&
	TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
	// Under fastmath, we can expand this node into a fround followed by
	// a float-half conversion.
	SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(
	DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal));
	}
	}
	break;
	case ISD::ConstantFP: {
	ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
	// Check to see if this FP immediate is already legal.
	// If this is a legal constant, turn it into a TargetConstantFP node.
	if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0)))
	Results.push_back(ExpandConstantFP(CFP, true));
	break;
	}
	case ISD::Constant: {
	ConstantSDNode *CP = cast<ConstantSDNode>(Node);
	Results.push_back(ExpandConstant(CP));
	break;
	}
	case ISD::FSUB: {
	EVT VT = Node->getValueType(0);
	if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
	TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) {
	const SDNodeFlags Flags = Node->getFlags();
	Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
	Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags);
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::SUB: {
	EVT VT = Node->getValueType(0);
	assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
	TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
	"Don't know how to expand this subtraction!");
	Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
	DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
	VT));
	Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT));
	Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
	break;
	}
	case ISD::UREM:
	case ISD::SREM: {
	EVT VT = Node->getValueType(0);
	bool isSigned = Node->getOpcode() == ISD::SREM;
	unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
	unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
	Tmp2 = Node->getOperand(0);
	Tmp3 = Node->getOperand(1);
	if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
	SDVTList VTs = DAG.getVTList(VT, VT);
	Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
	Results.push_back(Tmp1);
	} else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
	// X % Y -> X-X/Y*Y
	Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
	Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
	Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::UDIV:
	case ISD::SDIV: {
	bool isSigned = Node->getOpcode() == ISD::SDIV;
	unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
	EVT VT = Node->getValueType(0);
	if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
	SDVTList VTs = DAG.getVTList(VT, VT);
	Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
	Node->getOperand(1));
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::MULHU:
	case ISD::MULHS: {
	unsigned ExpandOpcode =
	Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI : ISD::SMUL_LOHI;
	EVT VT = Node->getValueType(0);
	SDVTList VTs = DAG.getVTList(VT, VT);

	Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
	Node->getOperand(1));
	Results.push_back(Tmp1.getValue(1));
	break;
	}
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: {
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	MVT VT = LHS.getSimpleValueType();
	unsigned MULHOpcode =
	Node->getOpcode() == ISD::UMUL_LOHI ? ISD::MULHU : ISD::MULHS;

	if (TLI.isOperationLegalOrCustom(MULHOpcode, VT)) {
	Results.push_back(DAG.getNode(ISD::MUL, dl, VT, LHS, RHS));
	Results.push_back(DAG.getNode(MULHOpcode, dl, VT, LHS, RHS));
	break;
	}

	SmallVector<SDValue, 4> Halves;
	EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext());
	assert(TLI.isTypeLegal(HalfType));
	if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, Node, LHS, RHS, Halves,
	HalfType, DAG,
	TargetLowering::MulExpansionKind::Always)) {
	for (unsigned i = 0; i < 2; ++i) {
	SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]);
	SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]);
	SDValue Shift = DAG.getConstant(
	HalfType.getScalarSizeInBits(), dl,
	TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
	Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
	Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
	}
	break;
	}
	break;
	}
	case ISD::MUL: {
	EVT VT = Node->getValueType(0);
	SDVTList VTs = DAG.getVTList(VT, VT);
	// See if multiply or divide can be lowered using two-result operations.
	// We just need the low half of the multiply; try both the signed
	// and unsigned forms. If the target supports both SMUL_LOHI and
	// UMUL_LOHI, form a preference by checking which forms of plain
	// MULH it supports.
	bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
	bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
	bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
	bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
	unsigned OpToUse = 0;
	if (HasSMUL_LOHI && !HasMULHS) {
	OpToUse = ISD::SMUL_LOHI;
	} else if (HasUMUL_LOHI && !HasMULHU) {
	OpToUse = ISD::UMUL_LOHI;
	} else if (HasSMUL_LOHI) {
	OpToUse = ISD::SMUL_LOHI;
	} else if (HasUMUL_LOHI) {
	OpToUse = ISD::UMUL_LOHI;
	}
	if (OpToUse) {
	Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
	Node->getOperand(1)));
	break;
	}

	SDValue Lo, Hi;
	EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext());
	if (TLI.isOperationLegalOrCustom(ISD::ZERO_EXTEND, VT) &&
	TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) &&
	TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
	TLI.isOperationLegalOrCustom(ISD::OR, VT) &&
	TLI.expandMUL(Node, Lo, Hi, HalfType, DAG,
	TargetLowering::MulExpansionKind::OnlyLegalOrCustom)) {
	Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
	Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
	SDValue Shift =
	DAG.getConstant(HalfType.getSizeInBits(), dl,
	TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
	Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
	Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
	}
	break;
	}
	case ISD::SADDO:
	case ISD::SSUBO: {
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
	ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
	LHS, RHS);
	Results.push_back(Sum);
	EVT ResultType = Node->getValueType(1);
	EVT OType = getSetCCResultType(Node->getValueType(0));

	SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());

	// LHSSign -> LHS >= 0
	// RHSSign -> RHS >= 0
	// SumSign -> Sum >= 0
	//
	// Add:
	// Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
	// Sub:
	// Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
	SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
	SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
	SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
	Node->getOpcode() == ISD::SADDO ?
	ISD::SETEQ : ISD::SETNE);

	SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
	SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);

	SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
	Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType));
	break;
	}
	case ISD::UADDO:
	case ISD::USUBO: {
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
	ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
	LHS, RHS);
	Results.push_back(Sum);

	EVT ResultType = Node->getValueType(1);
	EVT SetCCType = getSetCCResultType(Node->getValueType(0));
	ISD::CondCode CC
	= Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
	SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);

	Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
	break;
	}
	case ISD::UMULO:
	case ISD::SMULO: {
	EVT VT = Node->getValueType(0);
	EVT WideVT = EVT::getIntegerVT(DAG.getContext(), VT.getSizeInBits() 2);
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	SDValue BottomHalf;
	SDValue TopHalf;
	static const unsigned Ops[2][3] =
	{ { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
	{ ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
	bool isSigned = Node->getOpcode() == ISD::SMULO;
	if (TLI.isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
	BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
	TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
	} else if (TLI.isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
	BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
	RHS);
	TopHalf = BottomHalf.getValue(1);
	} else if (TLI.isTypeLegal(WideVT)) {
	LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
	RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
	Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
	BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
	DAG.getIntPtrConstant(0, dl));
	TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
	DAG.getIntPtrConstant(1, dl));
	} else {
	// We can fall back to a libcall with an illegal type for the MUL if we
	// have a libcall big enough.
	// Also, we can fall back to a division in some cases, but that's a big
	// performance hit in the general case.
	RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
	if (WideVT == MVT::i16)
	LC = RTLIB::MUL_I16;
	else if (WideVT == MVT::i32)
	LC = RTLIB::MUL_I32;
	else if (WideVT == MVT::i64)
	LC = RTLIB::MUL_I64;
	else if (WideVT == MVT::i128)
	LC = RTLIB::MUL_I128;
	assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");

	SDValue HiLHS;
	SDValue HiRHS;
	if (isSigned) {
	// The high part is obtained by SRA'ing all but one of the bits of low
	// part.
	unsigned LoSize = VT.getSizeInBits();
	HiLHS =
	DAG.getNode(ISD::SRA, dl, VT, LHS,
	DAG.getConstant(LoSize - 1, dl,
	TLI.getPointerTy(DAG.getDataLayout())));
	HiRHS =
	DAG.getNode(ISD::SRA, dl, VT, RHS,
	DAG.getConstant(LoSize - 1, dl,
	TLI.getPointerTy(DAG.getDataLayout())));
	} else {
	HiLHS = DAG.getConstant(0, dl, VT);
	HiRHS = DAG.getConstant(0, dl, VT);
	}

	// Here we're passing the 2 arguments explicitly as 4 arguments that are
	// pre-lowered to the correct types. This all depends upon WideVT not
	// being a legal type for the architecture and thus has to be split to
	// two arguments.
	SDValue Ret;
	if(DAG.getDataLayout().isLittleEndian()) {
	// Halves of WideVT are packed into registers in different order
	// depending on platform endianness. This is usually handled by
	// the C calling convention, but we can't defer to it in
	// the legalizer.
	SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
	Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
	} else {
	SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
	Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
	}
	assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
	"Ret value is a collection of constituent nodes holding result.");
	BottomHalf = Ret.getOperand(0);
	TopHalf = Ret.getOperand(1);
	}

	if (isSigned) {
	Tmp1 = DAG.getConstant(
	VT.getSizeInBits() - 1, dl,
	TLI.getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
	Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
	TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, Tmp1,
	ISD::SETNE);
	} else {
	TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf,
	DAG.getConstant(0, dl, VT), ISD::SETNE);
	}

	// Truncate the result if SetCC returns a larger type than needed.
	EVT RType = Node->getValueType(1);
	if (RType.getSizeInBits() < TopHalf.getValueSizeInBits())
	TopHalf = DAG.getNode(ISD::TRUNCATE, dl, RType, TopHalf);

	assert(RType.getSizeInBits() == TopHalf.getValueSizeInBits() &&
	"Unexpected result type for S/UMULO legalization");

	Results.push_back(BottomHalf);
	Results.push_back(TopHalf);
	break;
	}
	case ISD::BUILD_PAIR: {
	EVT PairTy = Node->getValueType(0);
	Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
	Tmp2 = DAG.getNode(
	ISD::SHL, dl, PairTy, Tmp2,
	DAG.getConstant(PairTy.getSizeInBits() / 2, dl,
	TLI.getShiftAmountTy(PairTy, DAG.getDataLayout())));
	Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
	break;
	}
	case ISD::SELECT:
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	Tmp3 = Node->getOperand(2);
	if (Tmp1.getOpcode() == ISD::SETCC) {
	Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
	Tmp2, Tmp3,
	cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
	} else {
	Tmp1 = DAG.getSelectCC(dl, Tmp1,
	DAG.getConstant(0, dl, Tmp1.getValueType()),
	Tmp2, Tmp3, ISD::SETNE);
	}
	Results.push_back(Tmp1);
	break;
	case ISD::BR_JT: {
	SDValue Chain = Node->getOperand(0);
	SDValue Table = Node->getOperand(1);
	SDValue Index = Node->getOperand(2);

	const DataLayout &TD = DAG.getDataLayout();
	EVT PTy = TLI.getPointerTy(TD);

	unsigned EntrySize =
	DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);

	Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
	DAG.getConstant(EntrySize, dl, Index.getValueType()));
	SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
	Index, Table);

	EVT MemVT = EVT::getIntegerVT(DAG.getContext(), EntrySize 8);
	SDValue LD = DAG.getExtLoad(
	ISD::SEXTLOAD, dl, PTy, Chain, Addr,
	MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT);
	Addr = LD;
	if (TLI.isJumpTableRelative()) {
	// For PIC, the sequence is:
	// BRIND(load(Jumptable + index) + RelocBase)
	// RelocBase can be JumpTable, GOT or some sort of global base.
	Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
	TLI.getPICJumpTableRelocBase(Table, DAG));
	}
	Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::BRCOND:
	// Expand brcond's setcc into its constituent parts and create a BR_CC
	// Node.
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	if (Tmp2.getOpcode() == ISD::SETCC) {
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other,
	Tmp1, Tmp2.getOperand(2),
	Tmp2.getOperand(0), Tmp2.getOperand(1),
	Node->getOperand(2));
	} else {
	// We test only the i1 bit. Skip the AND if UNDEF or another AND.
	if (Tmp2.isUndef() \|\|
	(Tmp2.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(Tmp2.getOperand(1)) &&
	dyn_cast<ConstantSDNode>(Tmp2.getOperand(1))->getZExtValue() == 1))
	Tmp3 = Tmp2;
	else
	Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
	DAG.getConstant(1, dl, Tmp2.getValueType()));
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
	DAG.getCondCode(ISD::SETNE), Tmp3,
	DAG.getConstant(0, dl, Tmp3.getValueType()),
	Node->getOperand(2));
	}
	Results.push_back(Tmp1);
	break;
	case ISD::SETCC: {
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	Tmp3 = Node->getOperand(2);
	bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2,
	Tmp3, NeedInvert, dl);

	if (Legalized) {
	// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
	// condition code, create a new SETCC node.
	if (Tmp3.getNode())
	Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
	Tmp1, Tmp2, Tmp3);

	// If we expanded the SETCC by inverting the condition code, then wrap
	// the existing SETCC in a NOT to restore the intended condition.
	if (NeedInvert)
	Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0));

	Results.push_back(Tmp1);
	break;
	}

	// Otherwise, SETCC for the given comparison type must be completely
	// illegal; expand it into a SELECT_CC.
	EVT VT = Node->getValueType(0);
	int TrueValue;
	switch (TLI.getBooleanContents(Tmp1->getValueType(0))) {
	case TargetLowering::ZeroOrOneBooleanContent:
	case TargetLowering::UndefinedBooleanContent:
	TrueValue = 1;
	break;
	case TargetLowering::ZeroOrNegativeOneBooleanContent:
	TrueValue = -1;
	break;
	}
	Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
	DAG.getConstant(TrueValue, dl, VT),
	DAG.getConstant(0, dl, VT),
	Tmp3);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::SELECT_CC: {
	Tmp1 = Node->getOperand(0); // LHS
	Tmp2 = Node->getOperand(1); // RHS
	Tmp3 = Node->getOperand(2); // True
	Tmp4 = Node->getOperand(3); // False
	EVT VT = Node->getValueType(0);
	SDValue CC = Node->getOperand(4);
	ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();

	if (TLI.isCondCodeLegal(CCOp, Tmp1.getSimpleValueType())) {
	// If the condition code is legal, then we need to expand this
	// node using SETCC and SELECT.
	EVT CmpVT = Tmp1.getValueType();
	assert(!TLI.isOperationExpand(ISD::SELECT, VT) &&
	"Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be "
	"expanded.");
	EVT CCVT =
	TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
	SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC);
	Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
	break;
	}

	// SELECT_CC is legal, so the condition code must not be.
	bool Legalized = false;
	// Try to legalize by inverting the condition. This is for targets that
	// might support an ordered version of a condition, but not the unordered
	// version (or vice versa).
	ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp,
	Tmp1.getValueType().isInteger());
	if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
	// Use the new condition code and swap true and false
	Legalized = true;
	Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
	} else {
	// If The inverse is not legal, then try to swap the arguments using
	// the inverse condition code.
	ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
	if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
	// The swapped inverse condition is legal, so swap true and false,
	// lhs and rhs.
	Legalized = true;
	Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
	}
	}

	if (!Legalized) {
	Legalized = LegalizeSetCCCondCode(
	getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert,
	dl);

	assert(Legalized && "Can't legalize SELECT_CC with legal condition!");

	// If we expanded the SETCC by inverting the condition code, then swap
	// the True/False operands to match.
	if (NeedInvert)
	std::swap(Tmp3, Tmp4);

	// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
	// condition code, create a new SELECT_CC node.
	if (CC.getNode()) {
	Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
	Tmp1, Tmp2, Tmp3, Tmp4, CC);
	} else {
	Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType());
	CC = DAG.getCondCode(ISD::SETNE);
	Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
	Tmp2, Tmp3, Tmp4, CC);
	}
	}
	Results.push_back(Tmp1);
	break;
	}
	case ISD::BR_CC: {
	Tmp1 = Node->getOperand(0); // Chain
	Tmp2 = Node->getOperand(2); // LHS
	Tmp3 = Node->getOperand(3); // RHS
	Tmp4 = Node->getOperand(1); // CC

	bool Legalized = LegalizeSetCCCondCode(getSetCCResultType(
	Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl);
	(void)Legalized;
	assert(Legalized && "Can't legalize BR_CC with legal condition!");

	// If we expanded the SETCC by inverting the condition code, then wrap
	// the existing SETCC in a NOT to restore the intended condition.
	if (NeedInvert)
	Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0));

	// If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
	// node.
	if (Tmp4.getNode()) {
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
	Tmp4, Tmp2, Tmp3, Node->getOperand(4));
	} else {
	Tmp3 = DAG.getConstant(0, dl, Tmp2.getValueType());
	Tmp4 = DAG.getCondCode(ISD::SETNE);
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4,
	Tmp2, Tmp3, Node->getOperand(4));
	}
	Results.push_back(Tmp1);
	break;
	}
	case ISD::BUILD_VECTOR:
	Results.push_back(ExpandBUILD_VECTOR(Node));
	break;
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: {
	// Scalarize vector SRA/SRL/SHL.
	EVT VT = Node->getValueType(0);
	assert(VT.isVector() && "Unable to legalize non-vector shift");
	assert(TLI.isTypeLegal(VT.getScalarType())&& "Element type must be legal");
	unsigned NumElem = VT.getVectorNumElements();

	SmallVector<SDValue, 8> Scalars;
	for (unsigned Idx = 0; Idx < NumElem; Idx++) {
	SDValue Ex = DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(0),
	DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
	SDValue Sh = DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(1),
	DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
	Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
	VT.getScalarType(), Ex, Sh));
	}

	SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);
	ReplaceNode(SDValue(Node, 0), Result);
	break;
	}
	case ISD::GLOBAL_OFFSET_TABLE:
	case ISD::GlobalAddress:
	case ISD::GlobalTLSAddress:
	case ISD::ExternalSymbol:
	case ISD::ConstantPool:
	case ISD::JumpTable:
	case ISD::INTRINSIC_W_CHAIN:
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_VOID:
	// FIXME: Custom lowering for these operations shouldn't return null!
	break;
	}

	// Replace the original node with the legalized result.
	if (Results.empty()) {
	DEBUG(dbgs() << "Cannot expand node\n");
	return false;
	}

	DEBUG(dbgs() << "Succesfully expanded node\n");
	ReplaceNode(Node, Results.data());
	return true;
	}

	/// Try to replace \p Node with a call into a runtime library routine.
	///
	/// Each opcode handled below appends its replacement value(s) to a local
	/// list. If the list ends up non-empty, \p Node is replaced with those
	/// values through ReplaceNode; otherwise \p Node is left as it is and only a
	/// debug message is printed.
	void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
	  DEBUG(dbgs() << "Trying to convert node to libcall\n");
	  SmallVector<SDValue, 8> Replacements;
	  SDLoc dl(Node);
	  const unsigned Opcode = Node->getOpcode();

	  // Emits a C-calling-convention call to the external symbol `Symbol`, taking
	  // no arguments and returning void, chained after Node's operand 0. The
	  // second member of the LowerCallTo result is recorded as the replacement.
	  auto PushVoidCall = [&](const char *Symbol) {
	    TargetLowering::ArgListTy NoArgs;
	    TargetLowering::CallLoweringInfo CallInfo(DAG);
	    CallInfo.setDebugLoc(dl)
	        .setChain(Node->getOperand(0))
	        .setLibCallee(
	            CallingConv::C, Type::getVoidTy(*DAG.getContext()),
	            DAG.getExternalSymbol(Symbol,
	                                  TLI.getPointerTy(DAG.getDataLayout())),
	            std::move(NoArgs));
	    std::pair<SDValue, SDValue> Lowered = TLI.LowerCallTo(CallInfo);
	    Replacements.push_back(Lowered.second);
	  };

	  // Records the result of ExpandFPLibCall for Node, given one candidate
	  // libcall per floating-point format.
	  auto PushFPLibCall = [&](RTLIB::Libcall CallF32, RTLIB::Libcall CallF64,
	                           RTLIB::Libcall CallF80, RTLIB::Libcall CallF128,
	                           RTLIB::Libcall CallPPCF128) {
	    Replacements.push_back(ExpandFPLibCall(Node, CallF32, CallF64, CallF80,
	                                           CallF128, CallPPCF128));
	  };

	  // Records the result of ExpandIntLibCall for Node, given the signedness
	  // and one candidate libcall per integer width.
	  auto PushIntLibCall = [&](bool IsSigned, RTLIB::Libcall CallI8,
	                            RTLIB::Libcall CallI16, RTLIB::Libcall CallI32,
	                            RTLIB::Libcall CallI64, RTLIB::Libcall CallI128) {
	    Replacements.push_back(ExpandIntLibCall(Node, IsSigned, CallI8, CallI16,
	                                            CallI32, CallI64, CallI128));
	  };

	  switch (Opcode) {
	  case ISD::ATOMIC_FENCE:
	    // The target did not lower the fence itself, so fall back to calling
	    // '__sync_synchronize()'.
	    // FIXME: handle "fence singlethread" more efficiently.
	    PushVoidCall("__sync_synchronize");
	    break;

	  // Atomic intrinsics are marked Legal and lowered by default. A target
	  // without direct support may instead mark them Expand to request libcalls,
	  // which is how they arrive here.
	  case ISD::ATOMIC_SWAP:
	  case ISD::ATOMIC_LOAD_ADD:
	  case ISD::ATOMIC_LOAD_SUB:
	  case ISD::ATOMIC_LOAD_AND:
	  case ISD::ATOMIC_LOAD_OR:
	  case ISD::ATOMIC_LOAD_XOR:
	  case ISD::ATOMIC_LOAD_NAND:
	  case ISD::ATOMIC_LOAD_MIN:
	  case ISD::ATOMIC_LOAD_MAX:
	  case ISD::ATOMIC_LOAD_UMIN:
	  case ISD::ATOMIC_LOAD_UMAX:
	  case ISD::ATOMIC_CMP_SWAP: {
	    // The libcall is selected from the opcode and the memory value type.
	    MVT MemVT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
	    RTLIB::Libcall SyncCall = RTLIB::getSYNC(Opcode, MemVT);
	    assert(SyncCall != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!");

	    // Both members of the returned pair become replacement values.
	    std::pair<SDValue, SDValue> Expanded =
	        ExpandChainLibCall(SyncCall, Node, false);
	    Replacements.push_back(Expanded.first);
	    Replacements.push_back(Expanded.second);
	    break;
	  }

	  case ISD::TRAP:
	    // An unsupported trap becomes a call to 'abort()'.
	    PushVoidCall("abort");
	    break;

	  // Floating-point operations: one libcall per FP format.
	  case ISD::FMINNUM:
	    PushFPLibCall(RTLIB::FMIN_F32, RTLIB::FMIN_F64, RTLIB::FMIN_F80,
	                  RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128);
	    break;
	  case ISD::FMAXNUM:
	    PushFPLibCall(RTLIB::FMAX_F32, RTLIB::FMAX_F64, RTLIB::FMAX_F80,
	                  RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128);
	    break;
	  case ISD::FSQRT:
	  case ISD::STRICT_FSQRT:
	    PushFPLibCall(RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80,
	                  RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128);
	    break;
	  case ISD::FSIN:
	  case ISD::STRICT_FSIN:
	    PushFPLibCall(RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80,
	                  RTLIB::SIN_F128, RTLIB::SIN_PPCF128);
	    break;
	  case ISD::FCOS:
	  case ISD::STRICT_FCOS:
	    PushFPLibCall(RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80,
	                  RTLIB::COS_F128, RTLIB::COS_PPCF128);
	    break;
	  case ISD::FSINCOS:
	    // The sincos helper appends its results to the list directly.
	    ExpandSinCosLibCall(Node, Replacements);
	    break;
	  case ISD::FLOG:
	  case ISD::STRICT_FLOG:
	    PushFPLibCall(RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80,
	                  RTLIB::LOG_F128, RTLIB::LOG_PPCF128);
	    break;
	  case ISD::FLOG2:
	  case ISD::STRICT_FLOG2:
	    PushFPLibCall(RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80,
	                  RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128);
	    break;
	  case ISD::FLOG10:
	  case ISD::STRICT_FLOG10:
	    PushFPLibCall(RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80,
	                  RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128);
	    break;
	  case ISD::FEXP:
	  case ISD::STRICT_FEXP:
	    PushFPLibCall(RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80,
	                  RTLIB::EXP_F128, RTLIB::EXP_PPCF128);
	    break;
	  case ISD::FEXP2:
	  case ISD::STRICT_FEXP2:
	    PushFPLibCall(RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80,
	                  RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128);
	    break;
	  case ISD::FTRUNC:
	    PushFPLibCall(RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, RTLIB::TRUNC_F80,
	                  RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128);
	    break;
	  case ISD::FFLOOR:
	    PushFPLibCall(RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80,
	                  RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128);
	    break;
	  case ISD::FCEIL:
	    PushFPLibCall(RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80,
	                  RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128);
	    break;
	  case ISD::FRINT:
	  case ISD::STRICT_FRINT:
	    PushFPLibCall(RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80,
	                  RTLIB::RINT_F128, RTLIB::RINT_PPCF128);
	    break;
	  case ISD::FNEARBYINT:
	  case ISD::STRICT_FNEARBYINT:
	    PushFPLibCall(RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64,
	                  RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128,
	                  RTLIB::NEARBYINT_PPCF128);
	    break;
	  case ISD::FROUND:
	    PushFPLibCall(RTLIB::ROUND_F32, RTLIB::ROUND_F64, RTLIB::ROUND_F80,
	                  RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128);
	    break;
	  case ISD::FPOWI:
	  case ISD::STRICT_FPOWI:
	    PushFPLibCall(RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80,
	                  RTLIB::POWI_F128, RTLIB::POWI_PPCF128);
	    break;
	  case ISD::FPOW:
	  case ISD::STRICT_FPOW:
	    PushFPLibCall(RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80,
	                  RTLIB::POW_F128, RTLIB::POW_PPCF128);
	    break;
	  case ISD::FDIV:
	    PushFPLibCall(RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80,
	                  RTLIB::DIV_F128, RTLIB::DIV_PPCF128);
	    break;
	  case ISD::FREM:
	    PushFPLibCall(RTLIB::REM_F32, RTLIB::REM_F64, RTLIB::REM_F80,
	                  RTLIB::REM_F128, RTLIB::REM_PPCF128);
	    break;
	  case ISD::FMA:
	  case ISD::STRICT_FMA:
	    PushFPLibCall(RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80,
	                  RTLIB::FMA_F128, RTLIB::FMA_PPCF128);
	    break;
	  case ISD::FADD:
	    PushFPLibCall(RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80,
	                  RTLIB::ADD_F128, RTLIB::ADD_PPCF128);
	    break;
	  case ISD::FMUL:
	    PushFPLibCall(RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80,
	                  RTLIB::MUL_F128, RTLIB::MUL_PPCF128);
	    break;
	  case ISD::FP16_TO_FP:
	    // Only an f32 result is converted here; any other result type produces
	    // no replacement.
	    if (Node->getValueType(0) == MVT::f32)
	      Replacements.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
	    break;
	  case ISD::FP_TO_FP16: {
	    // Look up the rounding libcall from the source operand type to f16.
	    RTLIB::Libcall RoundCall =
	        RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
	    assert(RoundCall != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
	    Replacements.push_back(ExpandLibCall(RoundCall, Node, false));
	    break;
	  }
	  case ISD::FSUB:
	    PushFPLibCall(RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80,
	                  RTLIB::SUB_F128, RTLIB::SUB_PPCF128);
	    break;

	  // Integer operations: signedness flag plus one libcall per integer width.
	  case ISD::SREM:
	    PushIntLibCall(true, RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32,
	                   RTLIB::SREM_I64, RTLIB::SREM_I128);
	    break;
	  case ISD::UREM:
	    PushIntLibCall(false, RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32,
	                   RTLIB::UREM_I64, RTLIB::UREM_I128);
	    break;
	  case ISD::SDIV:
	    PushIntLibCall(true, RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32,
	                   RTLIB::SDIV_I64, RTLIB::SDIV_I128);
	    break;
	  case ISD::UDIV:
	    PushIntLibCall(false, RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32,
	                   RTLIB::UDIV_I64, RTLIB::UDIV_I128);
	    break;
	  case ISD::SDIVREM:
	  case ISD::UDIVREM:
	    // The divrem helper appends its results to the list directly.
	    ExpandDivRemLibCall(Node, Replacements);
	    break;
	  case ISD::MUL:
	    PushIntLibCall(false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32,
	                   RTLIB::MUL_I64, RTLIB::MUL_I128);
	    break;
	  }

	  // No replacement values were produced: report it and leave Node alone.
	  if (Replacements.empty()) {
	    DEBUG(dbgs() << "Could not convert node to libcall\n");
	    return;
	  }

	  // Swap the original node for the values produced above.
	  DEBUG(dbgs() << "Successfully converted node to libcall\n");
	  ReplaceNode(Node, Replacements.data());
	}

	// Compute the vector type that stands in for a single original scalar element
	// when a vector is promoted to another vector of the same total size: a
	// vector of NewEltVT holding (bits in EltVT / bits in NewEltVT) elements.
	// The resulting type is asserted to be legal for the target.
	static MVT getPromotedVectorElementType(const TargetLowering &TLI,
	                                        MVT EltVT, MVT NewEltVT) {
	  // Number of NewEltVT elements that fit in one EltVT element.
	  const unsigned NewEltsPerOldElt =
	      EltVT.getSizeInBits() / NewEltVT.getSizeInBits();
	  MVT PromotedEltVT = MVT::getVectorVT(NewEltVT, NewEltsPerOldElt);
	  assert(TLI.isTypeLegal(PromotedEltVT) && "unexpected");
	  return PromotedEltVT;
	}

	void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
	DEBUG(dbgs() << "Trying to promote node\n");
	SmallVector<SDValue, 8> Results;
	MVT OVT = Node->getSimpleValueType(0);
	if (Node->getOpcode() == ISD::UINT_TO_FP \|\|
	Node->getOpcode() == ISD::SINT_TO_FP \|\|
	Node->getOpcode() == ISD::SETCC \|\|
	Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	Node->getOpcode() == ISD::INSERT_VECTOR_ELT) {
	OVT = Node->getOperand(0).getSimpleValueType();
	}
	if (Node->getOpcode() == ISD::BR_CC)
	OVT = Node->getOperand(2).getSimpleValueType();
	MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
	SDLoc dl(Node);
	SDValue Tmp1, Tmp2, Tmp3;
	switch (Node->getOpcode()) {
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	case ISD::CTPOP:
	// Zero extend the argument.
	Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
	if (Node->getOpcode() == ISD::CTTZ) {
	// The count is the same in the promoted type except if the original
	// value was zero. This can be handled by setting the bit just off
	// the top of the original type.
	auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(),
	OVT.getSizeInBits());
	Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1,
	DAG.getConstant(TopBit, dl, NVT));
	}
	// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
	// already the correct result.
	Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
	if (Node->getOpcode() == ISD::CTLZ \|\|
	Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
	// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
	Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
	DAG.getConstant(NVT.getSizeInBits() -
	OVT.getSizeInBits(), dl, NVT));
	}
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
	break;
	case ISD::BITREVERSE:
	case ISD::BSWAP: {
	unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
	Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
	Tmp1 = DAG.getNode(
	ISD::SRL, dl, NVT, Tmp1,
	DAG.getConstant(DiffBits, dl,
	TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
	Results.push_back(Tmp1);
	break;
	}
	case ISD::FP_TO_UINT:
	case ISD::FP_TO_SINT:
	Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0),
	Node->getOpcode() == ISD::FP_TO_SINT, dl);
	Results.push_back(Tmp1);
	break;
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP:
	Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0),
	Node->getOpcode() == ISD::SINT_TO_FP, dl);
	Results.push_back(Tmp1);
	break;
	case ISD::VAARG: {
	SDValue Chain = Node->getOperand(0); // Get the chain.
	SDValue Ptr = Node->getOperand(1); // Get the pointer.

	unsigned TruncOp;
	if (OVT.isVector()) {
	TruncOp = ISD::BITCAST;
	} else {
	assert(OVT.isInteger()
	&& "VAARG promotion is supported only for vectors or integer types");
	TruncOp = ISD::TRUNCATE;
	}

	// Perform the larger operation, then convert back
	Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2),
	Node->getConstantOperandVal(3));
	Chain = Tmp1.getValue(1);

	Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1);

	// Modified the chain result - switch anything that used the old chain to
	// use the new one.
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
	if (UpdatedNodes) {
	UpdatedNodes->insert(Tmp2.getNode());
	UpdatedNodes->insert(Chain.getNode());
	}
	ReplacedNode(Node);
	break;
	}
	case ISD::MUL:
	case ISD::SDIV:
	case ISD::SREM:
	case ISD::UDIV:
	case ISD::UREM:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	unsigned ExtOp, TruncOp;
	if (OVT.isVector()) {
	ExtOp = ISD::BITCAST;
	TruncOp = ISD::BITCAST;
	} else {
	assert(OVT.isInteger() && "Cannot promote logic operation");

	switch (Node->getOpcode()) {
	default:
	ExtOp = ISD::ANY_EXTEND;
	break;
	case ISD::SDIV:
	case ISD::SREM:
	ExtOp = ISD::SIGN_EXTEND;
	break;
	case ISD::UDIV:
	case ISD::UREM:
	ExtOp = ISD::ZERO_EXTEND;
	break;
	}
	TruncOp = ISD::TRUNCATE;
	}
	// Promote each of the values to the new type.
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	// Perform the larger operation, then convert back
	Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
	Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
	break;
	}
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: {
	// Promote to a multiply in a wider integer type.
	unsigned ExtOp = Node->getOpcode() == ISD::UMUL_LOHI ? ISD::ZERO_EXTEND
	: ISD::SIGN_EXTEND;
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2);

	auto &DL = DAG.getDataLayout();
	unsigned OriginalSize = OVT.getScalarSizeInBits();
	Tmp2 = DAG.getNode(
	ISD::SRL, dl, NVT, Tmp1,
	DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT)));
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2));
	break;
	}
	case ISD::SELECT: {
	unsigned ExtOp, TruncOp;
	if (Node->getValueType(0).isVector() \|\|
	Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) {
	ExtOp = ISD::BITCAST;
	TruncOp = ISD::BITCAST;
	} else if (Node->getValueType(0).isInteger()) {
	ExtOp = ISD::ANY_EXTEND;
	TruncOp = ISD::TRUNCATE;
	} else {
	ExtOp = ISD::FP_EXTEND;
	TruncOp = ISD::FP_ROUND;
	}
	Tmp1 = Node->getOperand(0);
	// Promote each of the values to the new type.
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
	// Perform the larger operation, then round down.
	Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
	if (TruncOp != ISD::FP_ROUND)
	Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
	else
	Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Tmp1);
	break;
	}
	case ISD::VECTOR_SHUFFLE: {
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();

	// Cast the two input vectors.
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(1));

	// Convert the shuffle mask to the right # elements.
	Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, OVT, Tmp1);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::SETCC: {
	unsigned ExtOp = ISD::FP_EXTEND;
	if (NVT.isInteger()) {
	ISD::CondCode CCCode =
	cast<CondCodeSDNode>(Node->getOperand(2))->get();
	ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	}
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
	Tmp1, Tmp2, Node->getOperand(2)));
	break;
	}
	case ISD::BR_CC: {
	unsigned ExtOp = ISD::FP_EXTEND;
	if (NVT.isInteger()) {
	ISD::CondCode CCCode =
	cast<CondCodeSDNode>(Node->getOperand(1))->get();
	ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	}
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(3));
	Results.push_back(DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0),
	Node->getOperand(0), Node->getOperand(1),
	Tmp1, Tmp2, Node->getOperand(4)));
	break;
	}
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	case ISD::FDIV:
	case ISD::FREM:
	case ISD::FMINNUM:
	case ISD::FMAXNUM:
	case ISD::FPOW:
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
	Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2,
	Node->getFlags());
	Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
	Tmp3, DAG.getIntPtrConstant(0, dl)));
	break;
	case ISD::FMA:
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
	Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
	Results.push_back(
	DAG.getNode(ISD::FP_ROUND, dl, OVT,
	DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3),
	DAG.getIntPtrConstant(0, dl)));
	break;
	case ISD::FCOPYSIGN:
	case ISD::FPOWI: {
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = Node->getOperand(1);
	Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);

	// fcopysign doesn't change anything but the sign bit, so
	// (fp_round (fcopysign (fpext a), b))
	// is as precise as
	// (fp_round (fpext a))
	// which is a no-op. Mark it as a TRUNCating FP_ROUND.
	const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN);
	Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
	Tmp3, DAG.getIntPtrConstant(isTrunc, dl)));
	break;
	}
	case ISD::FFLOOR:
	case ISD::FCEIL:
	case ISD::FRINT:
	case ISD::FNEARBYINT:
	case ISD::FROUND:
	case ISD::FTRUNC:
	case ISD::FNEG:
	case ISD::FSQRT:
	case ISD::FSIN:
	case ISD::FCOS:
	case ISD::FLOG:
	case ISD::FLOG2:
	case ISD::FLOG10:
	case ISD::FABS:
	case ISD::FEXP:
	case ISD::FEXP2:
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
	Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
	Tmp2, DAG.getIntPtrConstant(0, dl)));
	break;
	case ISD::BUILD_VECTOR: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to a different vector type with the same total bit size
	//
	// e.g. v2i64 = build_vector i64:x, i64:y => v4i32
	// =>
	// v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y))

	assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
	"Invalid promote type for build_vector");
	assert(NewEltVT.bitsLT(EltVT) && "not handled");

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);

	SmallVector<SDValue, 8> NewOps;
	for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) {
	SDValue Op = Node->getOperand(I);
	NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op));
	}

	SDLoc SL(Node);
	SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps);
	SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
	Results.push_back(CvtVec);
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to a different vector type with the same total bit size.
	//
	// e.g. v2i64 = extract_vector_elt x:v2i64, y:i32
	// =>
	// v4i32:castx = bitcast x:v2i64
	//
	// i64 = bitcast
	// (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
	// (i32 (extract_vector_elt castx, (2 * y + 1)))
	//

	assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
	"Invalid promote type for extract_vector_elt");
	assert(NewEltVT.bitsLT(EltVT) && "not handled");

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
	unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();

	SDValue Idx = Node->getOperand(1);
	EVT IdxVT = Idx.getValueType();
	SDLoc SL(Node);
	SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT);
	SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);

	SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));

	SmallVector<SDValue, 8> NewOps;
	for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
	SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
	SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);

	SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
	CastVec, TmpIdx);
	NewOps.push_back(Elt);
	}

	SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps);
	Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));
	break;
	}
	case ISD::INSERT_VECTOR_ELT: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to a different vector type with the same total bit size
	//
	// e.g. v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32
	// =>
	// v4i32:castx = bitcast x:v2i64
	// v2i32:casty = bitcast y:i64
	//
	// v2i64 = bitcast
	// (v4i32 insert_vector_elt
	// (v4i32 insert_vector_elt v4i32:castx,
	// (extract_vector_elt casty, 0), 2 * z),
	// (extract_vector_elt casty, 1), (2 * z + 1))

	assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
	"Invalid promote type for insert_vector_elt");
	assert(NewEltVT.bitsLT(EltVT) && "not handled");

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
	unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();

	SDValue Val = Node->getOperand(1);
	SDValue Idx = Node->getOperand(2);
	EVT IdxVT = Idx.getValueType();
	SDLoc SL(Node);

	SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT);
	SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);

	SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));
	SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);

	SDValue NewVec = CastVec;
	for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
	SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
	SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);

	SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
	CastVal, IdxOffset);

	NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT,
	NewVec, Elt, InEltIdx);
	}

	Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec));
	break;
	}
	case ISD::SCALAR_TO_VECTOR: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to different vector type with the same total bit size.
	//
	// e.g. v2i64 = scalar_to_vector x:i64
	// =>
	// concat_vectors (v2i32 bitcast x:i64), (v2i32 undef)
	//

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
	SDValue Val = Node->getOperand(0);
	SDLoc SL(Node);

	SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);
	SDValue Undef = DAG.getUNDEF(MidVT);

	SmallVector<SDValue, 8> NewElts;
	NewElts.push_back(CastVal);
	for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I)
	NewElts.push_back(Undef);

	SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts);
	SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
	Results.push_back(CvtVec);
	break;
	}
	}

	// Replace the original node with the legalized result.
	if (!Results.empty()) {
	DEBUG(dbgs() << "Successfully promoted node\n");
	ReplaceNode(Node, Results.data());
	} else
	DEBUG(dbgs() << "Could not promote node\n");
	}

	/// This is the entry point for the file.
	///
	/// Assigns a topological order to the DAG and then repeatedly sweeps the node
	/// list from back to front, legalizing every node that has not been legalized
	/// yet, until a full sweep legalizes nothing. Nodes that have no uses and are
	/// not the root are deleted as they are encountered; a final call to
	/// RemoveDeadNodes() cleans up whatever remains.
	void SelectionDAG::Legalize() {
	  AssignTopologicalOrder();

	  SmallPtrSet<SDNode *, 16> LegalizedNodes;
	  // Use a delete listener to remove nodes which were deleted during
	  // legalization from LegalizedNodes. This is needed to handle the situation
	  // where a new node is allocated by the object pool to the same address of a
	  // previously deleted node.
	  DAGNodeDeletedListener DeleteListener(
	      *this,
	      [&LegalizedNodes](SDNode *N, SDNode *E) { LegalizedNodes.erase(N); });

	  SelectionDAGLegalize Legalizer(*this, LegalizedNodes);

	  // Visit all the nodes. We start in topological order, so that we see
	  // nodes with their original operands intact. Legalization can produce
	  // new nodes which may themselves need to be legalized. Iterate until all
	  // nodes have been legalized.
	  while (true) {
	    bool AnyLegalized = false;
	    for (auto NI = allnodes_end(); NI != allnodes_begin();) {
	      --NI;

	      SDNode *N = &*NI;
	      if (N->use_empty() && N != getRoot().getNode()) {
	        // Step the iterator off N before N is deleted.
	        ++NI;
	        DeleteNode(N);
	        continue;
	      }

	      // insert().second is true only the first time N is seen.
	      if (LegalizedNodes.insert(N).second) {
	        AnyLegalized = true;
	        Legalizer.LegalizeOp(N);

	        // Legalization may have left N without uses; drop it right away.
	        if (N->use_empty() && N != getRoot().getNode()) {
	          ++NI;
	          DeleteNode(N);
	        }
	      }
	    }
	    if (!AnyLegalized)
	      break;

	  }

	  // Remove dead nodes now.
	  RemoveDeadNodes();
	}

	/// Legalize the single node \p N using a private, initially empty set of
	/// legalized nodes. \p UpdatedNodes is handed to the legalizer by address.
	/// Returns true when \p N is still a member of the legalized set once the
	/// legalizer has run.
	bool SelectionDAG::LegalizeOp(SDNode *N,
	                              SmallSetVector<SDNode *, 16> &UpdatedNodes) {
	  SmallPtrSet<SDNode *, 16> Visited;
	  SelectionDAGLegalize Worker(*this, Visited, &UpdatedNodes);

	  // Seed the set with the requested node and then legalize it; the legalizer
	  // recurses through operands as needed.
	  Visited.insert(N);
	  Worker.LegalizeOp(N);

	  return Visited.count(N) != 0;
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 328362)
	@@ -1,8275 +1,8272 @@
	//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements the SelectionDAG class.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/SelectionDAG.h"
	#include "SDNodeDbgValue.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/ManagedStatic.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/Mutex.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstdlib>
	#include <limits>
	#include <set>
	#include <string>
	#include <utility>
	#include <vector>

	using namespace llvm;

	/// makeVTList - Bundle a pointer to an array of value types and the number of
	/// entries in that array into an SDVTList.
	static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
	  return {VTs, NumVTs};
	}

	// Default null implementations of the callbacks.
	void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode, SDNode) {}
	void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}

	#define DEBUG_TYPE "selectiondag"

	/// Inside the DEBUG macro, write \p Msg to dbgs() and then dump the node that
	/// \p V refers to, passing \p G to the dump call.
	static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
	  DEBUG({
	    dbgs() << Msg;
	    SDNode *Created = V.getNode();
	    Created->dump(G);
	  });
	}

	//===----------------------------------------------------------------------===//
	// ConstantFPSDNode Class
	//===----------------------------------------------------------------------===//

	/// isExactlyValue - We don't rely on operator== working on double values, as
	/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
	/// As such, this method can be used to do an exact bit-for-bit comparison of
	/// two floating point values.
	bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
	return getValueAPF().bitwiseIsEqual(V);
	}

	/// Return true when \p Val can be converted to the floating-point semantics
	/// of \p VT (rounding to nearest, ties to even) without the conversion
	/// reporting a loss of information. \p VT must be a floating-point type.
	bool ConstantFPSDNode::isValueValidForType(EVT VT,
	                                           const APFloat& Val) {
	  assert(VT.isFloatingPoint() && "Can only convert between FP types");

	  // convert() rewrites its receiver, so work on a scratch copy of Val.
	  APFloat Scratch(Val);
	  bool LostInfo;
	  (void)Scratch.convert(SelectionDAG::EVTToAPFloatSemantics(VT),
	                        APFloat::rmNearestTiesToEven, &LostInfo);
	  return !LostInfo;
	}

	//===----------------------------------------------------------------------===//
	// ISD Namespace
	//===----------------------------------------------------------------------===//

	bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
	auto *BV = dyn_cast<BuildVectorSDNode>(N);
	if (!BV)
	return false;

	APInt SplatUndef;
	unsigned SplatBitSize;
	bool HasUndefs;
	unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
	return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs,
	EltSize) &&
	EltSize == SplatBitSize;
	}

	// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
	// specializations of the more general isConstantSplatVector()?

	bool ISD::isBuildVectorAllOnes(const SDNode *N) {
	// Look through a bit convert.
	while (N->getOpcode() == ISD::BITCAST)
	N = N->getOperand(0).getNode();

	if (N->getOpcode() != ISD::BUILD_VECTOR) return false;

	unsigned i = 0, e = N->getNumOperands();

	// Skip over all of the undef values.
	while (i != e && N->getOperand(i).isUndef())
	++i;

	// Do not accept an all-undef vector.
	if (i == e) return false;

	// Do not accept build_vectors that aren't all constants or which have non-~0
	// elements. We have to be a bit careful here, as the type of the constant
	// may not be the same as the type of the vector elements due to type
	// legalization (the elements are promoted to a legal type for the target and
	// a vector of a type may be legal when the base element type is not).
	// We only want to check enough bits to cover the vector elements, because
	// we care if the resultant vector is all ones, not whether the individual
	// constants are.
	SDValue NotZero = N->getOperand(i);
	unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
	if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
	if (CN->getAPIntValue().countTrailingOnes() < EltSize)
	return false;
	} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
	if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
	return false;
	} else
	return false;

	// Okay, we have at least one ~0 value, check to see if the rest match or are
	// undefs. Even with the above element type twiddling, this should be OK, as
	// the same type legalization should have applied to all the elements.
	for (++i; i != e; ++i)
	if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
	return false;
	return true;
	}

	bool ISD::isBuildVectorAllZeros(const SDNode *N) {
	// Look through a bit convert.
	while (N->getOpcode() == ISD::BITCAST)
	N = N->getOperand(0).getNode();

	if (N->getOpcode() != ISD::BUILD_VECTOR) return false;

	bool IsAllUndef = true;
	for (const SDValue &Op : N->op_values()) {
	if (Op.isUndef())
	continue;
	IsAllUndef = false;
	// Do not accept build_vectors that aren't all constants or which have non-0
	// elements. We have to be a bit careful here, as the type of the constant
	// may not be the same as the type of the vector elements due to type
	// legalization (the elements are promoted to a legal type for the target
	// and a vector of a type may be legal when the base element type is not).
	// We only want to check enough bits to cover the vector elements, because
	// we care if the resultant vector is all zeros, not whether the individual
	// constants are.
	unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
	if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
	if (CN->getAPIntValue().countTrailingZeros() < EltSize)
	return false;
	} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
	if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
	return false;
	} else
	return false;
	}

	// Do not accept an all-undef vector.
	if (IsAllUndef)
	return false;
	return true;
	}

	/// Return true when \p N is a BUILD_VECTOR whose operands are each either
	/// undef or a ConstantSDNode.
	bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  for (const SDValue &Op : N->op_values()) {
	    // Undef operands are tolerated; anything else must be an integer
	    // constant node.
	    const bool Acceptable = Op.isUndef() || isa<ConstantSDNode>(Op);
	    if (!Acceptable)
	      return false;
	  }
	  return true;
	}

	/// Return true when \p N is a BUILD_VECTOR whose operands are each either
	/// undef or a ConstantFPSDNode.
	bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  for (const SDValue &Op : N->op_values())
	    if (!Op.isUndef() && !isa<ConstantFPSDNode>(Op))
	      return false; // A defined operand that is not an FP constant.

	  return true;
	}

	bool ISD::allOperandsUndef(const SDNode *N) {
	// Return false if the node has no operands.
	// This is "logically inconsistent" with the definition of "all" but
	// is probably the desired behavior.
	if (N->getNumOperands() == 0)
	return false;

	for (const SDValue &Op : N->op_values())
	if (!Op.isUndef())
	return false;

	return true;
	}

	/// Map a load extension kind to the matching extension node type: EXTLOAD
	/// gives FP_EXTEND when \p IsFP is set and ANY_EXTEND otherwise, SEXTLOAD
	/// gives SIGN_EXTEND, and ZEXTLOAD gives ZERO_EXTEND. Any other value
	/// reaches llvm_unreachable.
	ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
	  if (ExtType == ISD::EXTLOAD)
	    return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
	  if (ExtType == ISD::SEXTLOAD)
	    return ISD::SIGN_EXTEND;
	  if (ExtType == ISD::ZEXTLOAD)
	    return ISD::ZERO_EXTEND;

	  llvm_unreachable("Invalid LoadExtType");
	}

	/// Return the condition code that gives the same result as \p Operation
	/// once the two compared operands are exchanged. Only the L and G bits
	/// (values 4 and 2) trade places; every other bit is passed through.
	ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
	  // To perform this operation, we just need to swap the L and G bits of the
	  // operation.
	  unsigned OldL = (Operation >> 2) & 1;
	  unsigned OldG = (Operation >> 1) & 1;
	  return ISD::CondCode((Operation & ~6) |  // Keep the N, U, E bits
	                       (OldL << 1) |       // New G bit
	                       (OldG << 2));       // New L bit.
	}

	/// Return the logical inverse of condition code \p Op. For integer
	/// comparisons the L, G and E bits are toggled and U is left alone; for
	/// other comparisons all four condition bits are toggled.
	ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
	  // 7 covers L, G and E; 15 additionally covers U.
	  const unsigned FlipMask = isInteger ? 7 : 15;
	  unsigned Inverted = Op;
	  Inverted ^= FlipMask;

	  // Don't let N and U bits get set.
	  if (Inverted > ISD::SETTRUE2)
	    Inverted &= ~8;

	  return ISD::CondCode(Inverted);
	}

	/// Classify an integer condition code by signedness: 0 for SETEQ/SETNE
	/// (which do not depend on the sign of the inputs), 1 for the signed
	/// orderings SETLT/SETLE/SETGT/SETGE, and 2 for the unsigned orderings
	/// SETULT/SETULE/SETUGT/SETUGE. Any other code reaches llvm_unreachable.
	static int isSignedOp(ISD::CondCode Opcode) {
	  switch (Opcode) {
	  case ISD::SETEQ:
	  case ISD::SETNE:
	    return 0;

	  case ISD::SETLT:
	  case ISD::SETLE:
	  case ISD::SETGT:
	  case ISD::SETGE:
	    return 1;

	  case ISD::SETULT:
	  case ISD::SETULE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    return 2;

	  default:
	    llvm_unreachable("Illegal integer setcc operation!");
	  }
	}

	/// Return the condition code for the logical OR of two comparisons of the
	/// same operands, formed by OR-ing the bits of \p Op1 and \p Op2. For
	/// integer comparisons, mixing a signed ordering with an unsigned ordering
	/// yields SETCC_INVALID, and a SETUNE result is rewritten to SETNE.
	ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
	                                       bool IsInteger) {
	  // isSignedOp yields 1 for signed and 2 for unsigned orderings, so an OR of
	  // 3 means one of each.
	  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
	    // Cannot fold a signed integer setcc with an unsigned integer setcc.
	    return ISD::SETCC_INVALID;

	  unsigned Op = Op1 | Op2; // Combine all of the condition bits.

	  // If the N and U bits get set, then the resultant comparison DOES suddenly
	  // care about orderedness, and it is true when ordered.
	  // NOTE(review): the mask below clears the bit with value 16, while the
	  // L/G/E/U condition bits occupy the low four bits; confirm which bit the
	  // "U bit" wording is meant to describe.
	  if (Op > ISD::SETTRUE2)
	    Op &= ~16; // Clear the U bit if the N bit is set.

	  // Canonicalize illegal integer setcc's.
	  if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
	    Op = ISD::SETNE;

	  return ISD::CondCode(Op);
	}

	/// Return the condition code for the logical AND of two comparisons of the
	/// same operands, formed by AND-ing the bits of \p Op1 and \p Op2. For
	/// integer comparisons, mixing a signed ordering with an unsigned ordering
	/// yields SETCC_INVALID, and results that are not legal integer codes are
	/// rewritten to their integer equivalents.
	ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
	                                        bool IsInteger) {
	  // isSignedOp yields 1 for signed and 2 for unsigned orderings, so an OR of
	  // 3 means one of each.
	  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
	    // Cannot fold a signed setcc with an unsigned setcc.
	    return ISD::SETCC_INVALID;

	  // Combine all of the condition bits.
	  ISD::CondCode Result = ISD::CondCode(Op1 & Op2);

	  // Canonicalize illegal integer setcc's.
	  if (IsInteger) {
	    switch (Result) {
	    default: break;
	    case ISD::SETUO : Result = ISD::SETFALSE; break;  // SETUGT & SETULT
	    case ISD::SETOEQ:                                 // SETEQ & SETU[LG]E
	    case ISD::SETUEQ: Result = ISD::SETEQ   ; break;  // SETUGE & SETULE
	    case ISD::SETOLT: Result = ISD::SETULT  ; break;  // SETULT & SETNE
	    case ISD::SETOGT: Result = ISD::SETUGT  ; break;  // SETUGT & SETNE
	    }
	  }

	  return Result;
	}

	//===----------------------------------------------------------------------===//
	// SDNode Profile Support
	//===----------------------------------------------------------------------===//

	/// AddNodeIDOpcode - Append the opcode \p OpC to the profile \p ID as an
	/// integer.
	static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
	  const unsigned Opcode = OpC;
	  ID.AddInteger(Opcode);
	}

	/// AddNodeIDValueTypes - Append the value-type list to the profile \p ID.
	/// Value type lists are intern'd, so the array pointer alone identifies the
	/// list.
	static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
	  const EVT *InternedVTs = VTList.VTs;
	  ID.AddPointer(InternedVTs);
	}

	/// AddNodeIDOperands - Append every operand in \p Ops to the profile \p ID.
	/// Each operand contributes its node pointer followed by its result number.
	static void AddNodeIDOperands(FoldingSetNodeID &ID,
	                              ArrayRef<SDValue> Ops) {
	  for (const SDValue &Operand : Ops) {
	    ID.AddPointer(Operand.getNode());
	    ID.AddInteger(Operand.getResNo());
	  }
	}

	/// AddNodeIDOperands - SDUse flavour: append every use in \p Ops to the
	/// profile \p ID as a node pointer followed by a result number.
	static void AddNodeIDOperands(FoldingSetNodeID &ID,
	                              ArrayRef<SDUse> Ops) {
	  for (const SDUse &Use : Ops) {
	    ID.AddPointer(Use.getNode());
	    ID.AddInteger(Use.getResNo());
	  }
	}

	/// AddNodeIDNode - Add a node description to the NodeID data from its parts:
	/// the opcode \p OpC first, then the value-type list \p VTList, then every
	/// operand in \p OpList. The order of the three calls determines the order
	/// in which data is appended to \p ID.
	static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
	SDVTList VTList, ArrayRef<SDValue> OpList) {
	AddNodeIDOpcode(ID, OpC);
	AddNodeIDValueTypes(ID, VTList);
	AddNodeIDOperands(ID, OpList);
	}

	/// If this is an SDNode with special info, add this info to the NodeID data.
	///
	/// Dispatches on the opcode of \p N and appends the extra fields that
	/// distinguish nodes of that kind (constant value, global and offset, memory
	/// type, shuffle mask, and so on) to \p ID. Opcodes not listed contribute
	/// nothing here. After the switch, any node for which isTargetMemoryOpcode()
	/// is true also contributes the address space of its pointer info.
	static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
	switch (N->getOpcode()) {
	// Symbol nodes must never reach this function.
	case ISD::TargetExternalSymbol:
	case ISD::ExternalSymbol:
	case ISD::MCSymbol:
	llvm_unreachable("Should only be used on nodes with operands");
	default: break; // Normal nodes don't need extra info.
	// Integer constants: the constant-int pointer plus the opaque flag.
	case ISD::TargetConstant:
	case ISD::Constant: {
	const ConstantSDNode *C = cast<ConstantSDNode>(N);
	ID.AddPointer(C->getConstantIntValue());
	ID.AddBoolean(C->isOpaque());
	break;
	}
	// Floating-point constants: the constant-FP pointer.
	case ISD::TargetConstantFP:
	case ISD::ConstantFP:
	ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
	break;
	// Global addresses (plain and TLS): global, offset and target flags.
	case ISD::TargetGlobalAddress:
	case ISD::GlobalAddress:
	case ISD::TargetGlobalTLSAddress:
	case ISD::GlobalTLSAddress: {
	const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
	ID.AddPointer(GA->getGlobal());
	ID.AddInteger(GA->getOffset());
	ID.AddInteger(GA->getTargetFlags());
	break;
	}
	case ISD::BasicBlock:
	ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
	break;
	case ISD::Register:
	ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
	break;
	case ISD::RegisterMask:
	ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
	break;
	case ISD::SRCVALUE:
	ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
	break;
	case ISD::FrameIndex:
	case ISD::TargetFrameIndex:
	ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
	break;
	// Jump tables: index and target flags.
	case ISD::JumpTable:
	case ISD::TargetJumpTable:
	ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
	ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
	break;
	// Constant pool entries: alignment, offset, the entry itself, then flags.
	case ISD::ConstantPool:
	case ISD::TargetConstantPool: {
	const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
	ID.AddInteger(CP->getAlignment());
	ID.AddInteger(CP->getOffset());
	// Machine constant pool values add their own CSE id; ordinary entries are
	// identified by the constant pointer.
	if (CP->isMachineConstantPoolEntry())
	CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
	else
	ID.AddPointer(CP->getConstVal());
	ID.AddInteger(CP->getTargetFlags());
	break;
	}
	case ISD::TargetIndex: {
	const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
	ID.AddInteger(TI->getIndex());
	ID.AddInteger(TI->getOffset());
	ID.AddInteger(TI->getTargetFlags());
	break;
	}
	// Loads, stores and atomics: memory VT raw bits, raw subclass data, and the
	// address space of the pointer info.
	case ISD::LOAD: {
	const LoadSDNode *LD = cast<LoadSDNode>(N);
	ID.AddInteger(LD->getMemoryVT().getRawBits());
	ID.AddInteger(LD->getRawSubclassData());
	ID.AddInteger(LD->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::STORE: {
	const StoreSDNode *ST = cast<StoreSDNode>(N);
	ID.AddInteger(ST->getMemoryVT().getRawBits());
	ID.AddInteger(ST->getRawSubclassData());
	ID.AddInteger(ST->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::ATOMIC_CMP_SWAP:
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	case ISD::ATOMIC_LOAD:
	case ISD::ATOMIC_STORE: {
	const AtomicSDNode *AT = cast<AtomicSDNode>(N);
	ID.AddInteger(AT->getMemoryVT().getRawBits());
	ID.AddInteger(AT->getRawSubclassData());
	ID.AddInteger(AT->getPointerInfo().getAddrSpace());
	break;
	}
	// Prefetch: only the address space is added.
	case ISD::PREFETCH: {
	const MemSDNode *PF = cast<MemSDNode>(N);
	ID.AddInteger(PF->getPointerInfo().getAddrSpace());
	break;
	}
	// Shuffles: one mask element per element of the result vector type.
	case ISD::VECTOR_SHUFFLE: {
	const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
	for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
	i != e; ++i)
	ID.AddInteger(SVN->getMaskElt(i));
	break;
	}
	case ISD::TargetBlockAddress:
	case ISD::BlockAddress: {
	const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
	ID.AddPointer(BA->getBlockAddress());
	ID.AddInteger(BA->getOffset());
	ID.AddInteger(BA->getTargetFlags());
	break;
	}
	} // end switch (N->getOpcode())

	// Target specific memory nodes could also have address spaces to check.
	if (N->isTargetMemoryOpcode())
	ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
	}

	/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
	/// data. Appends, in this order, the opcode of \p N, its value-type list,
	/// its operands, and finally whatever opcode-specific data AddNodeIDCustom
	/// contributes.
	static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
	AddNodeIDOpcode(ID, N->getOpcode());
	// Add the return value info.
	AddNodeIDValueTypes(ID, N->getVTList());
	// Add the operand info.
	AddNodeIDOperands(ID, N->ops());

	// Handle SDNode leafs with special info.
	AddNodeIDCustom(ID, N);
	}

	//===----------------------------------------------------------------------===//
	// SelectionDAG Class
	//===----------------------------------------------------------------------===//

	/// doNotCSE - Return true if CSE should not be performed for this node:
	/// that is, when any of its results has type MVT::Glue or its opcode is
	/// HANDLENODE or EH_LABEL.
	static bool doNotCSE(SDNode *N) {
	  // Never CSE anything that produces a flag as its first result.
	  if (N->getValueType(0) == MVT::Glue)
	    return true;

	  // Never CSE these nodes.
	  const unsigned Opcode = N->getOpcode();
	  if (Opcode == ISD::HANDLENODE || Opcode == ISD::EH_LABEL)
	    return true;

	  // Check that the remaining results are not flags either.
	  unsigned ResNo = 1;
	  const unsigned NumResults = N->getNumValues();
	  while (ResNo != NumResults) {
	    if (N->getValueType(ResNo) == MVT::Glue)
	      return true;
	    ++ResNo;
	  }

	  return false;
	}

	/// RemoveDeadNodes - This method deletes all unreachable nodes in the
	/// SelectionDAG.
	void SelectionDAG::RemoveDeadNodes() {
	// Create a dummy node (which is not added to allnodes), that adds a reference
	// to the root node, preventing it from being deleted.
	HandleSDNode Dummy(getRoot());

	SmallVector<SDNode*, 128> DeadNodes;

	// Add all obviously-dead nodes to the DeadNodes worklist.
	for (SDNode &Node : allnodes())
	if (Node.use_empty())
	DeadNodes.push_back(&Node);

	RemoveDeadNodes(DeadNodes);

	// If the root changed (e.g. it was a dead load, update the root).
	setRoot(Dummy.getValue());
	}

	/// RemoveDeadNodes - Delete the nodes in \p DeadNodes and any nodes that
	/// lose their last use because of those deletions. \p DeadNodes is used as
	/// the worklist and is empty on return.
	void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
	  while (!DeadNodes.empty()) {
	    SDNode *Dead = DeadNodes.pop_back_val();

	    // The same node can be queued more than once; skip it if it has already
	    // been deleted.
	    if (Dead->getOpcode() == ISD::DELETED_NODE)
	      continue;

	    // Tell every registered listener that the node is going away.
	    DAGUpdateListener *Listener = UpdateListeners;
	    while (Listener) {
	      Listener->NodeDeleted(Dead, nullptr);
	      Listener = Listener->Next;
	    }

	    // Take the node out of the appropriate CSE map.
	    RemoveNodeFromCSEMaps(Dead);

	    // Clear the operand list one use at a time. This is safe to do, as there
	    // are no cycles in the graph.
	    SDNode::op_iterator OpIt = Dead->op_begin();
	    SDNode::op_iterator OpEnd = Dead->op_end();
	    while (OpIt != OpEnd) {
	      SDUse &Use = *OpIt;
	      ++OpIt;
	      SDNode *Operand = Use.getNode();
	      Use.set(SDValue());

	      // If that was the operand's last use, the operand is dead as well.
	      if (Operand->use_empty())
	        DeadNodes.push_back(Operand);
	    }

	    DeallocateNode(Dead);
	  }
	}

	/// Delete the single node \p N through the worklist-based RemoveDeadNodes,
	/// which also deletes any nodes left unused as a result.
	void SelectionDAG::RemoveDeadNode(SDNode *N){
	  SmallVector<SDNode*, 16> Worklist;
	  Worklist.push_back(N);

	  // Hold a reference to the root through a dummy handle so the root is not
	  // deleted. (This matters if the root is an operand of the dead node.)
	  HandleSDNode RootHolder(getRoot());

	  RemoveDeadNodes(Worklist);
	}

	/// Delete \p N: first remove it from the CSE maps, then hand it to
	/// DeleteNodeNotInCSEMaps, which asserts that the node has no uses before
	/// tearing it down.
	void SelectionDAG::DeleteNode(SDNode *N) {
	// First take this out of the appropriate CSE map.
	RemoveNodeFromCSEMaps(N);

	// Finally, remove uses due to operands of this node, remove from the
	// AllNodes list, and delete the node.
	DeleteNodeNotInCSEMaps(N);
	}

	/// Delete \p N without touching the CSE maps. Asserts that \p N is not the
	/// first node in AllNodes (the entry node) and that it has no remaining
	/// uses, then drops its operands and deallocates it.
	void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
	assert(N->getIterator() != AllNodes.begin() &&
	"Cannot delete the entry node!");
	assert(N->use_empty() && "Cannot delete a node that is not dead!");

	// Drop all of the operands and decrement used node's use counts.
	N->DropOperands();

	DeallocateNode(N);
	}

	void SDDbgInfo::erase(const SDNode *Node) {
	DbgValMapType::iterator I = DbgValMap.find(Node);
	if (I == DbgValMap.end())
	return;
	for (auto &Val: I->second)
	Val->setIsInvalidated();
	DbgValMap.erase(I);
	}

	/// Release \p N: remove its operand storage, unlink it from AllNodes and
	/// return it to NodeAllocator, stamp its opcode as DELETED_NODE, and
	/// invalidate any debug values recorded for it.
	void SelectionDAG::DeallocateNode(SDNode *N) {
	// If we have operands, deallocate them.
	removeOperands(N);

	NodeAllocator.Deallocate(AllNodes.remove(N));

	// Set the opcode to DELETED_NODE to help catch bugs when node
	// memory is reallocated.
	// FIXME: There are places in SDag that have grown a dependency on the opcode
	// value in the released node.
	// The opcode field is unpoisoned for AddressSanitizer first, because the
	// write below happens after the node was handed back to the allocator.
	__asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
	N->NodeType = ISD::DELETED_NODE;

	// If any of the SDDbgValue nodes refer to this SDNode, invalidate
	// them and forget about that node.
	DbgInfo->erase(N);
	}

	#ifndef NDEBUG
	/// VerifySDNode - Sanity check the given SDNode. Aborts (through assert) if
	/// it is invalid.
	///
	/// Only BUILD_PAIR and BUILD_VECTOR nodes are inspected; every other opcode
	/// is accepted without further checks.
	static void VerifySDNode(SDNode *N) {
	  switch (N->getOpcode()) {
	  default:
	    break;
	  case ISD::BUILD_PAIR: {
	    // One scalar integer/FP result built from two operands of identical
	    // type, each exactly half as wide as the result.
	    EVT VT = N->getValueType(0);
	    assert(N->getNumValues() == 1 && "Too many results!");
	    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
	           "Wrong return type!");
	    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
	    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
	           "Mismatched operand types!");
	    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
	           "Wrong operand type!");
	    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
	           "Wrong return type size");
	    break;
	  }
	  case ISD::BUILD_VECTOR: {
	    // One vector result with one operand per lane.
	    assert(N->getNumValues() == 1 && "Too many results!");
	    assert(N->getValueType(0).isVector() && "Wrong return type!");
	    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
	           "Wrong number of operands!");
	    EVT EltVT = N->getValueType(0).getVectorElementType();
	    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
	      // An operand either has the element type, or (integers only) a type
	      // at least as wide as the element type.
	      assert((I->getValueType() == EltVT ||
	              (EltVT.isInteger() && I->getValueType().isInteger() &&
	               EltVT.bitsLE(I->getValueType()))) &&
	             "Wrong operand type!");
	      assert(I->getValueType() == N->getOperand(0).getValueType() &&
	             "Operands must all have the same type");
	    }
	    break;
	  }
	  }
	}
	#endif // NDEBUG

	/// \brief Register a freshly allocated node with the DAG.
	///
	/// Appends \p N to the AllNodes list. In builds with assertions enabled the
	/// node is also given the next persistent id and run through VerifySDNode.
	void SelectionDAG::InsertNode(SDNode *N) {
	  AllNodes.push_back(N);
	#ifndef NDEBUG
	  // Debug-only bookkeeping: stable id plus structural sanity check.
	  N->PersistentId = NextPersistentId++;
	  VerifySDNode(N);
	#endif
	}

	/// RemoveNodeFromCSEMaps - Unregister \p N from the uniquing table that
	/// corresponds to its opcode. This is done before a node is deleted or
	/// repurposed, so that later requests for a structurally identical node no
	/// longer return \p N.
	///
	/// \returns true if an entry was actually removed.
	bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
	  bool Erased = false;
	  switch (N->getOpcode()) {
	  case ISD::HANDLENODE:
	    // Handle nodes live in no table; nothing to do.
	    return false;
	  case ISD::CONDCODE: {
	    auto CC = cast<CondCodeSDNode>(N)->get();
	    assert(CondCodeNodes[CC] &&
	           "Cond code doesn't exist!");
	    Erased = CondCodeNodes[CC] != nullptr;
	    CondCodeNodes[CC] = nullptr;
	    break;
	  }
	  case ISD::ExternalSymbol:
	    Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
	    break;
	  case ISD::TargetExternalSymbol: {
	    // Target external symbols are keyed by (symbol name, target flags).
	    ExternalSymbolSDNode *SymNode = cast<ExternalSymbolSDNode>(N);
	    std::pair<std::string,unsigned char> Key(SymNode->getSymbol(),
	                                             SymNode->getTargetFlags());
	    Erased = TargetExternalSymbols.erase(Key);
	    break;
	  }
	  case ISD::MCSymbol: {
	    auto *SymNode = cast<MCSymbolSDNode>(N);
	    Erased = MCSymbols.erase(SymNode->getMCSymbol());
	    break;
	  }
	  case ISD::VALUETYPE: {
	    // Extended types live in a map, simple types in an indexed table.
	    EVT VT = cast<VTSDNode>(N)->getVT();
	    if (!VT.isExtended()) {
	      Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
	      ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
	    } else {
	      Erased = ExtendedValueTypeNodes.erase(VT);
	    }
	    break;
	  }
	  default:
	    // Everything else is looked up in the general CSE map.
	    assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
	    assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
	    Erased = CSEMap.RemoveNode(N);
	    break;
	  }
	#ifndef NDEBUG
	  // A node that was not found must be one that is never CSE'd: its last
	  // result is glue, it is a machine opcode, or doNotCSE() says so.
	  // Anything else indicates a bookkeeping error.
	  if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
	      !N->isMachineOpcode() && !doNotCSE(N)) {
	    N->dump(this);
	    dbgs() << "\n";
	    llvm_unreachable("Node is not in map!");
	  }
	#endif
	  return Erased;
	}

	/// AddModifiedNodeToCSEMaps - \p N was taken out of the CSE maps and edited
	/// in place. Put it back, unless an identical node is already registered; in
	/// that case all users of \p N are moved to the existing node and \p N is
	/// deleted. Moving the users can trigger further recursive merging.
	void
	SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
	  // Nodes that are never CSE'd are treated as if no twin exists.
	  SDNode *Canonical = N;
	  if (!doNotCSE(N))
	    Canonical = CSEMap.GetOrInsertNode(N);

	  if (Canonical == N) {
	    // No pre-existing twin: the node was simply updated. Tell listeners.
	    for (DAGUpdateListener *L = UpdateListeners; L; L = L->Next)
	      L->NodeUpdated(N);
	    return;
	  }

	  // A matching node already existed: redirect every use of N to it. This
	  // can cause recursive merging of other unrelated nodes down the line.
	  ReplaceAllUsesWith(N, Canonical);

	  // N is dead now. Notify listeners, then delete it.
	  for (DAGUpdateListener *L = UpdateListeners; L; L = L->Next)
	    L->NodeDeleted(N, Canonical);
	  DeleteNodeNotInCSEMaps(N);
	}

	/// FindModifiedNodeSlot - Find a slot for the specified node if its single
	/// operand were replaced with \p Op. If this node is never memoized,
	/// return null, otherwise return the node that already exists with that
	/// operand (or null if there is none). \p InsertPos is passed through to
	/// FindNodeOrInsertPos to receive the insertion position.
	///
	/// When an existing node is found, its flags are intersected with those of
	/// \p N.
	SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
	                                           void *&InsertPos) {
	  if (doNotCSE(N))
	    return nullptr;

	  // Profile N as it would look with the replacement operand.
	  SDValue Ops[] = { Op };
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
	  AddNodeIDCustom(ID, N);
	  SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
	  if (Node)
	    Node->intersectFlagsWith(N->getFlags());
	  return Node;
	}

	/// FindModifiedNodeSlot - Find a slot for the specified node if its two
	/// operands were replaced with \p Op1 and \p Op2. If this node is never
	/// memoized, return null, otherwise return the node that already exists
	/// with those operands (or null if there is none). \p InsertPos is passed
	/// through to FindNodeOrInsertPos to receive the insertion position.
	///
	/// When an existing node is found, its flags are intersected with those of
	/// \p N.
	SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
	                                           SDValue Op1, SDValue Op2,
	                                           void *&InsertPos) {
	  if (doNotCSE(N))
	    return nullptr;

	  // Profile N as it would look with the replacement operands.
	  SDValue Ops[] = { Op1, Op2 };
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
	  AddNodeIDCustom(ID, N);
	  SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
	  if (Node)
	    Node->intersectFlagsWith(N->getFlags());
	  return Node;
	}

	/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
	/// were replaced with \p Ops. If this node is never memoized, return null,
	/// otherwise return the node that already exists with those operands (or
	/// null if there is none). \p InsertPos is passed through to
	/// FindNodeOrInsertPos to receive the insertion position.
	///
	/// When an existing node is found, its flags are intersected with those of
	/// \p N.
	SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
	                                           void *&InsertPos) {
	  if (doNotCSE(N))
	    return nullptr;

	  // Profile N as it would look with the replacement operands.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
	  AddNodeIDCustom(ID, N);
	  SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
	  if (Node)
	    Node->intersectFlagsWith(N->getFlags());
	  return Node;
	}

	/// Return the ABI alignment the data layout reports for the IR type that
	/// corresponds to \p VT. MVT::iPTR is mapped to an i8 pointer in address
	/// space 0.
	unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
	  Type *IRTy;
	  if (VT == MVT::iPTR)
	    IRTy = PointerType::get(Type::getInt8Ty(*getContext()), 0);
	  else
	    IRTy = VT.getTypeForEVT(*getContext());

	  return getDataLayout().getABITypeAlignment(IRTy);
	}

	/// Construct a DAG for target machine \p tm at optimization level \p OL.
	/// The entry node is created as an EntryToken producing MVT::Other and
	/// becomes the initial root.
	// EntryNode could meaningfully have debug info if we can find it...
	SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
	: TM(tm), OptLevel(OL),
	EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
	Root(getEntryNode()) {
	// Register the entry node in the AllNodes list.
	InsertNode(&EntryNode);
	// Owned by this object; released in the destructor.
	DbgInfo = new SDDbgInfo();
	}

	/// Bind this DAG to the machine function \p NewMF and cache the objects
	/// derived from it: the LLVM context, the target lowering and the
	/// SelectionDAG target info. The remark emitter and owning pass are
	/// recorded as given.
	void SelectionDAG::init(MachineFunction &NewMF,
	                        OptimizationRemarkEmitter &NewORE,
	                        Pass *PassPtr) {
	  // MF is set first; the lookups below read it.
	  MF = &NewMF;
	  Context = &MF->getFunction().getContext();
	  TLI = getSubtarget().getTargetLowering();
	  TSI = getSubtarget().getSelectionDAGInfo();

	  ORE = &NewORE;
	  SDAGISelPass = PassPtr;
	}

	/// Tear the DAG down: all update listeners must already be unregistered.
	SelectionDAG::~SelectionDAG() {
	assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
	// Deallocate every node other than the embedded entry node.
	allnodes_clear();
	// Clear the operand recycler against its backing allocator.
	OperandRecycler.clear(OperandAllocator);
	// Allocated with new in the constructor.
	delete DbgInfo;
	}

	void SelectionDAG::allnodes_clear() {
	assert(&*AllNodes.begin() == &EntryNode);
	AllNodes.remove(AllNodes.begin());
	while (!AllNodes.empty())
	DeallocateNode(&AllNodes.front());
	#ifndef NDEBUG
	NextPersistentId = 0;
	#endif
	}

	/// Look \p ID up in the CSE map, passing \p InsertPos through to the map.
	///
	/// This overload carries no debug location and therefore must not be used
	/// for Constant / ConstantFP nodes; finding one is a fatal error.
	SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
	                                          void *&InsertPos) {
	  SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
	  if (!Found)
	    return Found;

	  const unsigned Opc = Found->getOpcode();
	  if (Opc == ISD::Constant || Opc == ISD::ConstantFP)
	    llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
	                     "debug location. Use another overload.");
	  return Found;
	}

	/// Look \p ID up in the CSE map, passing \p InsertPos through to the map,
	/// and reconcile the debug location of a node that is found with the
	/// location \p DL of the new use.
	SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
	                                          const SDLoc &DL, void *&InsertPos) {
	  SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
	  if (!Found)
	    return Found;

	  const unsigned Opc = Found->getOpcode();
	  if (Opc == ISD::Constant || Opc == ISD::ConstantFP) {
	    // A constant used from several places gets its debug location erased
	    // rather than propagating one location to all uses, which would make
	    // single-stepping in a debugger worse.
	    if (Found->getDebugLoc() != DL.getDebugLoc())
	      Found->setDebugLoc(DebugLoc());
	  } else if (DL.getIROrder() && DL.getIROrder() < Found->getIROrder()) {
	    // The new use sits earlier in the instruction sequence than the prior
	    // one: move the node's debug info to the earlier location.
	    Found->setDebugLoc(DL.getDebugLoc());
	  }
	  return Found;
	}

	/// Reset the DAG to its freshly constructed state: all nodes are released,
	/// every uniquing table is emptied, and the entry node is re-inserted and
	/// made the root again.
	void SelectionDAG::clear() {
	// Deallocate all nodes and reset the operand storage.
	allnodes_clear();
	OperandRecycler.clear(OperandAllocator);
	OperandAllocator.Reset();
	CSEMap.clear();

	// Empty the side tables used for nodes that are uniqued outside CSEMap.
	ExtendedValueTypeNodes.clear();
	ExternalSymbols.clear();
	TargetExternalSymbols.clear();
	MCSymbols.clear();
	// The indexed tables keep their size; their slots are nulled out.
	std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
	static_cast<CondCodeSDNode*>(nullptr));
	std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
	static_cast<SDNode*>(nullptr));

	// The entry node is a member, not allocated: clear its use list and put
	// it back into the (now empty) node list.
	EntryNode.UseList = nullptr;
	InsertNode(&EntryNode);
	Root = getEntryNode();
	DbgInfo->clear();
	}

	/// Convert \p Op to the floating-point type \p VT: FP_EXTEND when \p VT is
	/// wider than the operand type, otherwise FP_ROUND with a zero second
	/// operand.
	SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::FP_EXTEND, DL, VT, Op);
	  return getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
	}

	/// Convert \p Op to \p VT: ANY_EXTEND when \p VT is wider than the operand
	/// type, TRUNCATE otherwise.
	SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::ANY_EXTEND, DL, VT, Op);
	  return getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	/// Convert \p Op to \p VT: SIGN_EXTEND when \p VT is wider than the operand
	/// type, TRUNCATE otherwise.
	SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::SIGN_EXTEND, DL, VT, Op);
	  return getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	/// Convert \p Op to \p VT: ZERO_EXTEND when \p VT is wider than the operand
	/// type, TRUNCATE otherwise.
	SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::ZERO_EXTEND, DL, VT, Op);
	  return getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	/// Convert the boolean \p Op to \p VT. If \p VT is no wider than the operand
	/// type the value is truncated; otherwise it is extended with the extension
	/// opcode the target associates with the boolean contents of \p OpVT.
	SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
	                                        EVT OpVT) {
	  if (!VT.bitsLE(Op.getValueType())) {
	    TargetLowering::BooleanContent Contents = TLI->getBooleanContents(OpVT);
	    return getNode(TLI->getExtendForContent(Contents), SL, VT, Op);
	  }

	  return getNode(ISD::TRUNCATE, SL, VT, Op);
	}

	/// Clear every bit of \p Op above the low VT.getSizeInBits() bits of each
	/// scalar by AND-ing with a mask. \p VT must be a scalar type; if it already
	/// equals the operand's scalar type, \p Op is returned unchanged.
	SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
	  assert(!VT.isVector() &&
	         "getZeroExtendInReg should use the vector element type instead of "
	         "the vector type!");
	  EVT OpVT = Op.getValueType();
	  if (OpVT.getScalarType() == VT)
	    return Op;

	  // Mask with the low bits of VT set, as wide as one scalar of Op.
	  APInt LowMask = APInt::getLowBitsSet(Op.getScalarValueSizeInBits(),
	                                       VT.getSizeInBits());
	  SDValue MaskNode = getConstant(LowMask, DL, OpVT);
	  return getNode(ISD::AND, DL, OpVT, Op, MaskNode);
	}

	/// Build an ANY_EXTEND_VECTOR_INREG of \p Op producing \p VT.
	///
	/// Asserted preconditions: \p VT is a vector, has the same total bit size as
	/// \p Op, and has strictly fewer lanes than \p Op.
	SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL,
	                                              EVT VT) {
	  assert(VT.isVector() && "This DAG node is restricted to vector types.");
	  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
	         "The sizes of the input and result must match in order to perform the "
	         "extend in-register.");
	  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
	         "The destination vector type must have fewer lanes than the input.");

	  return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
	}

	/// Build a SIGN_EXTEND_VECTOR_INREG of \p Op producing \p VT.
	///
	/// Asserted preconditions: \p VT is a vector, has the same total bit size as
	/// \p Op, and has strictly fewer lanes than \p Op.
	SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL,
	                                               EVT VT) {
	  assert(VT.isVector() && "This DAG node is restricted to vector types.");
	  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
	         "The sizes of the input and result must match in order to perform the "
	         "extend in-register.");
	  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
	         "The destination vector type must have fewer lanes than the input.");

	  return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
	}

	/// Build a ZERO_EXTEND_VECTOR_INREG of \p Op producing \p VT.
	///
	/// Asserted preconditions: \p VT is a vector, has the same total bit size as
	/// \p Op, and has strictly fewer lanes than \p Op.
	SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
	                                               EVT VT) {
	  assert(VT.isVector() && "This DAG node is restricted to vector types.");
	  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
	         "The sizes of the input and result must match in order to perform the "
	         "extend in-register.");
	  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
	         "The destination vector type must have fewer lanes than the input.");

	  return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op);
	}

	/// getNOT - Build a bitwise NOT of \p Val, expressed as (XOR Val, -1) where
	/// -1 is an all-ones constant of type \p VT.
	SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
	  unsigned EltBits = VT.getScalarType().getSizeInBits();
	  SDValue AllOnes = getConstant(APInt::getAllOnesValue(EltBits), DL, VT);
	  return getNode(ISD::XOR, DL, VT, Val, AllOnes);
	}

	/// Build a logical NOT of the boolean \p Val as (XOR Val, true), where the
	/// encoding of "true" follows the target's boolean contents for \p VT:
	/// all-ones for ZeroOrNegativeOne, the constant 1 otherwise.
	SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
	  SDValue TrueValue;
	  switch (TLI->getBooleanContents(VT)) {
	  case TargetLowering::ZeroOrNegativeOneBooleanContent: {
	    unsigned EltBits = VT.getScalarType().getSizeInBits();
	    TrueValue = getConstant(APInt::getAllOnesValue(EltBits), DL, VT);
	    break;
	  }
	  case TargetLowering::ZeroOrOneBooleanContent:
	  case TargetLowering::UndefinedBooleanContent:
	    TrueValue = getConstant(1, DL, VT);
	    break;
	  }
	  return getNode(ISD::XOR, DL, VT, Val, TrueValue);
	}

	/// Build an integer constant of type \p VT from the 64-bit value \p Val by
	/// wrapping it in an APInt of the scalar element width and forwarding to
	/// the APInt overload. \p isT and \p isO are passed through unchanged.
	///
	/// Asserts that \p Val fits the element type: for element types narrower
	/// than 64 bits, the bits above the element width must be all zeros or all
	/// ones.
	SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
	                                  bool isT, bool isO) {
	  EVT EltVT = VT.getScalarType();
	  // Arithmetic shift leaves 0 or -1 when the value fits; adding 1 maps
	  // those to 0 or 1, hence the "< 2" test.
	  assert((EltVT.getSizeInBits() >= 64 ||
	          (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
	         "getConstant with a uint64_t value that doesn't fit in the type!");
	  return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO);
	}

	/// Build an integer constant of type \p VT from the APInt \p Val by creating
	/// a ConstantInt in this DAG's LLVM context and forwarding to the
	/// ConstantInt overload. \p isT and \p isO are passed through unchanged.
	SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
	                                  bool isT, bool isO) {
	  // Context is stored as a pointer and ConstantInt::get returns a pointer;
	  // both are dereferenced because the callee takes references.
	  return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO);
	}

	/// Build (or reuse) an integer constant node of type \p VT holding \p Val.
	///
	/// \p isT selects TargetConstant over Constant as the opcode; \p isO is
	/// added to the node's CSE profile and passed to the ConstantSDNode
	/// constructor. For a vector \p VT the scalar constant is splatted into a
	/// build vector. When the element type must be promoted or expanded for the
	/// target, the value is first rewritten as described in the comments below.
	SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
	EVT VT, bool isT, bool isO) {
	assert(VT.isInteger() && "Cannot create FP integer constant!");

	EVT EltVT = VT.getScalarType();
	const ConstantInt *Elt = &Val;

	// In some cases the vector type is legal but the element type is illegal and
	// needs to be promoted, for example v8i8 on ARM. In this case, promote the
	// inserted value (the type does not need to match the vector element type).
	// Any extra bits introduced will be truncated away.
	if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
	TargetLowering::TypePromoteInteger) {
	EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
	APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
	Elt = ConstantInt::get(*getContext(), NewVal);
	}
	// In other cases the element type is illegal and needs to be expanded, for
	// example v2i64 on MIPS32. In this case, find the nearest legal type, split
	// the value into n parts and use a vector type with n-times the elements.
	// Then bitcast to the type requested.
	// Legalizing constants too early makes the DAGCombiner's job harder so we
	// only legalize if the DAG tells us we must produce legal types.
	else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
	TLI->getTypeAction(*getContext(), EltVT) ==
	TargetLowering::TypeExpandInteger) {
	const APInt &NewVal = Elt->getValue();
	EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
	unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
	unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
	EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);

	// Check the temporary vector is the correct size. If this fails then
	// getTypeToTransformTo() probably returned a type whose size (in bits)
	// isn't a power-of-2 factor of the requested type size.
	assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());

	// Slice the value into ViaEltSizeInBits-wide pieces, lowest bits first.
	SmallVector<SDValue, 2> EltParts;
	for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
	EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
	.zextOrTrunc(ViaEltSizeInBits), DL,
	ViaEltVT, isT, isO));
	}

	// EltParts is currently in little endian order. If we actually want
	// big-endian order then reverse it now.
	if (getDataLayout().isBigEndian())
	std::reverse(EltParts.begin(), EltParts.end());

	// The elements must be reversed when the element order is different
	// to the endianness of the elements (because the BITCAST is itself a
	// vector shuffle in this situation). However, we do not need any code to
	// perform this reversal because getConstant() is producing a vector
	// splat.
	// This situation occurs in MIPS MSA.

	// Repeat the pieces once per lane of the requested vector type.
	SmallVector<SDValue, 8> Ops;
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
	Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());

	SDValue V = getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
	return V;
	}

	assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
	"APInt size does not match type size!");
	unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
	// Profile the scalar constant: opcode, element type, the ConstantInt
	// pointer and the isO flag.
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
	ID.AddPointer(Elt);
	ID.AddBoolean(isO);
	void *IP = nullptr;
	SDNode *N = nullptr;
	// An existing scalar node can be returned directly; for vectors it is
	// kept in N and splatted below.
	if ((N = FindNodeOrInsertPos(ID, DL, IP)))
	if (!VT.isVector())
	return SDValue(N, 0);

	if (!N) {
	N = newSDNode<ConstantSDNode>(isT, isO, Elt, DL.getDebugLoc(), EltVT);
	CSEMap.InsertNode(N, IP);
	InsertNode(N);
	NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
	}

	SDValue Result(N, 0);
	if (VT.isVector())
	Result = getSplatBuildVector(VT, DL, Result);

	return Result;
	}

	/// Build the integer constant \p Val using the pointer type the target
	/// lowering reports for the current data layout. \p isTarget is forwarded
	/// to getConstant.
	SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
	                                        bool isTarget) {
	  EVT PtrVT = TLI->getPointerTy(getDataLayout());
	  return getConstant(Val, DL, PtrVT, isTarget);
	}

	/// Build a floating-point constant of type \p VT from the APFloat \p V by
	/// creating a ConstantFP in this DAG's LLVM context and forwarding to the
	/// ConstantFP overload. \p isTarget is passed through unchanged.
	SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
	                                    bool isTarget) {
	  // getContext() and ConstantFP::get both yield pointers; they are
	  // dereferenced because the callee takes references.
	  return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
	}

	/// Build (or reuse) a floating-point constant node of type \p VT holding
	/// \p V. \p isTarget selects TargetConstantFP over ConstantFP. For a vector
	/// \p VT the scalar constant is splatted into a build vector.
	SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
	                                    EVT VT, bool isTarget) {
	  assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");

	  EVT EltVT = VT.getScalarType();
	  const bool IsVec = VT.isVector();

	  // The lookup key is the ConstantFP pointer, i.e. the actual bit pattern
	  // of the value, so 0.0 and -0.0 stay distinct and SNaNs cause no
	  // trouble.
	  unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, Opc, getVTList(EltVT), None);
	  Profile.AddPointer(&V);
	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, DL, InsertPos);

	  // A scalar hit can be handed back directly.
	  if (Node && !IsVec)
	    return SDValue(Node, 0);

	  if (!Node) {
	    Node = newSDNode<ConstantFPSDNode>(isTarget, &V, DL.getDebugLoc(), EltVT);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }

	  SDValue Result(Node, 0);
	  if (IsVec)
	    Result = getSplatBuildVector(VT, DL, Result);
	  NewSDValueDbgMsg(Result, "Creating fp constant: ", this);
	  return Result;
	}

	/// Build a floating-point constant of type \p VT from the host double
	/// \p Val.
	///
	/// f32 and f64 element types are built directly (f32 via a cast to float).
	/// f80, f128, ppcf128 and f16 are converted from the double with
	/// round-to-nearest-even; the loses-info flag reported by the conversion is
	/// ignored. Any other element type is a fatal error.
	SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
	                                    bool isTarget) {
	  EVT EltVT = VT.getScalarType();
	  if (EltVT == MVT::f32)
	    return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
	  else if (EltVT == MVT::f64)
	    return getConstantFP(APFloat(Val), DL, VT, isTarget);
	  else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
	           EltVT == MVT::f16) {
	    bool Ignored;
	    APFloat APF = APFloat(Val);
	    APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
	                &Ignored);
	    return getConstantFP(APF, DL, VT, isTarget);
	  } else
	    llvm_unreachable("Unsupported type in getConstantFP");
	}

	/// Build (or reuse) a node for the address of \p GV plus \p Offset.
	///
	/// The opcode is one of GlobalAddress, GlobalTLSAddress or their Target
	/// variants, chosen from \p isTargetGA and whether \p GV is thread-local.
	/// \p TargetFlags may only be non-zero for target nodes (asserted).
	SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
	                                       EVT VT, int64_t Offset, bool isTargetGA,
	                                       unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTargetGA) &&
	         "Cannot set target flags on target-independent globals");

	  // Truncate (with sign-extension) the offset value to the pointer size.
	  unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
	  if (BitWidth < 64)
	    Offset = SignExtend64(Offset, BitWidth);

	  unsigned Opc;
	  if (GV->isThreadLocal())
	    Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
	  else
	    Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;

	  // Uniquing key: opcode, type, global, offset and target flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddPointer(GV);
	  ID.AddInteger(Offset);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<GlobalAddressSDNode>(
	      Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Build (or reuse) a FrameIndex node (TargetFrameIndex when \p isTarget)
	/// of type \p VT for frame index \p FI.
	SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
	  unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;

	  // Uniquing key: opcode, type and the frame index.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, Opc, getVTList(VT), None);
	  Profile.AddInteger(FI);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Build (or reuse) a JumpTable node (TargetJumpTable when \p isTarget) of
	/// type \p VT for jump table index \p JTI. \p TargetFlags may only be
	/// non-zero for target nodes (asserted).
	SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
	                                   unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTarget) &&
	         "Cannot set target flags on target-independent jump tables");
	  unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;

	  // Uniquing key: opcode, type, table index and target flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddInteger(JTI);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Build (or reuse) a ConstantPool node (TargetConstantPool when
	/// \p isTarget) of type \p VT referring to the IR constant \p C.
	///
	/// An \p Alignment of 0 requests a default: the ABI alignment of the
	/// constant's type when the function is optimized for size, otherwise the
	/// preferred alignment. \p TargetFlags may only be non-zero for target
	/// nodes (asserted).
	SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
	                                      unsigned Alignment, int Offset,
	                                      bool isTarget,
	                                      unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTarget) &&
	         "Cannot set target flags on target-independent globals");
	  if (Alignment == 0)
	    Alignment = MF->getFunction().optForSize()
	                    ? getDataLayout().getABITypeAlignment(C->getType())
	                    : getDataLayout().getPrefTypeAlignment(C->getType());
	  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;

	  // Uniquing key: opcode, type, alignment, offset, constant and flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddInteger(Alignment);
	  ID.AddInteger(Offset);
	  ID.AddPointer(C);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
	                                          TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Build (or reuse) a ConstantPool node (TargetConstantPool when
	/// \p isTarget) of type \p VT referring to the machine constant-pool value
	/// \p C.
	///
	/// An \p Alignment of 0 requests the preferred alignment of the value's
	/// type. \p TargetFlags may only be non-zero for target nodes (asserted).
	SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
	                                      unsigned Alignment, int Offset,
	                                      bool isTarget,
	                                      unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTarget) &&
	         "Cannot set target flags on target-independent globals");
	  if (Alignment == 0)
	    Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
	  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;

	  // Uniquing key: opcode, type, alignment, offset, whatever the value
	  // itself contributes to the profile, and the target flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddInteger(Alignment);
	  ID.AddInteger(Offset);
	  C->addSelectionDAGCSEId(ID);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
	                                          TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Build (or reuse) a TargetIndex node of type \p VT for \p Index with the
	/// given \p Offset and \p TargetFlags.
	SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
	                                     unsigned char TargetFlags) {
	  // Uniquing key: opcode, type, index, offset and target flags.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::TargetIndex, getVTList(VT), None);
	  Profile.AddInteger(Index);
	  Profile.AddInteger(Offset);
	  Profile.AddInteger(TargetFlags);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Build (or reuse) a BasicBlock node, of type MVT::Other, that refers to
	/// \p MBB.
	SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
	  // Uniquing key: opcode, type and the block pointer.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::BasicBlock, getVTList(MVT::Other), None);
	  Profile.AddPointer(MBB);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<BasicBlockSDNode>(MBB);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Return the unique VTSDNode for \p VT, creating it on first request.
	/// Simple types are kept in an indexed table that is grown as needed;
	/// extended types are kept in a map.
	SDValue SelectionDAG::getValueType(EVT VT) {
	  // Make sure the indexed table has a slot for this simple type.
	  if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
	      ValueTypeNodes.size())
	    ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);

	  SDNode *&Slot = VT.isExtended() ?
	    ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];

	  if (!Slot) {
	    Slot = newSDNode<VTSDNode>(VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the ExternalSymbol node registered under \p Sym, creating it with
	/// type \p VT on first request.
	SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
	  SDNode *&Slot = ExternalSymbols[Sym];
	  if (!Slot) {
	    // Not a target node, target flags 0.
	    Slot = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the MCSymbol node registered for \p Sym, creating it with type
	/// \p VT on first request.
	SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
	  SDNode *&Slot = MCSymbols[Sym];
	  if (!Slot) {
	    Slot = newSDNode<MCSymbolSDNode>(Sym, VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the TargetExternalSymbol node registered under the pair
	/// (\p Sym, \p TargetFlags), creating it with type \p VT on first request.
	SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
	                                              unsigned char TargetFlags) {
	  // Target symbols are uniqued on name and flags together.
	  std::pair<std::string,unsigned char> Key(Sym, TargetFlags);
	  SDNode *&Slot = TargetExternalSymbols[Key];
	  if (!Slot) {
	    Slot = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the unique CondCode node for \p Cond, creating it on first
	/// request. The backing table is grown when \p Cond is out of range.
	SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
	  if ((unsigned)Cond >= CondCodeNodes.size())
	    CondCodeNodes.resize(Cond+1);

	  // Taken after the resize so the reference stays valid.
	  auto &Slot = CondCodeNodes[Cond];
	  if (!Slot) {
	    auto *Fresh = newSDNode<CondCodeSDNode>(Cond);
	    Slot = Fresh;
	    InsertNode(Fresh);
	  }

	  return SDValue(Slot, 0);
	}

	/// Exchange the two shuffle inputs \p N1 and \p N2 and rewrite the mask
	/// \p M to match: indices that selected from N1 now select from N2 and vice
	/// versa.
	static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
	  // Swap the operands first, then commute the mask.
	  std::swap(N1, N2);
	  ShuffleVectorSDNode::commuteMask(M);
	}

	/// Build (or reuse) a VECTOR_SHUFFLE of \p N1 and \p N2 with result type
	/// \p VT, selecting lanes according to \p Mask.
	///
	/// Mask entries index the concatenation of N1 and N2 (0..2*NElts-1), with
	/// -1 meaning undef. Before a node is created the operands and mask are
	/// canonicalized and several trivial cases are folded away, so the result
	/// may be an UNDEF node, one of the inputs, a splat build vector, or a
	/// bitcast of one, rather than a shuffle node.
	SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
	                                       SDValue N2, ArrayRef<int> Mask) {
	  assert(VT.getVectorNumElements() == Mask.size() &&
	         "Must have the same number of vector elements as mask elements!");
	  assert(VT == N1.getValueType() && VT == N2.getValueType() &&
	         "Invalid VECTOR_SHUFFLE");

	  // Canonicalize shuffle undef, undef -> undef
	  if (N1.isUndef() && N2.isUndef())
	    return getUNDEF(VT);

	  // Validate that all indices in Mask are within the range of the elements
	  // input to the shuffle.
	  int NElts = Mask.size();
	  assert(llvm::all_of(Mask,
	                      [&](int M) { return M < (NElts * 2) && M >= -1; }) &&
	         "Index out of range");

	  // Copy the mask so we can do any needed cleanup.
	  SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());

	  // Canonicalize shuffle v, v -> v, undef
	  if (N1 == N2) {
	    N2 = getUNDEF(VT);
	    for (int i = 0; i != NElts; ++i)
	      if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
	  }

	  // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
	  if (N1.isUndef())
	    commuteShuffle(N1, N2, MaskVec);

	  // If shuffling a splat, try to blend the splat instead. We do this here so
	  // that even when this arises during lowering we don't have to re-handle it.
	  auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
	    BitVector UndefElements;
	    SDValue Splat = BV->getSplatValue(&UndefElements);
	    if (!Splat)
	      return;

	    for (int i = 0; i < NElts; ++i) {
	      // Skip mask entries that do not select from this input.
	      if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
	        continue;

	      // If this input comes from undef, mark it as such.
	      if (UndefElements[MaskVec[i] - Offset]) {
	        MaskVec[i] = -1;
	        continue;
	      }

	      // If we can blend a non-undef lane, use that instead.
	      if (!UndefElements[i])
	        MaskVec[i] = i + Offset;
	    }
	  };
	  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
	    BlendSplat(N1BV, 0);
	  if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
	    BlendSplat(N2BV, NElts);

	  // Canonicalize all index into lhs, -> shuffle lhs, undef
	  // Canonicalize all index into rhs, -> shuffle rhs, undef
	  bool AllLHS = true, AllRHS = true;
	  bool N2Undef = N2.isUndef();
	  for (int i = 0; i != NElts; ++i) {
	    if (MaskVec[i] >= NElts) {
	      if (N2Undef)
	        MaskVec[i] = -1;
	      else
	        AllLHS = false;
	    } else if (MaskVec[i] >= 0) {
	      AllRHS = false;
	    }
	  }
	  // Both flags still set means every mask entry is undef.
	  if (AllLHS && AllRHS)
	    return getUNDEF(VT);
	  if (AllLHS && !N2Undef)
	    N2 = getUNDEF(VT);
	  if (AllRHS) {
	    N1 = getUNDEF(VT);
	    commuteShuffle(N1, N2, MaskVec);
	  }
	  // Reset our undef status after accounting for the mask.
	  N2Undef = N2.isUndef();
	  // Re-check whether both sides ended up undef.
	  if (N1.isUndef() && N2Undef)
	    return getUNDEF(VT);

	  // If Identity shuffle return that node.
	  bool Identity = true, AllSame = true;
	  for (int i = 0; i != NElts; ++i) {
	    if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
	    if (MaskVec[i] != MaskVec[0]) AllSame = false;
	  }
	  if (Identity && NElts)
	    return N1;

	  // Shuffling a constant splat doesn't change the result.
	  if (N2Undef) {
	    SDValue V = N1;

	    // Look through any bitcasts. We check that these don't change the number
	    // (and size) of elements and just changes their types.
	    while (V.getOpcode() == ISD::BITCAST)
	      V = V->getOperand(0);

	    // A splat should always show up as a build vector node.
	    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
	      BitVector UndefElements;
	      SDValue Splat = BV->getSplatValue(&UndefElements);
	      // If this is a splat of an undef, shuffling it is also undef.
	      if (Splat && Splat.isUndef())
	        return getUNDEF(VT);

	      bool SameNumElts =
	          V.getValueType().getVectorNumElements() == VT.getVectorNumElements();

	      // We only have a splat which can skip shuffles if there is a splatted
	      // value and no undef lanes rearranged by the shuffle.
	      if (Splat && UndefElements.none()) {
	        // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
	        // number of elements match or the value splatted is a zero constant.
	        if (SameNumElts)
	          return N1;
	        if (auto *C = dyn_cast<ConstantSDNode>(Splat))
	          if (C->isNullValue())
	            return N1;
	      }

	      // If the shuffle itself creates a splat, build the vector directly.
	      if (AllSame && SameNumElts) {
	        EVT BuildVT = BV->getValueType(0);
	        const SDValue &Splatted = BV->getOperand(MaskVec[0]);
	        SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);

	        // We may have jumped through bitcasts, so the type of the
	        // BUILD_VECTOR may not match the type of the shuffle.
	        if (BuildVT != VT)
	          NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
	        return NewBV;
	      }
	    }
	  }

	  // Uniquing key: opcode, type, both operands and every mask entry.
	  FoldingSetNodeID ID;
	  SDValue Ops[2] = { N1, N2 };
	  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
	  for (int i = 0; i != NElts; ++i)
	    ID.AddInteger(MaskVec[i]);

	  void* IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
	    return SDValue(E, 0);

	  // Allocate the mask array for the node out of the BumpPtrAllocator, since
	  // SDNode doesn't have access to it. This memory will be "leaked" when
	  // the node is deallocated, but recovered when the NodeAllocator is released.
	  int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
	  std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc);

	  auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
	                                           dl.getDebugLoc(), MaskAlloc);
	  createOperands(N, Ops);

	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  SDValue V = SDValue(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Build a vector shuffle equivalent to \p SV but with its two vector
	/// operands swapped. The mask is copied, passed through
	/// ShuffleVectorSDNode::commuteMask, and handed to getVectorShuffle together
	/// with the operands in reverse order.
	SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
	  // Take the result type as a general EVT instead of getSimpleValueType(), so
	  // that shuffles whose type is not a simple MVT can be commuted as well.
	  EVT VT = SV.getValueType(0);

	  // Work on a private copy of the mask; the node's own mask is left untouched.
	  SmallVector<int, 8> MaskVec(SV.getMask().begin(), SV.getMask().end());
	  ShuffleVectorSDNode::commuteMask(MaskVec);

	  SDValue Op0 = SV.getOperand(0);
	  SDValue Op1 = SV.getOperand(1);
	  return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec);
	}

	/// Return the ISD::Register node for register \p RegNo with value type
	/// \p VT. An existing node found through FindNodeOrInsertPos is reused;
	/// otherwise a new RegisterSDNode is created and registered in the CSE map.
	SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
	  // Profile of the requested node: opcode, value type list, register number.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::Register, getVTList(VT), None);
	  Profile.AddInteger(RegNo);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<RegisterSDNode>(RegNo, VT);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Return the ISD::RegisterMask node (type MVT::Untyped) wrapping the mask
	/// pointer \p RegMask. Nodes are keyed on the pointer value itself, so the
	/// same pointer always yields the same node.
	SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::RegisterMask, getVTList(MVT::Untyped), None);
	  Profile.AddPointer(RegMask);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    // No matching node yet: create one and remember it for later lookups.
	    Node = newSDNode<RegisterMaskSDNode>(RegMask);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Convenience wrapper: build a label node with opcode ISD::EH_LABEL for
	/// \p Label by forwarding to getLabelNode.
	SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
	                                 MCSymbol *Label) {
	  const unsigned Opcode = ISD::EH_LABEL;
	  return getLabelNode(Opcode, dl, Root, Label);
	}

	/// Return a label node with the given \p Opcode, result type MVT::Other,
	/// single operand \p Root and attached symbol \p Label. A node with the same
	/// profile is reused if FindNodeOrInsertPos finds one.
	SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl,
	                                   SDValue Root, MCSymbol *Label) {
	  SDValue Ops[] = { Root };

	  // The profile covers opcode, type, operand and the symbol pointer.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, Opcode, getVTList(MVT::Other), Ops);
	  Profile.AddPointer(Label);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *LabelNode =
	      newSDNode<LabelSDNode>(dl.getIROrder(), dl.getDebugLoc(), Label);
	  createOperands(LabelNode, Ops);

	  CSEMap.InsertNode(LabelNode, InsertPos);
	  InsertNode(LabelNode);
	  return SDValue(LabelNode, 0);
	}

	/// Return a block-address node for \p BA with the given \p Offset and
	/// \p TargetFlags. The opcode is ISD::TargetBlockAddress when \p isTarget is
	/// set and ISD::BlockAddress otherwise. Address, offset and flags are all
	/// part of the uniquing profile.
	SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
	                                      int64_t Offset,
	                                      bool isTarget,
	                                      unsigned char TargetFlags) {
	  unsigned Opcode = ISD::BlockAddress;
	  if (isTarget)
	    Opcode = ISD::TargetBlockAddress;

	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, Opcode, getVTList(VT), None);
	  Profile.AddPointer(BA);
	  Profile.AddInteger(Offset);
	  Profile.AddInteger(TargetFlags);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<BlockAddressSDNode>(Opcode, VT, BA, Offset, TargetFlags);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Return the ISD::SRCVALUE node (type MVT::Other) that wraps the IR value
	/// \p V. \p V may be null; when non-null it must have pointer type, which is
	/// checked by assertion.
	SDValue SelectionDAG::getSrcValue(const Value *V) {
	  assert((!V || V->getType()->isPointerTy()) &&
	         "SrcValue is not a pointer?");

	  // Nodes are uniqued on the Value pointer.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::SRCVALUE, getVTList(MVT::Other), None);
	  Profile.AddPointer(V);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<SrcValueSDNode>(V);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Return the ISD::MDNODE_SDNODE node (type MVT::Other) that wraps the
	/// metadata node \p MD, creating it on first request.
	SDValue SelectionDAG::getMDNode(const MDNode *MD) {
	  // Nodes are uniqued on the metadata pointer.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
	  Profile.AddPointer(MD);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(Profile, InsertPos);
	  if (!Node) {
	    Node = newSDNode<MDNodeSDNode>(MD);
	    CSEMap.InsertNode(Node, InsertPos);
	    InsertNode(Node);
	  }
	  return SDValue(Node, 0);
	}

	/// Return \p V reinterpreted as type \p VT. When \p V already has that type
	/// it is returned unchanged; otherwise an ISD::BITCAST node located at
	/// SDLoc(V) is built.
	SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
	  if (VT != V.getValueType())
	    return getNode(ISD::BITCAST, SDLoc(V), VT, V);

	  // Same type: nothing to cast.
	  return V;
	}

	/// Return an ISD::ADDRSPACECAST node of type \p VT that casts \p Ptr from
	/// address space \p SrcAS to \p DestAS. Both address spaces take part in the
	/// uniquing profile.
	SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
	                                       unsigned SrcAS, unsigned DestAS) {
	  SDValue Ops[] = {Ptr};

	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::ADDRSPACECAST, getVTList(VT), Ops);
	  Profile.AddInteger(SrcAS);
	  Profile.AddInteger(DestAS);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *Cast = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(),
	                                              dl.getDebugLoc(), VT, SrcAS,
	                                              DestAS);
	  createOperands(Cast, Ops);

	  CSEMap.InsertNode(Cast, InsertPos);
	  InsertNode(Cast);
	  return SDValue(Cast, 0);
	}

	/// Convert \p Op to the shift-amount type the target reports for a shifted
	/// value of type \p LHSTy. Operands that already have that type, and all
	/// vector operands, are returned unchanged; anything else is zero-extended
	/// or truncated to the shift-amount type.
	SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
	  EVT CurTy = Op.getValueType();
	  EVT WantedTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());

	  // Only a scalar of the wrong type needs converting.
	  if (CurTy != WantedTy && !CurTy.isVector())
	    return getZExtOrTrunc(Op, SDLoc(Op), WantedTy);

	  return Op;
	}

	/// Expand the va_arg node \p Node into explicit operations: load the current
	/// va_list pointer, round it up when the requested alignment (operand 3)
	/// exceeds the target's minimum stack argument alignment, store back the
	/// pointer advanced by the argument's alloc size, and finally load the
	/// argument from the (aligned) pointer.
	SDValue SelectionDAG::expandVAArg(SDNode *Node) {
	  SDLoc dl(Node);
	  const TargetLowering &TLI = getTargetLoweringInfo();
	  const Value *SrcVal = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
	  EVT ArgVT = Node->getValueType(0);
	  SDValue InChain = Node->getOperand(0);
	  SDValue ListPtr = Node->getOperand(1);
	  unsigned Align = Node->getConstantOperandVal(3);

	  // Current va_list pointer; result 1 of this load is its output chain.
	  SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, InChain,
	                               ListPtr, MachinePointerInfo(SrcVal));
	  SDValue VAList = VAListLoad;

	  if (Align > TLI.getMinStackArgumentAlignment()) {
	    assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");

	    // Round up: (VAList + Align - 1) & -Align.
	    SDValue AlignMinusOne = getConstant(Align - 1, dl, VAList.getValueType());
	    VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
	                     AlignMinusOne);

	    SDValue AlignMask =
	        getConstant(-(int64_t)Align, dl, VAList.getValueType());
	    VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, AlignMask);
	  }

	  // Advance the pointer past this argument.
	  SDValue ArgSize = getConstant(
	      getDataLayout().getTypeAllocSize(ArgVT.getTypeForEVT(*getContext())), dl,
	      VAList.getValueType());
	  SDValue NextVAList =
	      getNode(ISD::ADD, dl, VAList.getValueType(), VAList, ArgSize);

	  // Write the advanced pointer back through the va_list pointer operand.
	  SDValue StoreChain = getStore(VAListLoad.getValue(1), dl, NextVAList,
	                                ListPtr, MachinePointerInfo(SrcVal));

	  // Load the actual argument from the pre-increment pointer.
	  return getLoad(ArgVT, dl, StoreChain, VAList, MachinePointerInfo());
	}

	/// Default expansion of the va_copy node \p Node: load one pointer-sized
	/// value through operand 2 and store it through operand 1, returning the
	/// store (i.e. the resulting chain). Operands 3 and 4 supply the IR values
	/// used for the destination and source MachinePointerInfo respectively.
	SDValue SelectionDAG::expandVACopy(SDNode *Node) {
	  SDLoc dl(Node);
	  const TargetLowering &TLI = getTargetLoweringInfo();

	  const Value *DestVal = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
	  const Value *SrcVal = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();

	  SDValue Loaded =
	      getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0),
	              Node->getOperand(2), MachinePointerInfo(SrcVal));

	  // The store is chained on result 1 of the load.
	  SDValue LoadChain = Loaded.getValue(1);
	  return getStore(LoadChain, dl, Loaded, Node->getOperand(1),
	                  MachinePointerInfo(DestVal));
	}

	/// Create a stack object large enough to store a value of type \p VT and
	/// return a frame-index node referring to it.
	///
	/// The object's size is VT's store size. Its alignment is the larger of the
	/// data layout's preferred alignment for VT's IR type and \p minAlign.
	SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
	  MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
	  unsigned ByteSize = VT.getStoreSize();

	  // getTypeForEVT is called with the dereferenced context and its result is
	  // held by pointer, matching the getTypeForEVT(*getContext()) call in
	  // expandVAArg.
	  Type *Ty = VT.getTypeForEVT(*getContext());
	  unsigned StackAlign =
	      std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);

	  int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
	  return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
	}

	/// Create a stack object that can hold a value of either \p VT1 or \p VT2
	/// and return a frame-index node referring to it.
	///
	/// Size is the larger of the two store sizes; alignment is the larger of the
	/// two preferred alignments reported by the data layout.
	SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
	  unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());

	  // getTypeForEVT is called with the dereferenced context and its results are
	  // held by pointer, matching the getTypeForEVT(*getContext()) call in
	  // expandVAArg.
	  Type *Ty1 = VT1.getTypeForEVT(*getContext());
	  Type *Ty2 = VT2.getTypeForEVT(*getContext());
	  const DataLayout &DL = getDataLayout();
	  unsigned Align =
	      std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));

	  MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
	  int FrameIdx = MFI.CreateStackObject(Bytes, Align, false);
	  return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
	}

	/// Try to fold a comparison of \p N1 and \p N2 under condition \p Cond into
	/// a constant (or UNDEF) of type \p VT.
	///
	/// Handles three situations: conditions that are always true/false, two
	/// integer constants, and two floating-point constants. When only \p N1 is a
	/// floating-point constant, the comparison is rebuilt with the operands
	/// swapped so the constant ends up on the right-hand side, provided the
	/// swapped condition code is legal for the operand type.
	///
	/// \returns the folded value, or an empty SDValue if nothing could be folded.
	SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
	ISD::CondCode Cond, const SDLoc &dl) {
	// These setcc operations always fold.
	switch (Cond) {
	default: break;
	case ISD::SETFALSE:
	case ISD::SETFALSE2: return getConstant(0, dl, VT);
	case ISD::SETTRUE:
	case ISD::SETTRUE2: {
	// "True" is all-ones when the target's boolean contents for N1's type is
	// ZeroOrNegativeOneBooleanContent, and 1 otherwise.
	TargetLowering::BooleanContent Cnt =
	TLI->getBooleanContents(N1->getValueType(0));
	return getConstant(
	Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl,
	VT);
	}

	// The ordered/unordered condition codes below are asserted not to be used
	// with integer operands; no folding happens here.
	case ISD::SETOEQ:
	case ISD::SETOGT:
	case ISD::SETOGE:
	case ISD::SETOLT:
	case ISD::SETOLE:
	case ISD::SETONE:
	case ISD::SETO:
	case ISD::SETUO:
	case ISD::SETUEQ:
	case ISD::SETUNE:
	assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
	break;
	}

	// Both operands are integer constants: evaluate the comparison on their
	// APInt values. If only N2 is a constant, control falls through to the
	// floating-point handling below.
	if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
	const APInt &C2 = N2C->getAPIntValue();
	if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
	const APInt &C1 = N1C->getAPIntValue();

	switch (Cond) {
	default: llvm_unreachable("Unknown integer setcc!");
	case ISD::SETEQ: return getConstant(C1 == C2, dl, VT);
	case ISD::SETNE: return getConstant(C1 != C2, dl, VT);
	case ISD::SETULT: return getConstant(C1.ult(C2), dl, VT);
	case ISD::SETUGT: return getConstant(C1.ugt(C2), dl, VT);
	case ISD::SETULE: return getConstant(C1.ule(C2), dl, VT);
	case ISD::SETUGE: return getConstant(C1.uge(C2), dl, VT);
	case ISD::SETLT: return getConstant(C1.slt(C2), dl, VT);
	case ISD::SETGT: return getConstant(C1.sgt(C2), dl, VT);
	case ISD::SETLE: return getConstant(C1.sle(C2), dl, VT);
	case ISD::SETGE: return getConstant(C1.sge(C2), dl, VT);
	}
	}
	}
	if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1)) {
	if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2)) {
	// Both operands are FP constants. R is one of cmpLessThan, cmpEqual,
	// cmpGreaterThan or cmpUnordered.
	APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
	switch (Cond) {
	default: break;
	// For the codes without an O/U prefix (SETEQ, SETNE, SETLT, SETGT, SETLE,
	// SETGE) an unordered result folds to UNDEF; any other result deliberately
	// falls through to the matching ordered code directly below.
	case ISD::SETEQ: if (R==APFloat::cmpUnordered)
	return getUNDEF(VT);
	LLVM_FALLTHROUGH;
	case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, dl, VT);
	case ISD::SETNE: if (R==APFloat::cmpUnordered)
	return getUNDEF(VT);
	LLVM_FALLTHROUGH;
	case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan \|\|
	R==APFloat::cmpLessThan, dl, VT);
	case ISD::SETLT: if (R==APFloat::cmpUnordered)
	return getUNDEF(VT);
	LLVM_FALLTHROUGH;
	case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, dl, VT);
	case ISD::SETGT: if (R==APFloat::cmpUnordered)
	return getUNDEF(VT);
	LLVM_FALLTHROUGH;
	case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, dl, VT);
	case ISD::SETLE: if (R==APFloat::cmpUnordered)
	return getUNDEF(VT);
	LLVM_FALLTHROUGH;
	case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan \|\|
	R==APFloat::cmpEqual, dl, VT);
	case ISD::SETGE: if (R==APFloat::cmpUnordered)
	return getUNDEF(VT);
	LLVM_FALLTHROUGH;
	case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan \|\|
	R==APFloat::cmpEqual, dl, VT);
	// SETO / SETUO test only whether the operands compared as ordered.
	case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, dl, VT);
	case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, dl, VT);
	// Unordered codes: true when the relation holds or the result is unordered.
	case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered \|\|
	R==APFloat::cmpEqual, dl, VT);
	case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, dl, VT);
	case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered \|\|
	R==APFloat::cmpLessThan, dl, VT);
	case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan \|\|
	R==APFloat::cmpUnordered, dl, VT);
	case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, dl, VT);
	case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, dl, VT);
	}
	} else {
	// Ensure that the constant occurs on the RHS.
	ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
	MVT CompVT = N1.getValueType().getSimpleVT();
	// Give up rather than emit a condition code the target does not accept
	// for this operand type.
	if (!TLI->isCondCodeLegal(SwappedCond, CompVT))
	return SDValue();

	return getSetCC(dl, VT, N2, N1, SwappedCond);
	}
	}

	// Could not fold it.
	return SDValue();
	}

	/// See if the specified operand can be simplified with the knowledge that only
	/// the bits specified by Mask are used.
	///
	/// \param V    the value to simplify.
	/// \param Mask the bits of \p V that are actually demanded.
	/// \returns a replacement value, or an empty SDValue when no simplification
	///          was found for \p V's opcode.
	SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &Mask) {
	  switch (V.getOpcode()) {
	  default:
	    break;
	  case ISD::Constant: {
	    // Clear the constant's non-demanded bits; only report a change if that
	    // actually alters the value.
	    const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode());
	    assert(CV && "Const value should be ConstSDNode.");
	    const APInt &CVal = CV->getAPIntValue();
	    APInt NewVal = CVal & Mask;
	    if (NewVal != CVal)
	      return getConstant(NewVal, SDLoc(V), V.getValueType());
	    break;
	  }
	  case ISD::OR:
	  case ISD::XOR:
	    // If the LHS or RHS don't contribute bits to the or, drop them.
	    if (MaskedValueIsZero(V.getOperand(0), Mask))
	      return V.getOperand(1);
	    if (MaskedValueIsZero(V.getOperand(1), Mask))
	      return V.getOperand(0);
	    break;
	  case ISD::SRL:
	    // Only look at single-use SRLs.
	    if (!V.getNode()->hasOneUse())
	      break;
	    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
	      // Watch out for shift count overflow. The range check is done on the
	      // full-width APInt so that a large shift amount cannot be narrowed to
	      // 'unsigned' first and then slip past the comparison.
	      const APInt &ShAmt = RHSC->getAPIntValue();
	      if (!ShAmt.ult(Mask.getBitWidth()))
	        break;
	      unsigned Amt = ShAmt.getZExtValue();

	      // See if we can recursively simplify the LHS: the bits demanded of the
	      // shifted input are the demanded result bits moved left by Amt.
	      APInt NewMask = Mask << Amt;
	      if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
	        return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS,
	                       V.getOperand(1));
	    }
	    break;
	  case ISD::AND: {
	    // X & -1 -> X (ignoring bits which aren't demanded).
	    ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
	    if (AndVal && Mask.isSubsetOf(AndVal->getAPIntValue()))
	      return V.getOperand(0);
	    break;
	  }
	  case ISD::ANY_EXTEND: {
	    SDValue Src = V.getOperand(0);
	    unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
	    // Being conservative here - only peek through if we only demand bits in the
	    // non-extended source (even though the extended bits are technically undef).
	    if (Mask.getActiveBits() > SrcBitWidth)
	      break;
	    APInt SrcMask = Mask.trunc(SrcBitWidth);
	    if (SDValue DemandedSrc = GetDemandedBits(Src, SrcMask))
	      return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
	    break;
	  }
	  }
	  return SDValue();
	}

	/// Return true if the sign bit of \p Op is known to be zero. This is
	/// MaskedValueIsZero applied with a mask holding only the sign bit of Op's
	/// scalar width.
	bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
	  const APInt SignMask = APInt::getSignMask(Op.getScalarValueSizeInBits());
	  return MaskedValueIsZero(Op, SignMask, Depth);
	}

	/// Return true if 'Op & Mask' is known to be zero, i.e. every bit set in
	/// \p Mask is among the bits that computeKnownBits reports as known-zero
	/// for \p Op.
	bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
	                                     unsigned Depth) const {
	  KnownBits OpBits;
	  computeKnownBits(Op, OpBits, Depth);

	  const APInt &KnownZero = OpBits.Zero;
	  return Mask.isSubsetOf(KnownZero);
	}

	/// Return the constant behind \p N if \p N is either a scalar constant or a
	/// BUILD_VECTOR whose demanded elements (per \p DemandedElts) are all the
	/// same constant with the vector's scalar type. Returns nullptr otherwise,
	/// and also when no element is demanded.
	static ConstantSDNode *isConstOrDemandedConstSplat(SDValue N,
	                                                   const APInt &DemandedElts) {
	  if (auto *Scalar = dyn_cast<ConstantSDNode>(N))
	    return Scalar;
	  if (N.getOpcode() != ISD::BUILD_VECTOR)
	    return nullptr;

	  EVT VecVT = N.getValueType();
	  unsigned NumElts = VecVT.getVectorNumElements();
	  assert(DemandedElts.getBitWidth() == NumElts && "Unexpected vector size");

	  ConstantSDNode *Splat = nullptr;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    if (DemandedElts[Idx]) {
	      // Every demanded element must be a constant...
	      auto *Elt = dyn_cast<ConstantSDNode>(N.getOperand(Idx));
	      if (!Elt)
	        return nullptr;
	      // ...equal to the ones seen so far...
	      if (Splat && Splat->getAPIntValue() != Elt->getAPIntValue())
	        return nullptr;
	      // ...and of exactly the vector's scalar type.
	      if (Elt->getValueType(0) != VecVT.getScalarType())
	        return nullptr;
	      Splat = Elt;
	    }
	  }
	  return Splat;
	}

	/// For a shift node \p V whose operand 1 is a constant or constant splat,
	/// return a pointer to that shift amount when it is strictly less than V's
	/// scalar bit width. Returns nullptr when the amount is not such a constant
	/// or is out of range.
	static const APInt *getValidShiftAmountConstant(SDValue V) {
	  ConstantSDNode *AmtNode = isConstOrConstSplat(V.getOperand(1));
	  if (!AmtNode)
	    return nullptr;

	  // Shifting by the bit width or more is not valid.
	  const APInt &Amt = AmtNode->getAPIntValue();
	  return Amt.ult(V.getScalarValueSizeInBits()) ? &Amt : nullptr;
	}

	/// Compute the bits of \p Op known to be zero or one and return them in
	/// \p Known. This overload demands every element: an all-ones mask with one
	/// bit per element for vector types, or a single set bit for scalars, and
	/// then forwards to the DemandedElts overload.
	void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
	                                    unsigned Depth) const {
	  EVT OpVT = Op.getValueType();

	  // Scalars are treated as a one-element "vector".
	  APInt AllElts(1, 1);
	  if (OpVT.isVector())
	    AllElts = APInt::getAllOnesValue(OpVT.getVectorNumElements());

	  computeKnownBits(Op, Known, AllElts, Depth);
	}

	/// Determine which bits of Op are known to be either zero or one and return
	/// them in Known. The DemandedElts argument allows us to only collect the known
	/// bits that are shared by the requested vector elements.
	void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
	const APInt &DemandedElts,
	unsigned Depth) const {
	unsigned BitWidth = Op.getScalarValueSizeInBits();

	Known = KnownBits(BitWidth); // Don't know anything.

	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	// We know all of the bits for a constant!
	Known.One = C->getAPIntValue();
	Known.Zero = ~Known.One;
	return;
	}
	if (auto *C = dyn_cast<ConstantFPSDNode>(Op)) {
	// We know all of the bits for a constant fp!
	Known.One = C->getValueAPF().bitcastToAPInt();
	Known.Zero = ~Known.One;
	return;
	}

	if (Depth == 6)
	return; // Limit search depth.

	KnownBits Known2;
	unsigned NumElts = DemandedElts.getBitWidth();

	if (!DemandedElts)
	return; // No demanded elts, better to assume we don't know anything.

	unsigned Opcode = Op.getOpcode();
	switch (Opcode) {
	case ISD::BUILD_VECTOR:
	// Collect the known bits that are shared by every demanded vector element.
	assert(NumElts == Op.getValueType().getVectorNumElements() &&
	"Unexpected vector size");
	Known.Zero.setAllBits(); Known.One.setAllBits();
	for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	if (!DemandedElts[i])
	continue;

	SDValue SrcOp = Op.getOperand(i);
	computeKnownBits(SrcOp, Known2, Depth + 1);

	// BUILD_VECTOR can implicitly truncate sources, we must handle this.
	if (SrcOp.getValueSizeInBits() != BitWidth) {
	assert(SrcOp.getValueSizeInBits() > BitWidth &&
	"Expected BUILD_VECTOR implicit truncation");
	Known2 = Known2.trunc(BitWidth);
	}

	// Known bits are the values that are shared by every demanded element.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;

	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	}
	break;
	case ISD::VECTOR_SHUFFLE: {
	// Collect the known bits that are shared by every vector element referenced
	// by the shuffle.
	APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
	Known.Zero.setAllBits(); Known.One.setAllBits();
	const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
	assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;

	int M = SVN->getMaskElt(i);
	if (M < 0) {
	// For UNDEF elements, we don't know anything about the common state of
	// the shuffle result.
	Known.resetAll();
	DemandedLHS.clearAllBits();
	DemandedRHS.clearAllBits();
	break;
	}

	if ((unsigned)M < NumElts)
	DemandedLHS.setBit((unsigned)M % NumElts);
	else
	DemandedRHS.setBit((unsigned)M % NumElts);
	}
	// Known bits are the values that are shared by every demanded element.
	if (!!DemandedLHS) {
	SDValue LHS = Op.getOperand(0);
	computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	if (!!DemandedRHS) {
	SDValue RHS = Op.getOperand(1);
	computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	break;
	}
	case ISD::CONCAT_VECTORS: {
	// Split DemandedElts and test each of the demanded subvectors.
	Known.Zero.setAllBits(); Known.One.setAllBits();
	EVT SubVectorVT = Op.getOperand(0).getValueType();
	unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
	unsigned NumSubVectors = Op.getNumOperands();
	for (unsigned i = 0; i != NumSubVectors; ++i) {
	APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
	DemandedSub = DemandedSub.trunc(NumSubVectorElts);
	if (!!DemandedSub) {
	SDValue Sub = Op.getOperand(i);
	computeKnownBits(Sub, Known2, DemandedSub, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	}
	break;
	}
	case ISD::INSERT_SUBVECTOR: {
	// If we know the element index, demand any elements from the subvector and
	// the remainder from the src its inserted into, otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	SDValue Sub = Op.getOperand(1);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
	Known.One.setAllBits();
	Known.Zero.setAllBits();
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
	if (!!DemandedSubElts) {
	computeKnownBits(Sub, Known, DemandedSubElts, Depth + 1);
	if (Known.isUnknown())
	break; // early-out.
	}
	APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
	APInt DemandedSrcElts = DemandedElts & ~SubMask;
	if (!!DemandedSrcElts) {
	computeKnownBits(Src, Known2, DemandedSrcElts, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	} else {
	computeKnownBits(Sub, Known, Depth + 1);
	if (Known.isUnknown())
	break; // early-out.
	computeKnownBits(Src, Known2, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	break;
	}
	case ISD::EXTRACT_SUBVECTOR: {
	// If we know the element index, just demand that subvector elements,
	// otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
	// Offset the demanded elts by the subvector index.
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
	computeKnownBits(Src, Known, DemandedSrc, Depth + 1);
	} else {
	computeKnownBits(Src, Known, Depth + 1);
	}
	break;
	}
	case ISD::BITCAST: {
	SDValue N0 = Op.getOperand(0);
	EVT SubVT = N0.getValueType();
	unsigned SubBitWidth = SubVT.getScalarSizeInBits();

	// Ignore bitcasts from unsupported types.
	if (!(SubVT.isInteger() \|\| SubVT.isFloatingPoint()))
	break;

	// Fast handling of 'identity' bitcasts.
	if (BitWidth == SubBitWidth) {
	computeKnownBits(N0, Known, DemandedElts, Depth + 1);
	break;
	}

	// Support big-endian targets when it becomes useful.
	bool IsLE = getDataLayout().isLittleEndian();
	if (!IsLE)
	break;

	// Bitcast 'small element' vector to 'large element' scalar/vector.
	if ((BitWidth % SubBitWidth) == 0) {
	assert(N0.getValueType().isVector() && "Expected bitcast from vector");

	// Collect known bits for the (larger) output by collecting the known
	// bits from each set of sub elements and shift these into place.
	// We need to separately call computeKnownBits for each set of
	// sub elements as the knownbits for each is likely to be different.
	unsigned SubScale = BitWidth / SubBitWidth;
	APInt SubDemandedElts(NumElts * SubScale, 0);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SubDemandedElts.setBit(i * SubScale);

	for (unsigned i = 0; i != SubScale; ++i) {
	computeKnownBits(N0, Known2, SubDemandedElts.shl(i),
	Depth + 1);
	Known.One \|= Known2.One.zext(BitWidth).shl(SubBitWidth * i);
	Known.Zero \|= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i);
	}
	}

	// Bitcast 'large element' scalar/vector to 'small element' vector.
	if ((SubBitWidth % BitWidth) == 0) {
	assert(Op.getValueType().isVector() && "Expected bitcast to vector");

	// Collect known bits for the (smaller) output by collecting the known
	// bits from the overlapping larger input elements and extracting the
	// sub sections we actually care about.
	unsigned SubScale = SubBitWidth / BitWidth;
	APInt SubDemandedElts(NumElts / SubScale, 0);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SubDemandedElts.setBit(i / SubScale);

	computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1);

	Known.Zero.setAllBits(); Known.One.setAllBits();
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	unsigned Offset = (i % SubScale) * BitWidth;
	Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
	Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	}
	}
	break;
	}
	case ISD::AND:
	// If either the LHS or the RHS are Zero, the result is zero.
	computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// Output known-1 bits are only known if set in both the LHS & RHS.
	Known.One &= Known2.One;
	// Output known-0 are known to be clear if zero in either the LHS \| RHS.
	Known.Zero \|= Known2.Zero;
	break;
	case ISD::OR:
	computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// Output known-0 bits are only known if clear in both the LHS & RHS.
	Known.Zero &= Known2.Zero;
	// Output known-1 are known to be set if set in either the LHS \| RHS.
	Known.One \|= Known2.One;
	break;
	case ISD::XOR: {
	computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// Output known-0 bits are known if clear or set in both the LHS & RHS.
	APInt KnownZeroOut = (Known.Zero & Known2.Zero) \| (Known.One & Known2.One);
	// Output known-1 are known to be set if set in only one of the LHS, RHS.
	Known.One = (Known.Zero & Known2.One) \| (Known.One & Known2.Zero);
	Known.Zero = KnownZeroOut;
	break;
	}
	case ISD::MUL: {
	computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// If low bits are zero in either operand, output low known-0 bits.
	// Also compute a conservative estimate for high known-0 bits.
	// More trickiness is possible, but this is sufficient for the
	// interesting case of alignment computation.
	unsigned TrailZ = Known.countMinTrailingZeros() +
	Known2.countMinTrailingZeros();
	unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
	Known2.countMinLeadingZeros(),
	BitWidth) - BitWidth;

	Known.resetAll();
	Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
	Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
	break;
	}
	case ISD::UDIV: {
	// For the purposes of computing leading zeros we can conservatively
	// treat a udiv as a logical right shift by the power of 2 known to
	// be less than the denominator.
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	unsigned LeadZ = Known2.countMinLeadingZeros();

	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
	unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
	if (RHSMaxLeadingZeros != BitWidth)
	LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);

	Known.Zero.setHighBits(LeadZ);
	break;
	}
	case ISD::SELECT:
	case ISD::VSELECT:
	computeKnownBits(Op.getOperand(2), Known, DemandedElts, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	case ISD::SELECT_CC:
	computeKnownBits(Op.getOperand(3), Known, DemandedElts, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	computeKnownBits(Op.getOperand(2), Known2, DemandedElts, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	case ISD::SMULO:
	case ISD::UMULO:
	if (Op.getResNo() != 1)
	break;
	// The boolean result conforms to getBooleanContents.
	// If we know the result of a setcc has the top bits zero, use this info.
	// We know that we have an integer-based boolean since these operations
	// are only available for integer.
	if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	case ISD::SETCC:
	// If we know the result of a setcc has the top bits zero, use this info.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	case ISD::SHL:
	if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	unsigned Shift = ShAmt->getZExtValue();
	Known.Zero <<= Shift;
	Known.One <<= Shift;
	// Low bits are known zero.
	Known.Zero.setLowBits(Shift);
	}
	break;
	case ISD::SRL:
	if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	unsigned Shift = ShAmt->getZExtValue();
	Known.Zero.lshrInPlace(Shift);
	Known.One.lshrInPlace(Shift);
	// High bits are known zero.
	Known.Zero.setHighBits(Shift);
	} else if (auto *BV = dyn_cast<BuildVectorSDNode>(Op.getOperand(1))) {
	// If the shift amount is a vector of constants see if we can bound
	// the number of upper zero bits.
	unsigned ShiftAmountMin = BitWidth;
	for (unsigned i = 0; i != BV->getNumOperands(); ++i) {
	if (auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
	const APInt &ShAmt = C->getAPIntValue();
	if (ShAmt.ult(BitWidth)) {
	ShiftAmountMin = std::min<unsigned>(ShiftAmountMin,
	ShAmt.getZExtValue());
	continue;
	}
	}
	// Don't know anything.
	ShiftAmountMin = 0;
	break;
	}

	Known.Zero.setHighBits(ShiftAmountMin);
	}
	break;
	case ISD::SRA:
	if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	unsigned Shift = ShAmt->getZExtValue();
	// Sign extend known zero/one bit (else is unknown).
	Known.Zero.ashrInPlace(Shift);
	Known.One.ashrInPlace(Shift);
	}
	break;
	case ISD::SIGN_EXTEND_INREG: {
	EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	unsigned EBits = EVT.getScalarSizeInBits();

	// Sign extension. Compute the demanded bits in the result that are not
	// present in the input.
	APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);

	APInt InSignMask = APInt::getSignMask(EBits);
	APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);

	// If the sign extended bits are demanded, we know that the sign
	// bit is demanded.
	InSignMask = InSignMask.zext(BitWidth);
	if (NewBits.getBoolValue())
	InputDemandedBits \|= InSignMask;

	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	Known.One &= InputDemandedBits;
	Known.Zero &= InputDemandedBits;

	// If the sign bit of the input is known set or clear, then we know the
	// top bits of the result.
	if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear
	Known.Zero \|= NewBits;
	Known.One &= ~NewBits;
	} else if (Known.One.intersects(InSignMask)) { // Input sign bit known set
	Known.One \|= NewBits;
	Known.Zero &= ~NewBits;
	} else { // Input sign bit unknown
	Known.Zero &= ~NewBits;
	Known.One &= ~NewBits;
	}
	break;
	}
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: {
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	// If we have a known 1, its position is our upper bound.
	unsigned PossibleTZ = Known2.countMaxTrailingZeros();
	unsigned LowBits = Log2_32(PossibleTZ) + 1;
	Known.Zero.setBitsFrom(LowBits);
	break;
	}
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: {
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	// If we have a known 1, its position is our upper bound.
	unsigned PossibleLZ = Known2.countMaxLeadingZeros();
	unsigned LowBits = Log2_32(PossibleLZ) + 1;
	Known.Zero.setBitsFrom(LowBits);
	break;
	}
	case ISD::CTPOP: {
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	// If we know some of the bits are zero, they can't be one.
	unsigned PossibleOnes = Known2.countMaxPopulation();
	Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
	break;
	}
	case ISD::LOAD: {
	LoadSDNode *LD = cast<LoadSDNode>(Op);
	// If this is a ZEXTLoad and we are looking at the loaded value.
	if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
	EVT VT = LD->getMemoryVT();
	unsigned MemBits = VT.getScalarSizeInBits();
	Known.Zero.setBitsFrom(MemBits);
	} else if (const MDNode *Ranges = LD->getRanges()) {
	if (LD->getExtensionType() == ISD::NON_EXTLOAD)
	computeKnownBitsFromRangeMetadata(*Ranges, Known);
	}
	break;
	}
	case ISD::ZERO_EXTEND_VECTOR_INREG: {
	EVT InVT = Op.getOperand(0).getValueType();
	APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements());
	computeKnownBits(Op.getOperand(0), Known, InDemandedElts, Depth + 1);
	Known = Known.zext(BitWidth);
	Known.Zero.setBitsFrom(InVT.getScalarSizeInBits());
	break;
	}
	case ISD::ZERO_EXTEND: {
	EVT InVT = Op.getOperand(0).getValueType();
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	Known = Known.zext(BitWidth);
	Known.Zero.setBitsFrom(InVT.getScalarSizeInBits());
	break;
	}
	// TODO ISD::SIGN_EXTEND_VECTOR_INREG
	case ISD::SIGN_EXTEND: {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	// If the sign bit is known to be zero or one, then sext will extend
	// it to the top bits, else it will just zext.
	Known = Known.sext(BitWidth);
	break;
	}
	case ISD::ANY_EXTEND: {
	computeKnownBits(Op.getOperand(0), Known, Depth+1);
	Known = Known.zext(BitWidth);
	break;
	}
	case ISD::TRUNCATE: {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	Known = Known.trunc(BitWidth);
	break;
	}
	case ISD::AssertZext: {
	EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
	computeKnownBits(Op.getOperand(0), Known, Depth+1);
	Known.Zero \|= (~InMask);
	Known.One &= (~Known.Zero);
	break;
	}
	case ISD::FGETSIGN:
	// All bits are zero except the low bit.
	Known.Zero.setBitsFrom(1);
	break;
	case ISD::USUBO:
	case ISD::SSUBO:
	if (Op.getResNo() == 1) {
	// If we know the result of a setcc has the top bits zero, use this info.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	}
	LLVM_FALLTHROUGH;
	case ISD::SUB:
	case ISD::SUBC: {
	if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {
	// We know that the top bits of C-X are clear if X contains less bits
	// than C (i.e. no wrap-around can happen). For example, 20-X is
	// positive if we can prove that X is >= 0 and < 16.
	if (CLHS->getAPIntValue().isNonNegative()) {
	unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
	// NLZ can't be BitWidth with no sign bit
	APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
	computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
	Depth + 1);

	// If all of the MaskV bits are known to be zero, then we know the
	// output top bits are zero, because we now know that the output is
	// from [0-C].
	if ((Known2.Zero & MaskV) == MaskV) {
	unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
	// Top bits known zero.
	Known.Zero.setHighBits(NLZ2);
	}
	}
	}

	// If low bits are know to be zero in both operands, then we know they are
	// going to be 0 in the result. Both addition and complement operations
	// preserve the low zero bits.
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	unsigned KnownZeroLow = Known2.countMinTrailingZeros();
	if (KnownZeroLow == 0)
	break;

	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
	KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
	Known.Zero.setLowBits(KnownZeroLow);
	break;
	}
	case ISD::UADDO:
	case ISD::SADDO:
	case ISD::ADDCARRY:
	if (Op.getResNo() == 1) {
	// If we know the result of a setcc has the top bits zero, use this info.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	}
	LLVM_FALLTHROUGH;
	case ISD::ADD:
	case ISD::ADDC:
	case ISD::ADDE: {
	// Output known-0 bits are known if clear or set in both the low clear bits
	// common to both LHS & RHS. For example, 8+(X<<3) is known to have the
	// low 3 bits clear.
	// Output known-0 bits are also known if the top bits of each input are
	// known to be clear. For example, if one input has the top 10 bits clear
	// and the other has the top 8 bits clear, we know the top 7 bits of the
	// output must be clear.
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
	unsigned KnownZeroLow = Known2.countMinTrailingZeros();

	computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
	Depth + 1);
	KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
	KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());

	if (Opcode == ISD::ADDE \|\| Opcode == ISD::ADDCARRY) {
	// With ADDE and ADDCARRY, a carry bit may be added in, so we can only
	// use this information if we know (at least) that the low two bits are
	// clear. We then return to the caller that the low bit is unknown but
	// that other bits are known zero.
	if (KnownZeroLow >= 2)
	Known.Zero.setBits(1, KnownZeroLow);
	break;
	}

	Known.Zero.setLowBits(KnownZeroLow);
	if (KnownZeroHigh > 1)
	Known.Zero.setHighBits(KnownZeroHigh - 1);
	break;
	}
	case ISD::SREM:
	if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
	const APInt &RA = Rem->getAPIntValue().abs();
	if (RA.isPowerOf2()) {
	APInt LowBits = RA - 1;
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// The low bits of the first operand are unchanged by the srem.
	Known.Zero = Known2.Zero & LowBits;
	Known.One = Known2.One & LowBits;

	// If the first operand is non-negative or has all low bits zero, then
	// the upper bits are all zero.
	if (Known2.Zero[BitWidth-1] \|\| ((Known2.Zero & LowBits) == LowBits))
	Known.Zero \|= ~LowBits;

	// If the first operand is negative and not all low bits are zero, then
	// the upper bits are all one.
	if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
	Known.One \|= ~LowBits;
	assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?");
	}
	}
	break;
	case ISD::UREM: {
	if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
	const APInt &RA = Rem->getAPIntValue();
	if (RA.isPowerOf2()) {
	APInt LowBits = (RA - 1);
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// The upper bits are all zero, the lower ones are unchanged.
	Known.Zero = Known2.Zero \| ~LowBits;
	Known.One = Known2.One & LowBits;
	break;
	}
	}

	// Since the result is less than or equal to either operand, any leading
	// zero bits in either operand must also exist in the result.
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);

	uint32_t Leaders =
	std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
	Known.resetAll();
	Known.Zero.setHighBits(Leaders);
	break;
	}
	case ISD::EXTRACT_ELEMENT: {
	computeKnownBits(Op.getOperand(0), Known, Depth+1);
	const unsigned Index = Op.getConstantOperandVal(1);
	const unsigned BitWidth = Op.getValueSizeInBits();

	// Remove low part of known bits mask
	Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth);
	Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth);

	// Remove high part of known bit mask
	Known = Known.trunc(BitWidth);
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue EltNo = Op.getOperand(1);
	EVT VecVT = InVec.getValueType();
	const unsigned BitWidth = Op.getValueSizeInBits();
	const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
	const unsigned NumSrcElts = VecVT.getVectorNumElements();
	// If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
	// anything about the extended bits.
	if (BitWidth > EltBitWidth)
	Known = Known.trunc(EltBitWidth);
	ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
	// If we know the element index, just demand that vector element.
	unsigned Idx = ConstEltNo->getZExtValue();
	APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
	computeKnownBits(InVec, Known, DemandedElt, Depth + 1);
	} else {
	// Unknown element index, so ignore DemandedElts and demand them all.
	computeKnownBits(InVec, Known, Depth + 1);
	}
	if (BitWidth > EltBitWidth)
	Known = Known.zext(BitWidth);
	break;
	}
	case ISD::INSERT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue InVal = Op.getOperand(1);
	SDValue EltNo = Op.getOperand(2);

	ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
	// If we know the element index, split the demand between the
	// source vector and the inserted element.
	Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
	unsigned EltIdx = CEltNo->getZExtValue();

	// If we demand the inserted element then add its common known bits.
	if (DemandedElts[EltIdx]) {
	computeKnownBits(InVal, Known2, Depth + 1);
	Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
	Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
	}

	// If we demand the source vector then add its common known bits, ensuring
	// that we don't demand the inserted element.
	APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
	if (!!VectorElts) {
	computeKnownBits(InVec, Known2, VectorElts, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	} else {
	// Unknown element index, so ignore DemandedElts and demand them all.
	computeKnownBits(InVec, Known, Depth + 1);
	computeKnownBits(InVal, Known2, Depth + 1);
	Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
	Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
	}
	break;
	}
	case ISD::BITREVERSE: {
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	Known.Zero = Known2.Zero.reverseBits();
	Known.One = Known2.One.reverseBits();
	break;
	}
	case ISD::BSWAP: {
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
	Known.Zero = Known2.Zero.byteSwap();
	Known.One = Known2.One.byteSwap();
	break;
	}
	case ISD::ABS: {
	computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

	// If the source's MSB is zero then we know the rest of the bits already.
	if (Known2.isNonNegative()) {
	Known.Zero = Known2.Zero;
	Known.One = Known2.One;
	break;
	}

	// We only know that the absolute values's MSB will be zero iff there is
	// a set bit that isn't the sign bit (otherwise it could be INT_MIN).
	Known2.One.clearSignBit();
	if (Known2.One.getBoolValue()) {
	Known.Zero = APInt::getSignMask(BitWidth);
	break;
	}
	break;
	}
	case ISD::UMIN: {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);

	// UMIN - we know that the result will have the maximum of the
	// known zero leading bits of the inputs.
	unsigned LeadZero = Known.countMinLeadingZeros();
	LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());

	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	Known.Zero.setHighBits(LeadZero);
	break;
	}
	case ISD::UMAX: {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts,
	Depth + 1);
	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);

	// UMAX - we know that the result will have the maximum of the
	// known one leading bits of the inputs.
	unsigned LeadOne = Known.countMinLeadingOnes();
	LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());

	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	Known.One.setHighBits(LeadOne);
	break;
	}
	case ISD::SMIN:
	case ISD::SMAX: {
	computeKnownBits(Op.getOperand(0), Known, DemandedElts,
	Depth + 1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	break;
	}
	case ISD::FrameIndex:
	case ISD::TargetFrameIndex:
	TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
	break;

	default:
	if (Opcode < ISD::BUILTIN_OP_END)
	break;
	LLVM_FALLTHROUGH;
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_W_CHAIN:
	case ISD::INTRINSIC_VOID:
	// Allow the target to implement this method for its nodes.
	TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
	break;
	}

	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	}

	/// Classify whether the unsigned addition N0 + N1 can overflow, using only
	/// what the known-bits analysis can prove. Returns OFK_Never when overflow is
	/// provably impossible and OFK_Sometime otherwise.
	SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
	                                                             SDValue N1) const {
	  // Adding zero cannot overflow.
	  if (isNullConstant(N1))
	    return OFK_Never;

	  // True when no bit other than bit 0 may be set, i.e. the value is 0 or 1.
	  auto IsZeroOrOne = [](const KnownBits &KB) {
	    const APInt MaybeSet = ~KB.Zero;
	    return (MaybeSet & 0x01) == MaybeSet;
	  };

	  KnownBits RHSKnown;
	  computeKnownBits(N1, RHSKnown);

	  // Only bother with the LHS when something is known about the RHS. The
	  // complement of the known-zero mask is the largest value each operand can
	  // take; if those maxima add without carry-out, nothing can overflow.
	  if (RHSKnown.Zero.getBoolValue()) {
	    KnownBits LHSKnown;
	    computeKnownBits(N0, LHSKnown);

	    bool CarriedOut;
	    (void)(~LHSKnown.Zero).uadd_ov(~RHSKnown.Zero, CarriedOut);
	    if (!CarriedOut)
	      return OFK_Never;
	  }

	  // The high result of UMUL_LOHI (result number 1) plus a value that is at
	  // most one never overflows.
	  const bool LHSIsMulHi =
	      N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1;
	  if (LHSIsMulHi && IsZeroOrOne(RHSKnown))
	    return OFK_Never;

	  // Same reasoning with the operands swapped.
	  const bool RHSIsMulHi =
	      N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1;
	  if (RHSIsMulHi) {
	    KnownBits LHSKnown;
	    computeKnownBits(N0, LHSKnown);
	    if (IsZeroOrOne(LHSKnown))
	      return OFK_Never;
	  }

	  return OFK_Sometime;
	}

	/// Return true if Val can be proven to have exactly one bit set. A handful of
	/// cheap structural patterns are tried first; computeKnownBits is the
	/// fallback.
	bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
	  const unsigned EltBits = Val.getValueType().getScalarSizeInBits();

	  // Power-of-two test on a constant node, after resizing it to the scalar
	  // width of Val's type.
	  auto IsPow2Const = [EltBits](const ConstantSDNode *CN) {
	    return CN->getAPIntValue().zextOrTrunc(EltBits).isPowerOf2();
	  };

	  // A plain constant is decided directly.
	  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val))
	    return IsPow2Const(CN);

	  switch (Val.getOpcode()) {
	  case ISD::SHL:
	    // Shifting a constant one left leaves exactly one bit set, because
	    // shifting the bit off the end is undefined.
	    if (auto *Base = isConstOrConstSplat(Val.getOperand(0)))
	      if (Base->getAPIntValue() == 1)
	        return true;
	    break;

	  case ISD::SRL:
	    // Likewise for a logical right shift of a constant sign-bit mask.
	    if (auto *Base = isConstOrConstSplat(Val.getOperand(0)))
	      if (Base->getAPIntValue().isSignMask())
	        return true;
	    break;

	  case ISD::BUILD_VECTOR: {
	    // Every operand must be a constant power of two.
	    bool EveryEltIsPow2 = true;
	    for (SDValue Elt : Val->ops()) {
	      ConstantSDNode *EltC = dyn_cast<ConstantSDNode>(Elt);
	      if (!EltC || !IsPow2Const(EltC)) {
	        EveryEltIsPow2 = false;
	        break;
	      }
	    }
	    if (EveryEltIsPow2)
	      return true;
	    break;
	  }

	  default:
	    break;
	  }

	  // More patterns could be recognised above; these cover some common cases.

	  // Fallback: exactly one bit may be set and at least one bit must be set.
	  KnownBits Known;
	  computeKnownBits(Val, Known);
	  if (Known.countMaxPopulation() != 1)
	    return false;
	  return Known.countMinPopulation() == 1;
	}

	/// Convenience overload: demand every element of a vector Op (or the single
	/// lane of a scalar) and forward to the DemandedElts overload.
	unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
	  const EVT OpVT = Op.getValueType();

	  // Scalars are modelled as a one-element mask with that element demanded.
	  if (!OpVT.isVector())
	    return ComputeNumSignBits(Op, APInt(1, 1), Depth);

	  const APInt AllElts = APInt::getAllOnesValue(OpVT.getVectorNumElements());
	  return ComputeNumSignBits(Op, AllElts, Depth);
	}

	/// Compute a lower bound on the number of high-order bits of Op that are
	/// known to be copies of its sign bit, considering only the vector elements
	/// selected by DemandedElts (one mask bit per element; the scalar overload
	/// passes a one-bit mask).
	///
	/// \param Op            The value to analyse; must be integer or floating
	///                      point (asserted).
	/// \param DemandedElts  Mask of the elements whose sign bits matter.
	/// \param Depth         Current recursion depth; the search stops at 6.
	/// \returns The number of known sign bits. The "nothing known" answer is 1.
	///
	/// Structure: a switch handles opcodes with dedicated rules (many return
	/// directly); anything that 'break's out of the switch falls through to the
	/// extending-load check, the target hook, and finally a computeKnownBits
	/// based estimate that is combined with FirstAnswer.
	unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
	unsigned Depth) const {
	EVT VT = Op.getValueType();
	assert((VT.isInteger() \|\| VT.isFloatingPoint()) && "Invalid VT!");
	unsigned VTBits = VT.getScalarSizeInBits();
	unsigned NumElts = DemandedElts.getBitWidth();
	unsigned Tmp, Tmp2;
	// Best answer found so far by opcode-specific logic; the generic tail may
	// improve on it but never returns less.
	unsigned FirstAnswer = 1;

	// Constants are answered exactly from their value.
	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	const APInt &Val = C->getAPIntValue();
	return Val.getNumSignBits();
	}

	if (Depth == 6)
	return 1; // Limit search depth.

	if (!DemandedElts)
	return 1; // No demanded elts, better to assume we don't know anything.

	switch (Op.getOpcode()) {
	default: break;
	case ISD::AssertSext:
	// Tmp is the bit size of the VT carried in operand 1.
	Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
	return VTBits-Tmp+1;
	case ISD::AssertZext:
	// Same as above but without the +1.
	Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
	return VTBits-Tmp;

	case ISD::BUILD_VECTOR:
	// Minimum over all demanded operands; the loop stops early once the
	// running minimum reaches 1.
	Tmp = VTBits;
	for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
	if (!DemandedElts[i])
	continue;

	SDValue SrcOp = Op.getOperand(i);
	Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);

	// BUILD_VECTOR can implicitly truncate sources, we must handle this.
	if (SrcOp.getValueSizeInBits() != VTBits) {
	assert(SrcOp.getValueSizeInBits() > VTBits &&
	"Expected BUILD_VECTOR implicit truncation");
	// The truncated-away high bits consume that many sign bits; if that
	// uses them all up, fall back to 1.
	unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
	Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
	}
	Tmp = std::min(Tmp, Tmp2);
	}
	return Tmp;

	case ISD::VECTOR_SHUFFLE: {
	// Collect the minimum number of sign bits that are shared by every vector
	// element referenced by the shuffle.
	APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
	const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
	assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
	for (unsigned i = 0; i != NumElts; ++i) {
	int M = SVN->getMaskElt(i);
	if (!DemandedElts[i])
	continue;
	// For UNDEF elements, we don't know anything about the common state of
	// the shuffle result.
	if (M < 0)
	return 1;
	// Mask indices below NumElts select from operand 0, the rest from
	// operand 1; the modulo maps the index into that operand.
	if ((unsigned)M < NumElts)
	DemandedLHS.setBit((unsigned)M % NumElts);
	else
	DemandedRHS.setBit((unsigned)M % NumElts);
	}
	// Start from "unbounded" so std::min picks up whichever sides are used.
	Tmp = std::numeric_limits<unsigned>::max();
	if (!!DemandedLHS)
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
	if (!!DemandedRHS) {
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	// If we don't know anything, early out and try computeKnownBits fall-back.
	if (Tmp == 1)
	break;
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}

	case ISD::BITCAST: {
	SDValue N0 = Op.getOperand(0);
	EVT SrcVT = N0.getValueType();
	unsigned SrcBits = SrcVT.getScalarSizeInBits();

	// Ignore bitcasts from unsupported types..
	if (!(SrcVT.isInteger() \|\| SrcVT.isFloatingPoint()))
	break;

	// Fast handling of 'identity' bitcasts.
	if (VTBits == SrcBits)
	return ComputeNumSignBits(N0, DemandedElts, Depth + 1);

	// Bitcast 'large element' scalar/vector to 'small element' vector.
	// TODO: Handle cases other than 'sign splat' when we have a use case.
	// Requires handling of DemandedElts and Endianness.
	if ((SrcBits % VTBits) == 0) {
	assert(Op.getValueType().isVector() && "Expected bitcast to vector");
	Tmp = ComputeNumSignBits(N0, Depth + 1);
	// Only the case where every source bit is a sign bit is handled: then
	// every narrow element is all sign bits too.
	if (Tmp == SrcBits)
	return VTBits;
	}
	break;
	}

	case ISD::SIGN_EXTEND:
	// The extension adds (dest width - source width) sign bits on top of
	// whatever the source already had.
	Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
	return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp;
	case ISD::SIGN_EXTEND_INREG:
	// Max of the input and what this extends.
	Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
	Tmp = VTBits-Tmp+1;
	Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	return std::max(Tmp, Tmp2);
	case ISD::SIGN_EXTEND_VECTOR_INREG: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	// Widen the demanded mask to the source's element count before recursing.
	APInt DemandedSrcElts = DemandedElts.zext(SrcVT.getVectorNumElements());
	Tmp = VTBits - SrcVT.getScalarSizeInBits();
	return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp;
	}

	case ISD::SRA:
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	// SRA X, C -> adds C sign bits.
	if (ConstantSDNode *C =
	isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)) {
	APInt ShiftVal = C->getAPIntValue();
	ShiftVal += Tmp;
	// Saturate at the element width.
	Tmp = ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
	}
	return Tmp;
	case ISD::SHL:
	if (ConstantSDNode *C =
	isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)) {
	// shl destroys sign bits.
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	if (C->getAPIntValue().uge(VTBits) \|\| // Bad shift.
	C->getAPIntValue().uge(Tmp)) break; // Shifted all sign bits out.
	return Tmp - C->getZExtValue();
	}
	break;
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: // NOT is handled here.
	// Logical binary ops preserve the number of sign bits at the worst.
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	if (Tmp != 1) {
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
	FirstAnswer = std::min(Tmp, Tmp2);
	// We computed what we know about the sign bits as our first
	// answer. Now proceed to the generic code that uses
	// computeKnownBits, and pick whichever answer is better.
	}
	break;

	case ISD::SELECT:
	case ISD::VSELECT:
	// Result is one of operands 1 and 2, so take the weaker of the two.
	Tmp = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
	if (Tmp == 1) return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
	return std::min(Tmp, Tmp2);
	case ISD::SELECT_CC:
	// Same idea; here the selected values are operands 2 and 3.
	Tmp = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
	if (Tmp == 1) return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth+1);
	return std::min(Tmp, Tmp2);

	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX:
	// Note: these recursive calls use the overload without DemandedElts, so
	// all elements of each operand are considered.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	if (Tmp == 1)
	return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	return std::min(Tmp, Tmp2);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO:
	// Only result number 1 (the overflow flag) gets special treatment.
	if (Op.getResNo() != 1)
	break;
	// The boolean result conforms to getBooleanContents. Fall through.
	// If setcc returns 0/-1, all bits are sign bits.
	// We know that we have an integer-based boolean since these operations
	// are only available for integer.
	if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent)
	return VTBits;
	break;
	case ISD::SETCC:
	// If setcc returns 0/-1, all bits are sign bits.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent)
	return VTBits;
	break;
	case ISD::ROTL:
	case ISD::ROTR:
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	unsigned RotAmt = C->getAPIntValue().urem(VTBits);

	// Handle rotate right by N like a rotate left by 32-N.
	if (Op.getOpcode() == ISD::ROTR)
	RotAmt = (VTBits - RotAmt) % VTBits;

	// If we aren't rotating out all of the known-in sign bits, return the
	// number that are left. This handles rotl(sext(x), 1) for example.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt);
	}
	break;
	case ISD::ADD:
	case ISD::ADDC:
	// Add can have at most one carry bit. Thus we know that the output
	// is, at worst, one more bit than the inputs.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	if (Tmp == 1) return 1; // Early out.

	// Special case decrementing a value (ADD X, -1):
	if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
	if (CRHS->isAllOnesValue()) {
	KnownBits Known;
	computeKnownBits(Op.getOperand(0), Known, Depth+1);

	// If the input is known to be 0 or 1, the output is 0/-1, which is all
	// sign bits set.
	if ((Known.Zero \| 1).isAllOnesValue())
	return VTBits;

	// If we are subtracting one from a positive number, there is no carry
	// out of the result.
	if (Known.isNonNegative())
	return Tmp;
	}

	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
	if (Tmp2 == 1) return 1;
	// Both Tmp and Tmp2 are at least 2 here, so the subtraction cannot
	// produce 0.
	return std::min(Tmp, Tmp2)-1;

	case ISD::SUB:
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
	if (Tmp2 == 1) return 1;

	// Handle NEG.
	if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
	if (CLHS->isNullValue()) {
	KnownBits Known;
	computeKnownBits(Op.getOperand(1), Known, Depth+1);
	// If the input is known to be 0 or 1, the output is 0/-1, which is all
	// sign bits set.
	if ((Known.Zero \| 1).isAllOnesValue())
	return VTBits;

	// If the input is known to be positive (the sign bit is known clear),
	// the output of the NEG has the same number of sign bits as the input.
	if (Known.isNonNegative())
	return Tmp2;

	// Otherwise, we treat this like a SUB.
	}

	// Sub can have at most one carry bit. Thus we know that the output
	// is, at worst, one more bit than the inputs.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	if (Tmp == 1) return 1; // Early out.
	return std::min(Tmp, Tmp2)-1;
	case ISD::TRUNCATE: {
	// Check if the sign bits of source go down as far as the truncated value.
	unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
	unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	// Sign bits that lie in the discarded high part are lost; whatever
	// remains carries over to the result.
	if (NumSrcSignBits > (NumSrcBits - VTBits))
	return NumSrcSignBits - (NumSrcBits - VTBits);
	break;
	}
	case ISD::EXTRACT_ELEMENT: {
	const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	const int BitWidth = Op.getValueSizeInBits();
	const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;

	// Get reverse index (starting from 1), Op1 value indexes elements from
	// little end. Sign starts at big end.
	const int rIndex = Items - 1 - Op.getConstantOperandVal(1);

	// If the sign portion ends in our element the subtraction gives correct
	// result. Otherwise it gives either negative or > bitwidth result
	// NOTE(review): the clamp's lower bound is 0, whereas the other paths in
	// this function return at least 1 — confirm callers tolerate 0.
	return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
	}
	case ISD::INSERT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue InVal = Op.getOperand(1);
	SDValue EltNo = Op.getOperand(2);
	// Shadows the outer NumElts with the source vector's element count.
	unsigned NumElts = InVec.getValueType().getVectorNumElements();

	ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
	// If we know the element index, split the demand between the
	// source vector and the inserted element.
	unsigned EltIdx = CEltNo->getZExtValue();

	// If we demand the inserted element then get its sign bits.
	Tmp = std::numeric_limits<unsigned>::max();
	if (DemandedElts[EltIdx]) {
	// TODO - handle implicit truncation of inserted elements.
	if (InVal.getScalarValueSizeInBits() != VTBits)
	break;
	Tmp = ComputeNumSignBits(InVal, Depth + 1);
	}

	// If we demand the source vector then get its sign bits, and determine
	// the minimum.
	APInt VectorElts = DemandedElts;
	VectorElts.clearBit(EltIdx);
	if (!!VectorElts) {
	Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	} else {
	// Unknown element index, so ignore DemandedElts and demand them all.
	Tmp = ComputeNumSignBits(InVec, Depth + 1);
	Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue EltNo = Op.getOperand(1);
	EVT VecVT = InVec.getValueType();
	const unsigned BitWidth = Op.getValueSizeInBits();
	const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
	const unsigned NumSrcElts = VecVT.getVectorNumElements();

	// If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
	// anything about sign bits. But if the sizes match we can derive knowledge
	// about sign bits from the vector operand.
	if (BitWidth != EltBitWidth)
	break;

	// If we know the element index, just demand that vector element, else for
	// an unknown element index, ignore DemandedElts and demand them all.
	APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
	ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
	DemandedSrcElts =
	APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());

	return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
	}
	case ISD::EXTRACT_SUBVECTOR: {
	// If we know the element index, just demand that subvector elements,
	// otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	// The index is accepted only if the whole subvector fits in the source.
	if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
	// Offset the demanded elts by the subvector index.
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
	return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
	}
	return ComputeNumSignBits(Src, Depth + 1);
	}
	case ISD::CONCAT_VECTORS:
	// Determine the minimum number of sign bits across all demanded
	// elts of the input vectors. Early out if the result is already 1.
	Tmp = std::numeric_limits<unsigned>::max();
	EVT SubVectorVT = Op.getOperand(0).getValueType();
	unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
	unsigned NumSubVectors = Op.getNumOperands();
	for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
	// Slice out the part of the demanded mask that covers operand i.
	APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
	DemandedSub = DemandedSub.trunc(NumSubVectorElts);
	if (!DemandedSub)
	continue;
	Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}

	// If we are looking at the loaded value of the SDNode.
	if (Op.getResNo() == 0) {
	// Handle LOADX separately here. EXTLOAD case will fallthrough.
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
	unsigned ExtType = LD->getExtensionType();
	switch (ExtType) {
	default: break;
	case ISD::SEXTLOAD: // '17' bits known
	Tmp = LD->getMemoryVT().getScalarSizeInBits();
	return VTBits-Tmp+1;
	case ISD::ZEXTLOAD: // '16' bits known
	Tmp = LD->getMemoryVT().getScalarSizeInBits();
	return VTBits-Tmp;
	}
	}
	}

	// Allow the target to implement this method for its nodes.
	if (Op.getOpcode() >= ISD::BUILTIN_OP_END \|\|
	Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN \|\|
	Op.getOpcode() == ISD::INTRINSIC_W_CHAIN \|\|
	Op.getOpcode() == ISD::INTRINSIC_VOID) {
	unsigned NumBits =
	TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
	if (NumBits > 1)
	FirstAnswer = std::max(FirstAnswer, NumBits);
	}

	// Finally, if we can prove that the top bits of the result are 0's or 1's,
	// use this information.
	KnownBits Known;
	computeKnownBits(Op, Known, DemandedElts, Depth);

	// Mask will hold the known bits that match the (known) sign bit.
	APInt Mask;
	if (Known.isNonNegative()) { // sign bit is 0
	Mask = Known.Zero;
	} else if (Known.isNegative()) { // sign bit is 1;
	Mask = Known.One;
	} else {
	// Nothing known.
	return FirstAnswer;
	}

	// Okay, we know that the sign bit in Mask is set. Use CLZ to determine
	// the number of identical bits in the top of the input value.
	Mask = ~Mask;
	Mask <<= Mask.getBitWidth()-VTBits;
	// Return # leading zeros. We use 'min' here in case Val was zero before
	// shifting. We don't want to return '64' as for an i32 "0".
	return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
	}

	/// Return true if Op has the shape "base + constant": either an ADD whose
	/// second operand is a constant, or an OR with a constant second operand
	/// whose set bits are all known zero in the first operand.
	bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
	  const unsigned Opc = Op.getOpcode();
	  const bool IsAdd = Opc == ISD::ADD;
	  const bool IsOr = Opc == ISD::OR;
	  if (!IsAdd && !IsOr)
	    return false;

	  // The offset must be a constant node.
	  ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!Offset)
	    return false;

	  // An ADD needs nothing more.
	  if (IsAdd)
	    return true;

	  // For OR, the base must be known zero wherever the constant has a bit set.
	  return MaskedValueIsZero(Op.getOperand(0), Offset->getAPIntValue());
	}

	/// Return true if Op is known never to be a NaN.
	bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
	  // Either the global no-NaNs option or the node's own flag settles it.
	  if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
	    return true;

	  // A floating-point constant can simply be inspected.
	  // TODO: Recognize more cases here.
	  const ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(Op);
	  return FPConst && !FPConst->getValueAPF().isNaN();
	}

	/// Return true if Op is known never to be zero.
	bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
	  // A floating-point constant can be inspected directly.
	  if (const ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(Op))
	    return !FPConst->isZero();

	  // OR with a non-zero constant second operand can never be zero.
	  // TODO: Recognize more cases here.
	  if (Op.getOpcode() == ISD::OR) {
	    const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	    if (RHS)
	      return !RHS->isNullValue();
	  }

	  return false;
	}

	/// Returns true if \p A and \p B are known to be equal: either they are the
	/// same value, or both are floating point constants that are zero
	/// (positive and negative zero are treated as equal).
	bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
	  // Identical values are trivially equal.
	  if (A == B)
	    return true;

	  // Check for negative and positive zero.
	  const auto *LHS = dyn_cast<ConstantFPSDNode>(A);
	  const auto *RHS = dyn_cast<ConstantFPSDNode>(B);

	  // Anything else may or may not be equal; answer conservatively.
	  return LHS && RHS && LHS->isZero() && RHS->isZero();
	}

	/// Returns true if, for every bit position, computeKnownBits proves the bit
	/// to be zero in at least one of \p A and \p B. Both values must have the
	/// same type.
	bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
	  assert(A.getValueType() == B.getValueType() &&
	         "Values must have the same type");

	  KnownBits KnownA;
	  computeKnownBits(A, KnownA);

	  KnownBits KnownB;
	  computeKnownBits(B, KnownB);

	  // Every bit must be known-zero on at least one side.
	  const APInt ZeroInEither = KnownA.Zero | KnownB.Zero;
	  return ZeroInEither.isAllOnesValue();
	}

	/// Tries to simplify a CONCAT_VECTORS of \p Ops producing type \p VT.
	///
	/// A single operand is returned unchanged, a concatenation of only UNDEF
	/// operands becomes UNDEF, and a concatenation of only UNDEF/BUILD_VECTOR
	/// operands becomes one BUILD_VECTOR. Returns an empty SDValue when any
	/// operand is of another kind.
	static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
	                                  ArrayRef<SDValue> Ops,
	                                  SelectionDAG &DAG) {
	  assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
	  assert(llvm::all_of(Ops,
	                      [Ops](SDValue Op) {
	                        return Ops[0].getValueType() == Op.getValueType();
	                      }) &&
	         "Concatenation of vectors with inconsistent value types!");
	  assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) ==
	             VT.getVectorNumElements() &&
	         "Incorrect element count in vector concatenation!");

	  // Concatenating one vector is a no-op.
	  if (Ops.size() == 1)
	    return Ops[0];

	  // Concat of UNDEFs is UNDEF.
	  const bool AllUndef =
	      llvm::all_of(Ops, [](SDValue Part) { return Part.isUndef(); });
	  if (AllUndef)
	    return DAG.getUNDEF(VT);

	  // Gather the scalar elements of every operand; give up on anything that
	  // is neither UNDEF nor a BUILD_VECTOR.
	  // FIXME: Add support for SCALAR_TO_VECTOR as well.
	  const EVT EltVT = VT.getScalarType();
	  SmallVector<SDValue, 16> Scalars;
	  for (SDValue Part : Ops) {
	    if (Part.isUndef()) {
	      const EVT PartVT = Part.getValueType();
	      Scalars.append(PartVT.getVectorNumElements(), DAG.getUNDEF(EltVT));
	      continue;
	    }
	    if (Part.getOpcode() != ISD::BUILD_VECTOR)
	      return SDValue();
	    Scalars.append(Part->op_begin(), Part->op_end());
	  }

	  // BUILD_VECTOR requires all inputs to share one type, so find the widest
	  // scalar type among the collected elements.
	  EVT WidestVT = EltVT;
	  for (SDValue Scalar : Scalars) {
	    const EVT ScalarVT = Scalar.getValueType();
	    if (WidestVT.bitsLT(ScalarVT))
	      WidestVT = ScalarVT;
	  }

	  // If some element is wider than the result's scalar type, bring every
	  // element to that width, using zero-extension when the target reports it
	  // as free and sign-extension otherwise.
	  if (WidestVT.bitsGT(EltVT)) {
	    for (SDValue &Scalar : Scalars) {
	      if (DAG.getTargetLoweringInfo().isZExtFree(Scalar.getValueType(),
	                                                 WidestVT))
	        Scalar = DAG.getZExtOrTrunc(Scalar, DL, WidestVT);
	      else
	        Scalar = DAG.getSExtOrTrunc(Scalar, DL, WidestVT);
	    }
	  }

	  SDValue Result = DAG.getBuildVector(VT, DL, Scalars);
	  NewSDValueDbgMsg(Result, "New node fold concat vectors: ", &DAG);
	  return Result;
	}

	/// Gets or creates the operand-less node with the given opcode and result
	/// type. An existing node found through FindNodeOrInsertPos is reused;
	/// otherwise a new one is allocated and recorded in the CSE map.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
	  SDVTList VTs = getVTList(VT);

	  // Look for an already existing, identical node first.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opcode, VTs, None);
	  void *InsertPos = nullptr;
	  if (SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos))
	    return SDValue(Existing, 0);

	  // Nothing found: create the node and remember it.
	  auto *Created =
	      newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	  CSEMap.InsertNode(Created, InsertPos);
	  InsertNode(Created);

	  SDValue Result(Created, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Gets or creates a node with a single operand.
	///
	/// Before creating anything this tries, in order: constant folding of an
	/// integer constant operand, constant folding of a floating point constant
	/// operand, per-lane folding of a constant build vector operand, and a set
	/// of opcode-specific simplifications (which also assert that the operand
	/// and result types are valid for the opcode). If none of these returns a
	/// value, the node is looked up in / added to the CSE map, unless \p VT is
	/// MVT::Glue.
	///
	/// \param Opcode  Opcode of the node to build.
	/// \param DL      Location information attached to any newly created node.
	/// \param VT      Result type of the node.
	/// \param Operand The single operand.
	/// \param Flags   Flags set on a newly created node, or intersected with
	///                the flags of an existing node that is reused.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	SDValue Operand, const SDNodeFlags Flags) {
	// Constant fold unary operations with an integer constant operand. Even
	// opaque constant will be folded, because the folding of unary operations
	// doesn't create new constants with different values. Nevertheless, the
	// opaque flag is preserved during folding to prevent future folding with
	// other constants.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
	const APInt &Val = C->getAPIntValue();
	switch (Opcode) {
	default: break;
	case ISD::SIGN_EXTEND:
	return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
	C->isTargetOpcode(), C->isOpaque());
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::TRUNCATE:
	return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
	C->isTargetOpcode(), C->isOpaque());
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP: {
	APFloat apf(EVTToAPFloatSemantics(VT),
	APInt::getNullValue(VT.getSizeInBits()));
	// The conversion status is deliberately discarded.
	(void)apf.convertFromAPInt(Val,
	Opcode==ISD::SINT_TO_FP,
	APFloat::rmNearestTiesToEven);
	return getConstantFP(apf, DL, VT);
	}
	case ISD::BITCAST:
	// Only same-width integer -> IEEE float pairs are folded here.
	if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
	return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT);
	if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
	return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT);
	if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
	return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT);
	if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
	return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
	break;
	case ISD::ABS:
	return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::BITREVERSE:
	return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::BSWAP:
	return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::CTPOP:
	return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::FP16_TO_FP: {
	bool Ignored;
	// Only the low 16 bits of the constant are interpreted as the half value.
	APFloat FPV(APFloat::IEEEhalf(),
	(Val.getBitWidth() == 16) ? Val : Val.trunc(16));

	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)FPV.convert(EVTToAPFloatSemantics(VT),
	APFloat::rmNearestTiesToEven, &Ignored);
	return getConstantFP(FPV, DL, VT);
	}
	}
	}

	// Constant fold unary operations with a floating point constant operand.
	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
	APFloat V = C->getValueAPF(); // make copy
	switch (Opcode) {
	case ISD::FNEG:
	V.changeSign();
	return getConstantFP(V, DL, VT);
	case ISD::FABS:
	V.clearSign();
	return getConstantFP(V, DL, VT);
	case ISD::FCEIL: {
	// Fold only when rounding reports opOK or opInexact.
	APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
	if (fs == APFloat::opOK \|\| fs == APFloat::opInexact)
	return getConstantFP(V, DL, VT);
	break;
	}
	case ISD::FTRUNC: {
	APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
	if (fs == APFloat::opOK \|\| fs == APFloat::opInexact)
	return getConstantFP(V, DL, VT);
	break;
	}
	case ISD::FFLOOR: {
	APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
	if (fs == APFloat::opOK \|\| fs == APFloat::opInexact)
	return getConstantFP(V, DL, VT);
	break;
	}
	case ISD::FP_EXTEND: {
	bool ignored;
	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)V.convert(EVTToAPFloatSemantics(VT),
	APFloat::rmNearestTiesToEven, &ignored);
	return getConstantFP(V, DL, VT);
	}
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: {
	bool ignored;
	// The APSInt is unsigned exactly when folding FP_TO_UINT.
	APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
	// FIXME need to be more flexible about rounding mode.
	APFloat::opStatus s =
	V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
	if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
	break;
	return getConstant(IntVal, DL, VT);
	}
	case ISD::BITCAST:
	// Only f16/f32/f64 -> same-width integer pairs are folded here.
	if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
	return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
	else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
	return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
	else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
	return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
	break;
	case ISD::FP_TO_FP16: {
	bool Ignored;
	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)V.convert(APFloat::IEEEhalf(),
	APFloat::rmNearestTiesToEven, &Ignored);
	return getConstant(V.bitcastToAPInt(), DL, VT);
	}
	}
	}

	// Constant fold unary operations with a vector integer or float operand.
	if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
	if (BV->isConstant()) {
	switch (Opcode) {
	default:
	// FIXME: Entirely reasonable to perform folding of other unary
	// operations here as the need arises.
	break;
	case ISD::FNEG:
	case ISD::FABS:
	case ISD::FCEIL:
	case ISD::FTRUNC:
	case ISD::FFLOOR:
	case ISD::FP_EXTEND:
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	case ISD::TRUNCATE:
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::SIGN_EXTEND:
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP:
	case ISD::ABS:
	case ISD::BITREVERSE:
	case ISD::BSWAP:
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	case ISD::CTPOP: {
	// If the vector fold fails, control continues with the generic
	// simplifications below.
	SDValue Ops = { Operand };
	if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
	return Fold;
	}
	}
	}
	}

	// No constant fold applied. Try opcode-specific simplifications, several
	// of which look at the opcode of the operand itself.
	unsigned OpOpcode = Operand.getNode()->getOpcode();
	switch (Opcode) {
	case ISD::TokenFactor:
	case ISD::MERGE_VALUES:
	case ISD::CONCAT_VECTORS:
	return Operand; // Factor, merge or concat of one node? No need.
	case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
	case ISD::FP_EXTEND:
	assert(VT.isFloatingPoint() &&
	Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
	if (Operand.getValueType() == VT) return Operand; // noop conversion.
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid fpext node, dst < src!");
	if (Operand.isUndef())
	return getUNDEF(VT);
	break;
	case ISD::SIGN_EXTEND:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid SIGN_EXTEND!");
	if (Operand.getValueType() == VT) return Operand; // noop extension
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid sext node, dst < src!");
	// (sext (sext x)) -> (sext x) and (sext (zext x)) -> (zext x)
	if (OpOpcode == ISD::SIGN_EXTEND \|\| OpOpcode == ISD::ZERO_EXTEND)
	return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
	else if (OpOpcode == ISD::UNDEF)
	// sext(undef) = 0, because the top bits will all be the same.
	return getConstant(0, DL, VT);
	break;
	case ISD::ZERO_EXTEND:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid ZERO_EXTEND!");
	if (Operand.getValueType() == VT) return Operand; // noop extension
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid zext node, dst < src!");
	if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
	return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
	else if (OpOpcode == ISD::UNDEF)
	// zext(undef) = 0, because the top bits will be zero.
	return getConstant(0, DL, VT);
	break;
	case ISD::ANY_EXTEND:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid ANY_EXTEND!");
	if (Operand.getValueType() == VT) return Operand; // noop extension
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid anyext node, dst < src!");

	if (OpOpcode == ISD::ZERO_EXTEND \|\| OpOpcode == ISD::SIGN_EXTEND \|\|
	OpOpcode == ISD::ANY_EXTEND)
	// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
	return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
	else if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);

	// (ext (trunc x)) -> x
	if (OpOpcode == ISD::TRUNCATE) {
	SDValue OpOp = Operand.getOperand(0);
	if (OpOp.getValueType() == VT)
	return OpOp;
	}
	break;
	case ISD::TRUNCATE:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid TRUNCATE!");
	if (Operand.getValueType() == VT) return Operand; // noop truncate
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsGT(VT) &&
	"Invalid truncate node, src < dst!");
	// (trunc (trunc x)) -> (trunc x)
	if (OpOpcode == ISD::TRUNCATE)
	return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
	if (OpOpcode == ISD::ZERO_EXTEND \|\| OpOpcode == ISD::SIGN_EXTEND \|\|
	OpOpcode == ISD::ANY_EXTEND) {
	// If the source is smaller than the dest, we still need an extend.
	if (Operand.getOperand(0).getValueType().getScalarType()
	.bitsLT(VT.getScalarType()))
	return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
	// If the source is larger than the dest, truncate it directly.
	if (Operand.getOperand(0).getValueType().bitsGT(VT))
	return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
	// Otherwise the extend and the truncate cancel out.
	return Operand.getOperand(0);
	}
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::ABS:
	assert(VT.isInteger() && VT == Operand.getValueType() &&
	"Invalid ABS!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::BSWAP:
	assert(VT.isInteger() && VT == Operand.getValueType() &&
	"Invalid BSWAP!");
	assert((VT.getScalarSizeInBits() % 16 == 0) &&
	"BSWAP types must be a multiple of 16 bits!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::BITREVERSE:
	assert(VT.isInteger() && VT == Operand.getValueType() &&
	"Invalid BITREVERSE!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::BITCAST:
	// Basic sanity checking.
	assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
	"Cannot BITCAST between types of different sizes!");
	if (VT == Operand.getValueType()) return Operand; // noop conversion.
	if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x)
	return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::SCALAR_TO_VECTOR:
	assert(VT.isVector() && !Operand.getValueType().isVector() &&
	(VT.getVectorElementType() == Operand.getValueType() \|\|
	(VT.getVectorElementType().isInteger() &&
	Operand.getValueType().isInteger() &&
	VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
	"Illegal SCALAR_TO_VECTOR node!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	// scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
	if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(Operand.getOperand(1)) &&
	Operand.getConstantOperandVal(1) == 0 &&
	Operand.getOperand(0).getValueType() == VT)
	return Operand.getOperand(0);
	break;
	case ISD::FNEG:
	// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
	if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
	// FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
	return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
	Operand.getOperand(0), Operand.getNode()->getFlags());
	if (OpOpcode == ISD::FNEG) // --X -> X
	return Operand.getOperand(0);
	break;
	case ISD::FABS:
	if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
	return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
	break;
	}

	// Nothing simplified: find an existing identical node or create a new one.
	SDNode *N;
	SDVTList VTs = getVTList(VT);
	SDValue Ops[] = {Operand};
	if (VT != MVT::Glue) { // Don't CSE flag producing nodes
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opcode, VTs, Ops);
	void *IP = nullptr;
	if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
	// Reuse the existing node, intersecting its flags with the requested ones.
	E->intersectFlagsWith(Flags);
	return SDValue(E, 0);
	}

	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	N->setFlags(Flags);
	createOperands(N, Ops);
	CSEMap.InsertNode(N, IP);
	} else {
	// Glue-producing nodes are created fresh and are not added to the CSE
	// map; note that Flags is not applied on this path.
	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	createOperands(N, Ops);
	}

	InsertNode(N);
	SDValue V = SDValue(N, 0);
	NewSDValueDbgMsg(V, "Creating new node: ", this);
	return V;
	}

	static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
	const APInt &C2) {
	switch (Opcode) {
	case ISD::ADD: return std::make_pair(C1 + C2, true);
	case ISD::SUB: return std::make_pair(C1 - C2, true);
	case ISD::MUL: return std::make_pair(C1 * C2, true);
	case ISD::AND: return std::make_pair(C1 & C2, true);
	case ISD::OR: return std::make_pair(C1 \| C2, true);
	case ISD::XOR: return std::make_pair(C1 ^ C2, true);
	case ISD::SHL: return std::make_pair(C1 << C2, true);
	case ISD::SRL: return std::make_pair(C1.lshr(C2), true);
	case ISD::SRA: return std::make_pair(C1.ashr(C2), true);
	case ISD::ROTL: return std::make_pair(C1.rotl(C2), true);
	case ISD::ROTR: return std::make_pair(C1.rotr(C2), true);
	case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true);
	case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true);
	case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true);
	case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true);
	case ISD::UDIV:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.udiv(C2), true);
	case ISD::UREM:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.urem(C2), true);
	case ISD::SDIV:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.sdiv(C2), true);
	case ISD::SREM:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.srem(C2), true);
	}
	return std::make_pair(APInt(1, 0), false);
	}

	/// Folds the binary operation \p Opcode applied to two scalar integer
	/// constants. Returns an empty SDValue when either constant is opaque or
	/// when FoldValue cannot evaluate the operation; otherwise returns the
	/// folded constant of type \p VT.
	SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
	                                             EVT VT, const ConstantSDNode *Cst1,
	                                             const ConstantSDNode *Cst2) {
	  // Opaque constants must not take part in folding.
	  const bool AnyOpaque = Cst1->isOpaque() || Cst2->isOpaque();
	  if (AnyOpaque)
	    return SDValue();

	  const APInt &LHS = Cst1->getAPIntValue();
	  const APInt &RHS = Cst2->getAPIntValue();
	  std::pair<APInt, bool> Result = FoldValue(Opcode, LHS, RHS);

	  return Result.second ? getConstant(Result.first, DL, VT) : SDValue();
	}

	/// Folds (add Sym, c) / (sub Sym, c) into a global address with an adjusted
	/// offset. Returns an empty SDValue when \p GA is not an ISD::GlobalAddress
	/// node, the target reports offset folding as not legal, \p N2 is not a
	/// constant, or \p Opcode is neither ADD nor SUB.
	SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
	                                       const GlobalAddressSDNode *GA,
	                                       const SDNode *N2) {
	  if (GA->getOpcode() != ISD::GlobalAddress)
	    return SDValue();
	  if (!TLI->isOffsetFoldingLegal(GA))
	    return SDValue();

	  const auto *OffsetNode = dyn_cast<ConstantSDNode>(N2);
	  if (!OffsetNode)
	    return SDValue();

	  if (Opcode != ISD::ADD && Opcode != ISD::SUB)
	    return SDValue();

	  // Work in unsigned arithmetic so that negation and addition wrap instead
	  // of overflowing.
	  uint64_t Delta = uint64_t(OffsetNode->getSExtValue());
	  if (Opcode == ISD::SUB)
	    Delta = uint64_t(0) - Delta;

	  return getGlobalAddress(GA->getGlobal(), SDLoc(OffsetNode), VT,
	                          GA->getOffset() + Delta);
	}

	bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
	switch (Opcode) {
	case ISD::SDIV:
	case ISD::UDIV:
	case ISD::SREM:
	case ISD::UREM: {
	// If a divisor is zero/undef or any element of a divisor vector is
	// zero/undef, the whole op is undef.
	assert(Ops.size() == 2 && "Div/rem should have 2 operands");
	SDValue Divisor = Ops[1];
	if (Divisor.isUndef() \|\| isNullConstant(Divisor))
	return true;

	return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
	llvm::any_of(Divisor->op_values(),
	[](SDValue V) { return V.isUndef() \|\|
	isNullConstant(V); });
	// TODO: Handle signed overflow.
	}
	// TODO: Handle oversized shifts.
	default:
	return false;
	}
	}

	/// Folds the binary operation \p Opcode applied to two constant-like nodes.
	///
	/// Handles, in order: operations that isUndef reports as undef, two scalar
	/// integer constants, a global address combined with a constant offset,
	/// and two build vectors folded element by element. Returns an empty
	/// SDValue when nothing can be folded.
	SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
	                                             EVT VT, SDNode *Cst1,
	                                             SDNode *Cst2) {
	  // If the opcode is a target-specific ISD node, there's nothing we can
	  // do here and the operand rules may not line up with the below, so
	  // bail early.
	  if (Opcode >= ISD::BUILTIN_OP_END)
	    return SDValue();

	  if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
	    return getUNDEF(VT);

	  // Two scalar constants are handled by the ConstantSDNode overload.
	  const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1);
	  const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2);
	  if (Scalar1 && Scalar2) {
	    SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2);
	    assert((!Folded || !VT.isVector()) &&
	           "Can't fold vectors ops with scalar operands");
	    return Folded;
	  }

	  // fold (add Sym, c) -> Sym+c
	  if (auto *Sym = dyn_cast<GlobalAddressSDNode>(Cst1))
	    return FoldSymbolOffset(Opcode, VT, Sym, Cst2);
	  if (TLI->isCommutativeBinOp(Opcode)) {
	    if (auto *Sym = dyn_cast<GlobalAddressSDNode>(Cst2))
	      return FoldSymbolOffset(Opcode, VT, Sym, Cst1);
	  }

	  // What remains is the vector case: both sides must be build vectors so
	  // that their elements can be folded one at a time.
	  auto *LHSVec = dyn_cast<BuildVectorSDNode>(Cst1);
	  auto *RHSVec = dyn_cast<BuildVectorSDNode>(Cst2);
	  if (!(LHSVec && RHSVec))
	    return SDValue();

	  assert(LHSVec->getNumOperands() == RHSVec->getNumOperands() &&
	         "Out of sync!");

	  // When only legal types may be created, results are produced in the type
	  // the target transforms the integer element type to; give up if that
	  // type is narrower than the element type.
	  const EVT EltVT = VT.getScalarType();
	  EVT LegalEltVT = EltVT;
	  if (NewNodesMustHaveLegalTypes && LegalEltVT.isInteger()) {
	    LegalEltVT = TLI->getTypeToTransformTo(*getContext(), LegalEltVT);
	    if (LegalEltVT.bitsLT(EltVT))
	      return SDValue();
	  }

	  SmallVector<SDValue, 4> Folded;
	  const unsigned NumOps = LHSVec->getNumOperands();
	  for (unsigned Idx = 0; Idx < NumOps; ++Idx) {
	    SDValue LHS = LHSVec->getOperand(Idx);
	    SDValue RHS = RHSVec->getOperand(Idx);

	    // Integer build vector operands may be wider than the element type;
	    // truncate them before folding.
	    if (EltVT.isInteger()) {
	      if (LHS->getValueType(0).bitsGT(EltVT))
	        LHS = getNode(ISD::TRUNCATE, DL, EltVT, LHS);
	      if (RHS->getValueType(0).bitsGT(EltVT))
	        RHS = getNode(ISD::TRUNCATE, DL, EltVT, RHS);
	    }

	    const bool TypesMatch =
	        LHS->getValueType(0) == EltVT && RHS->getValueType(0) == EltVT;
	    if (!TypesMatch)
	      return SDValue();

	    // Fold one vector element.
	    SDValue Elt = getNode(Opcode, DL, EltVT, LHS, RHS);
	    if (LegalEltVT != EltVT)
	      Elt = getNode(ISD::SIGN_EXTEND, DL, LegalEltVT, Elt);

	    // Scalar folding only succeeded if the result is a constant or UNDEF.
	    const bool IsFoldedConstant = Elt.isUndef() ||
	                                  Elt.getOpcode() == ISD::Constant ||
	                                  Elt.getOpcode() == ISD::ConstantFP;
	    if (!IsFoldedConstant)
	      return SDValue();
	    Folded.push_back(Elt);
	  }

	  assert(VT.getVectorNumElements() == Folded.size() &&
	         "Vector size mismatch!");

	  // We may have a vector type but a scalar result. Create a splat.
	  Folded.resize(VT.getVectorNumElements(), Folded.back());

	  // Build a big vector out of the scalar elements we generated. Note that
	  // the node is created with a default-constructed SDLoc, not DL.
	  return getBuildVector(VT, SDLoc(), Folded);
	}

	// TODO: Merge with FoldConstantArithmetic
	/// Folds the operation \p Opcode over constant vector operands lane by
	/// lane.
	///
	/// Every operand must be UNDEF, a CONDCODE node, or a constant build
	/// vector, and any vector operand must have as many elements as \p VT.
	/// Each lane is folded through the scalar getNode; if a lane does not fold
	/// to a constant or UNDEF the whole fold is abandoned and an empty SDValue
	/// is returned.
	SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
	                                                   const SDLoc &DL, EVT VT,
	                                                   ArrayRef<SDValue> Ops,
	                                                   const SDNodeFlags Flags) {
	  // If the opcode is a target-specific ISD node, there's nothing we can
	  // do here and the operand rules may not line up with the below, so
	  // bail early.
	  if (Opcode >= ISD::BUILTIN_OP_END)
	    return SDValue();

	  if (isUndef(Opcode, Ops))
	    return getUNDEF(VT);

	  // We can only fold vectors - maybe merge with FoldConstantArithmetic
	  // someday?
	  if (!VT.isVector())
	    return SDValue();

	  const unsigned LaneCount = VT.getVectorNumElements();

	  // Validate every operand: it must be foldable (UNDEF, CONDCODE or a
	  // constant build vector) and, if it is a vector, match the lane count.
	  for (const SDValue &Op : Ops) {
	    auto *BV = dyn_cast<BuildVectorSDNode>(Op);
	    const bool Foldable = Op.isUndef() ||
	                          Op.getOpcode() == ISD::CONDCODE ||
	                          (BV && BV->isConstant());
	    if (!Foldable)
	      return SDValue();

	    const EVT OpVT = Op.getValueType();
	    if (OpVT.isVector() && OpVT.getVectorNumElements() != LaneCount)
	      return SDValue();
	  }

	  // If we are comparing vectors, then the result needs to be a i1 boolean
	  // that is then sign-extended back to the legal result type.
	  const EVT ResultEltVT = VT.getScalarType();
	  EVT FoldVT = ResultEltVT;
	  if (Opcode == ISD::SETCC)
	    FoldVT = MVT::i1;

	  // Find legal integer scalar type for constant promotion and
	  // ensure that its scalar size is at least as large as source.
	  EVT LegalEltVT = ResultEltVT;
	  if (NewNodesMustHaveLegalTypes && LegalEltVT.isInteger()) {
	    LegalEltVT = TLI->getTypeToTransformTo(*getContext(), LegalEltVT);
	    if (LegalEltVT.bitsLT(ResultEltVT))
	      return SDValue();
	  }

	  // Constant fold each scalar lane separately.
	  SmallVector<SDValue, 4> Lanes;
	  for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
	    SmallVector<SDValue, 4> LaneOps;
	    for (SDValue Op : Ops) {
	      const EVT OpEltVT = Op.getValueType().getScalarType();

	      if (auto *SrcBV = dyn_cast<BuildVectorSDNode>(Op)) {
	        SDValue Elt = SrcBV->getOperand(Lane);
	        const EVT EltVT = Elt.getValueType();

	        // Build vector (integer) scalar operands may need implicit
	        // truncation - do this before constant folding.
	        if (EltVT.isInteger() && EltVT.bitsGT(OpEltVT))
	          Elt = getNode(ISD::TRUNCATE, DL, OpEltVT, Elt);

	        LaneOps.push_back(Elt);
	      } else {
	        // We've checked that this is UNDEF or a constant of some kind.
	        LaneOps.push_back(Op.isUndef() ? getUNDEF(OpEltVT) : Op);
	      }
	    }

	    // Constant fold the scalar operands.
	    SDValue LaneResult = getNode(Opcode, DL, FoldVT, LaneOps, Flags);

	    // Legalize the (integer) scalar constant if necessary.
	    if (LegalEltVT != FoldVT)
	      LaneResult = getNode(ISD::SIGN_EXTEND, DL, LegalEltVT, LaneResult);

	    // Scalar folding only succeeded if the result is a constant or UNDEF.
	    const bool IsFoldedConstant = LaneResult.isUndef() ||
	                                  LaneResult.getOpcode() == ISD::Constant ||
	                                  LaneResult.getOpcode() == ISD::ConstantFP;
	    if (!IsFoldedConstant)
	      return SDValue();
	    Lanes.push_back(LaneResult);
	  }

	  SDValue Result = getBuildVector(VT, DL, Lanes);
	  NewSDValueDbgMsg(Result, "New node fold constant vector: ", this);
	  return Result;
	}

	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	SDValue N1, SDValue N2, const SDNodeFlags Flags) {
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
	ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);

	// Canonicalize constant to RHS if commutative.
	if (TLI->isCommutativeBinOp(Opcode)) {
	if (N1C && !N2C) {
	std::swap(N1C, N2C);
	std::swap(N1, N2);
	} else if (N1CFP && !N2CFP) {
	std::swap(N1CFP, N2CFP);
	std::swap(N1, N2);
	}
	}

	switch (Opcode) {
	default: break;
	case ISD::TokenFactor:
	assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
	N2.getValueType() == MVT::Other && "Invalid token factor!");
	// Fold trivial token factors.
	if (N1.getOpcode() == ISD::EntryToken) return N2;
	if (N2.getOpcode() == ISD::EntryToken) return N1;
	if (N1 == N2) return N1;
	break;
	case ISD::CONCAT_VECTORS: {
	// Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
	SDValue Ops[] = {N1, N2};
	if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
	return V;
	break;
	}
	case ISD::AND:
	assert(VT.isInteger() && "This operator does not apply to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	// (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
	// worth handling here.
	if (N2C && N2C->isNullValue())
	return N2;
	if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
	return N1;
	break;
	case ISD::OR:
	case ISD::XOR:
	case ISD::ADD:
	case ISD::SUB:
	assert(VT.isInteger() && "This operator does not apply to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	// (X ^\|+- 0) -> X. This commonly occurs when legalizing i64 values, so
	// it's worth handling here.
	if (N2C && N2C->isNullValue())
	return N1;
	break;
	case ISD::UDIV:
	case ISD::UREM:
	case ISD::MULHU:
	case ISD::MULHS:
	case ISD::MUL:
	case ISD::SDIV:
	case ISD::SREM:
	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX:
	assert(VT.isInteger() && "This operator does not apply to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	break;
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	case ISD::FDIV:
	case ISD::FREM:
	if (getTarget().Options.UnsafeFPMath) {
	if (Opcode == ISD::FADD) {
	// x+0 --> x
	if (N2CFP && N2CFP->getValueAPF().isZero())
	return N1;
	} else if (Opcode == ISD::FSUB) {
	// x-0 --> x
	if (N2CFP && N2CFP->getValueAPF().isZero())
	return N1;
	} else if (Opcode == ISD::FMUL) {
	// x*0 --> 0
	if (N2CFP && N2CFP->isZero())
	return N2;
	// x*1 --> x
	if (N2CFP && N2CFP->isExactlyValue(1.0))
	return N1;
	}
	}
	assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	break;
	case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
	assert(N1.getValueType() == VT &&
	N1.getValueType().isFloatingPoint() &&
	N2.getValueType().isFloatingPoint() &&
	"Invalid FCOPYSIGN!");
	break;
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL:
	case ISD::ROTL:
	case ISD::ROTR:
	assert(VT == N1.getValueType() &&
	"Shift operators return type must be the same as their first arg");
	assert(VT.isInteger() && N2.getValueType().isInteger() &&
	"Shifts only work on integers");
	assert((!VT.isVector() \|\| VT == N2.getValueType()) &&
	"Vector shift amounts must be in the same as their first arg");
	// Verify that the shift amount VT is bit enough to hold valid shift
	// amounts. This catches things like trying to shift an i1024 value by an
	// i8, which is easy to fall into in generic code that uses
	// TLI.getShiftAmount().
	assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
	"Invalid use of small shift amount with oversized value!");

	// Always fold shifts of i1 values so the code generator doesn't need to
	// handle them. Since we know the size of the shift has to be less than the
	// size of the value, the shift/rotate count is guaranteed to be zero.
	if (VT == MVT::i1)
	return N1;
	if (N2C && N2C->isNullValue())
	return N1;
	break;
	case ISD::FP_ROUND_INREG: {
	EVT EVT = cast<VTSDNode>(N2)->getVT();
	assert(VT == N1.getValueType() && "Not an inreg round!");
	assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
	"Cannot FP_ROUND_INREG integer types");
	assert(EVT.isVector() == VT.isVector() &&
	"FP_ROUND_INREG type should be vector iff the operand "
	"type is vector!");
	assert((!EVT.isVector() \|\|
	EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
	"Vector element counts must match in FP_ROUND_INREG");
	assert(EVT.bitsLE(VT) && "Not rounding down!");
	(void)EVT;
	if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
	break;
	}
	case ISD::FP_ROUND:
	assert(VT.isFloatingPoint() &&
	N1.getValueType().isFloatingPoint() &&
	VT.bitsLE(N1.getValueType()) &&
	N2C && (N2C->getZExtValue() == 0 \|\| N2C->getZExtValue() == 1) &&
	"Invalid FP_ROUND!");
	if (N1.getValueType() == VT) return N1; // noop conversion.
	break;
	case ISD::AssertSext:
	case ISD::AssertZext: {
	EVT EVT = cast<VTSDNode>(N2)->getVT();
	assert(VT == N1.getValueType() && "Not an inreg extend!");
	assert(VT.isInteger() && EVT.isInteger() &&
	"Cannot *_EXTEND_INREG FP types");
	assert(!EVT.isVector() &&
	"AssertSExt/AssertZExt type should be the vector element type "
	"rather than the vector type!");
	assert(EVT.bitsLE(VT) && "Not extending!");
	if (VT == EVT) return N1; // noop assertion.
	break;
	}
	case ISD::SIGN_EXTEND_INREG: {
	EVT EVT = cast<VTSDNode>(N2)->getVT();
	assert(VT == N1.getValueType() && "Not an inreg extend!");
	assert(VT.isInteger() && EVT.isInteger() &&
	"Cannot *_EXTEND_INREG FP types");
	assert(EVT.isVector() == VT.isVector() &&
	"SIGN_EXTEND_INREG type should be vector iff the operand "
	"type is vector!");
	assert((!EVT.isVector() \|\|
	EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
	"Vector element counts must match in SIGN_EXTEND_INREG");
	assert(EVT.bitsLE(VT) && "Not extending!");
	if (EVT == VT) return N1; // Not actually extending

	auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
	unsigned FromBits = EVT.getScalarSizeInBits();
	Val <<= Val.getBitWidth() - FromBits;
	Val.ashrInPlace(Val.getBitWidth() - FromBits);
	return getConstant(Val, DL, ConstantVT);
	};

	if (N1C) {
	const APInt &Val = N1C->getAPIntValue();
	return SignExtendInReg(Val, VT);
	}
	if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
	SmallVector<SDValue, 8> Ops;
	llvm::EVT OpVT = N1.getOperand(0).getValueType();
	for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	SDValue Op = N1.getOperand(i);
	if (Op.isUndef()) {
	Ops.push_back(getUNDEF(OpVT));
	continue;
	}
	ConstantSDNode *C = cast<ConstantSDNode>(Op);
	APInt Val = C->getAPIntValue();
	Ops.push_back(SignExtendInReg(Val, OpVT));
	}
	return getBuildVector(VT, DL, Ops);
	}
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT:
	// EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
	if (N1.isUndef())
	return getUNDEF(VT);

	// EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
	if (N2C && N2C->getZExtValue() >= N1.getValueType().getVectorNumElements())
	return getUNDEF(VT);

	// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
	// expanding copies of large vectors from registers.
	if (N2C &&
	N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N1.getNumOperands() > 0) {
	unsigned Factor =
	N1.getOperand(0).getValueType().getVectorNumElements();
	return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	N1.getOperand(N2C->getZExtValue() / Factor),
	getConstant(N2C->getZExtValue() % Factor, DL,
	N2.getValueType()));
	}

	// EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
	// expanding large vector constants.
	if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
	SDValue Elt = N1.getOperand(N2C->getZExtValue());

	if (VT != Elt.getValueType())
	// If the vector element type is not legal, the BUILD_VECTOR operands
	// are promoted and implicitly truncated, and the result implicitly
	// extended. Make that explicit here.
	Elt = getAnyExtOrTrunc(Elt, DL, VT);

	return Elt;
	}

	// EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
	// operations are lowered to scalars.
	if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
	// If the indices are the same, return the inserted element else
	// if the indices are known different, extract the element from
	// the original vector.
	SDValue N1Op2 = N1.getOperand(2);
	ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);

	if (N1Op2C && N2C) {
	if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
	if (VT == N1.getOperand(1).getValueType())
	return N1.getOperand(1);
	else
	return getSExtOrTrunc(N1.getOperand(1), DL, VT);
	}

	return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
	}
	}

	// EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
	// when vector types are scalarized and v1iX is legal.
	// vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
	if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N1.getValueType().getVectorNumElements() == 1) {
	return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
	N1.getOperand(1));
	}
	break;
	case ISD::EXTRACT_ELEMENT:
	assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
	assert(!N1.getValueType().isVector() && !VT.isVector() &&
	(N1.getValueType().isInteger() == VT.isInteger()) &&
	N1.getValueType() != VT &&
	"Wrong types for EXTRACT_ELEMENT!");

	// EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
	// 64-bit integers into 32-bit parts. Instead of building the extract of
	// the BUILD_PAIR, only to have legalize rip it apart, just do it now.
	if (N1.getOpcode() == ISD::BUILD_PAIR)
	return N1.getOperand(N2C->getZExtValue());

	// EXTRACT_ELEMENT of a constant int is also very common.
	if (N1C) {
	unsigned ElementSize = VT.getSizeInBits();
	unsigned Shift = ElementSize * N2C->getZExtValue();
	APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift);
	return getConstant(ShiftedVal.trunc(ElementSize), DL, VT);
	}
	break;
	case ISD::EXTRACT_SUBVECTOR:
	if (VT.isSimple() && N1.getValueType().isSimple()) {
	assert(VT.isVector() && N1.getValueType().isVector() &&
	"Extract subvector VTs must be a vectors!");
	assert(VT.getVectorElementType() ==
	N1.getValueType().getVectorElementType() &&
	"Extract subvector VTs must have the same element type!");
	assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
	"Extract subvector must be from larger vector to smaller vector!");

	if (N2C) {
	assert((VT.getVectorNumElements() + N2C->getZExtValue()
	<= N1.getValueType().getVectorNumElements())
	&& "Extract subvector overflow!");
	}

	// Trivial extraction.
	if (VT.getSimpleVT() == N1.getSimpleValueType())
	return N1;

	// EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
	if (N1.isUndef())
	return getUNDEF(VT);

	// EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
	// the concat have the same type as the extract.
	if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N1.getNumOperands() > 0 &&
	VT == N1.getOperand(0).getValueType()) {
	unsigned Factor = VT.getVectorNumElements();
	return N1.getOperand(N2C->getZExtValue() / Factor);
	}

	// EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
	// during shuffle legalization.
	if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
	VT == N1.getOperand(1).getValueType())
	return N1.getOperand(1);
	}
	break;
	}

	// Perform trivial constant folding.
	if (SDValue SV =
	FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
	return SV;

	// Constant fold FP operations.
	bool HasFPExceptions = TLI->hasFloatingPointExceptions();
	if (N1CFP) {
	if (N2CFP) {
	APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
	APFloat::opStatus s;
	switch (Opcode) {
	case ISD::FADD:
	s = V1.add(V2, APFloat::rmNearestTiesToEven);
	if (!HasFPExceptions \|\| s != APFloat::opInvalidOp)
	return getConstantFP(V1, DL, VT);
	break;
	case ISD::FSUB:
	s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
	if (!HasFPExceptions \|\| s!=APFloat::opInvalidOp)
	return getConstantFP(V1, DL, VT);
	break;
	case ISD::FMUL:
	s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
	if (!HasFPExceptions \|\| s!=APFloat::opInvalidOp)
	return getConstantFP(V1, DL, VT);
	break;
	case ISD::FDIV:
	s = V1.divide(V2, APFloat::rmNearestTiesToEven);
	if (!HasFPExceptions \|\| (s!=APFloat::opInvalidOp &&
	s!=APFloat::opDivByZero)) {
	return getConstantFP(V1, DL, VT);
	}
	break;
	case ISD::FREM :
	s = V1.mod(V2);
	if (!HasFPExceptions \|\| (s!=APFloat::opInvalidOp &&
	s!=APFloat::opDivByZero)) {
	return getConstantFP(V1, DL, VT);
	}
	break;
	case ISD::FCOPYSIGN:
	V1.copySign(V2);
	return getConstantFP(V1, DL, VT);
	default: break;
	}
	}

	if (Opcode == ISD::FP_ROUND) {
	APFloat V = N1CFP->getValueAPF(); // make copy
	bool ignored;
	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)V.convert(EVTToAPFloatSemantics(VT),
	APFloat::rmNearestTiesToEven, &ignored);
	return getConstantFP(V, DL, VT);
	}
	}

	// Canonicalize an UNDEF to the RHS, even over a constant.
	if (N1.isUndef()) {
	if (TLI->isCommutativeBinOp(Opcode)) {
	std::swap(N1, N2);
	} else {
	switch (Opcode) {
	case ISD::FP_ROUND_INREG:
	case ISD::SIGN_EXTEND_INREG:
	case ISD::SUB:
	case ISD::FSUB:
	case ISD::FDIV:
	case ISD::FREM:
	case ISD::SRA:
	return N1; // fold op(undef, arg2) -> undef
	case ISD::UDIV:
	case ISD::SDIV:
	case ISD::UREM:
	case ISD::SREM:
	case ISD::SRL:
	case ISD::SHL:
	if (!VT.isVector())
	return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
	// For vectors, we can't easily build an all zero vector, just return
	// the LHS.
	return N2;
	}
	}
	}

	// Fold a bunch of operators when the RHS is undef.
	if (N2.isUndef()) {
	switch (Opcode) {
	case ISD::XOR:
	if (N1.isUndef())
	// Handle undef ^ undef -> 0 special case. This is a common
	// idiom (misuse).
	return getConstant(0, DL, VT);
	LLVM_FALLTHROUGH;
	case ISD::ADD:
	case ISD::ADDC:
	case ISD::ADDE:
	case ISD::SUB:
	case ISD::UDIV:
	case ISD::SDIV:
	case ISD::UREM:
	case ISD::SREM:
	return N2; // fold op(arg1, undef) -> undef
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	case ISD::FDIV:
	case ISD::FREM:
	if (getTarget().Options.UnsafeFPMath)
	return N2;
	break;
	case ISD::MUL:
	case ISD::AND:
	case ISD::SRL:
	case ISD::SHL:
	if (!VT.isVector())
	return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
	// For vectors, we can't easily build an all zero vector, just return
	// the LHS.
	return N1;
	case ISD::OR:
	if (!VT.isVector())
	return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
	// For vectors, we can't easily build an all one vector, just return
	// the LHS.
	return N1;
	case ISD::SRA:
	return N1;
	}
	}

	// Memoize this node if possible.
	SDNode *N;
	SDVTList VTs = getVTList(VT);
	SDValue Ops[] = {N1, N2};
	if (VT != MVT::Glue) {
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opcode, VTs, Ops);
	void *IP = nullptr;
	if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
	E->intersectFlagsWith(Flags);
	return SDValue(E, 0);
	}

	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	N->setFlags(Flags);
	createOperands(N, Ops);
	CSEMap.InsertNode(N, IP);
	} else {
	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	createOperands(N, Ops);
	}

	InsertNode(N);
	SDValue V = SDValue(N, 0);
	NewSDValueDbgMsg(V, "Creating new node: ", this);
	return V;
	}

	/// Create a three-operand node, or return an equivalent existing one.
	///
	/// Before a node is built a few opcodes are simplified on the spot:
	///  - FMA of three FP constants is constant folded,
	///  - CONCAT_VECTORS and SETCC are handed to their dedicated folders,
	///  - SELECT with a constant condition, or with identical arms, collapses
	///    to one of its arms,
	///  - INSERT_VECTOR_ELT at a constant out-of-range index becomes UNDEF,
	///  - a trivial INSERT_SUBVECTOR or BITCAST returns its operand.
	/// VECTOR_SHUFFLE must not be requested through this entry point.
	///
	/// Nodes whose result type is MVT::Glue are never memoized; everything else
	/// is looked up in (and inserted into) the CSE map.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue N1, SDValue N2, SDValue N3) {
	  // Perform various simplifications.
	  switch (Opcode) {
	  case ISD::FMA: {
	    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
	    ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
	    if (N1CFP && N2CFP && N3CFP) {
	      // V1 is a copy because fusedMultiplyAdd updates it in place.
	      APFloat V1 = N1CFP->getValueAPF();
	      const APFloat &V2 = N2CFP->getValueAPF();
	      const APFloat &V3 = N3CFP->getValueAPF();
	      APFloat::opStatus s =
	        V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
	      // Only fold an invalid operation when the target does not model FP
	      // exceptions.
	      if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp)
	        return getConstantFP(V1, DL, VT);
	    }
	    break;
	  }
	  case ISD::CONCAT_VECTORS: {
	    // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
	    SDValue Ops[] = {N1, N2, N3};
	    if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
	      return V;
	    break;
	  }
	  case ISD::SETCC: {
	    // Use FoldSetCC to simplify SETCC's.
	    if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
	      return V;
	    // Vector constant folding.
	    SDValue Ops[] = {N1, N2, N3};
	    if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) {
	      NewSDValueDbgMsg(V, "New node vector constant folding: ", this);
	      return V;
	    }
	    break;
	  }
	  case ISD::SELECT:
	    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
	      if (N1C->getZExtValue())
	        return N2;             // select true, X, Y -> X
	      return N3;               // select false, X, Y -> Y
	    }

	    if (N2 == N3) return N2;   // select C, X, X -> X
	    break;
	  case ISD::VECTOR_SHUFFLE:
	    llvm_unreachable("should use getVectorShuffle constructor!");
	  case ISD::INSERT_VECTOR_ELT: {
	    ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
	    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
	    if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
	      return getUNDEF(VT);
	    break;
	  }
	  case ISD::INSERT_SUBVECTOR: {
	    SDValue Index = N3;
	    // The checks and the fold below are only attempted for simple types.
	    if (VT.isSimple() && N1.getValueType().isSimple()
	        && N2.getValueType().isSimple()) {
	      assert(VT.isVector() && N1.getValueType().isVector() &&
	             N2.getValueType().isVector() &&
	             "Insert subvector VTs must be a vectors");
	      assert(VT == N1.getValueType() &&
	             "Dest and insert subvector source types must match!");
	      assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
	             "Insert subvector must be from smaller vector to larger vector!");
	      if (isa<ConstantSDNode>(Index)) {
	        assert((N2.getValueType().getVectorNumElements() +
	                cast<ConstantSDNode>(Index)->getZExtValue()
	                <= VT.getVectorNumElements())
	               && "Insert subvector overflow!");
	      }

	      // Trivial insertion.
	      if (VT.getSimpleVT() == N2.getSimpleValueType())
	        return N2;
	    }
	    break;
	  }
	  case ISD::BITCAST:
	    // Fold bit_convert nodes from a type to themselves.
	    if (N1.getValueType() == VT)
	      return N1;
	    break;
	  }

	  // Memoize node if it doesn't produce a flag.
	  SDNode *N;
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = {N1, N2, N3};
	  if (VT != MVT::Glue) {
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, Opcode, VTs, Ops);
	    void *IP = nullptr;
	    // Reuse an identical node if one is already in the CSE map.
	    if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
	      return SDValue(E, 0);

	    N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	    createOperands(N, Ops);
	    CSEMap.InsertNode(N, IP);
	  } else {
	    N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	    createOperands(N, Ops);
	  }

	  InsertNode(N);
	  SDValue V = SDValue(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Four-operand convenience overload: gathers the operands into an array and
	/// forwards to the array-taking getNode overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
	  SDValue Operands[4] = {N1, N2, N3, N4};
	  return getNode(Opcode, DL, VT, Operands);
	}

	/// Five-operand convenience overload: gathers the operands into an array and
	/// forwards to the array-taking getNode overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4,
	                              SDValue N5) {
	  SDValue Operands[5] = {N1, N2, N3, N4, N5};
	  return getNode(Opcode, DL, VT, Operands);
	}

	/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
	/// the incoming stack arguments to be loaded from the stack.
	///
	/// \param Chain Chain placed first in the resulting TokenFactor.
	/// \returns A TokenFactor over \p Chain and the output chain of every load
	/// that uses the entry node and reads from a frame index with a negative
	/// index.
	SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
	  SmallVector<SDValue, 8> ArgChains;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack argument.
	  for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
	       UE = getEntryNode().getNode()->use_end(); U != UE; ++U)
	    // The iterator is dereferenced to obtain the using node itself.
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
	        if (FI->getIndex() < 0)
	          // Result number 1 of the load is its output chain.
	          ArgChains.push_back(SDValue(L, 1));

	  // Build a tokenfactor for all the chains.
	  return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// getMemsetValue - Produce the value to store for a memset whose store type
	/// is \p VT, by replicating the fill byte \p Value across every byte of VT.
	///
	/// A constant fill byte is splatted at compile time. Otherwise the byte is
	/// zero-extended, multiplied by 0x0101... to replicate it, and then bitcast
	/// and/or splatted into a vector as needed to reach \p VT.
	static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl) {
	  assert(!Value.isUndef());

	  const unsigned ScalarBits = VT.getScalarSizeInBits();

	  // Constant fill byte: build the replicated constant directly.
	  if (ConstantSDNode *FillC = dyn_cast<ConstantSDNode>(Value)) {
	    assert(FillC->getAPIntValue().getBitWidth() == 8);
	    APInt Splat = APInt::getSplat(ScalarBits, FillC->getAPIntValue());
	    if (!VT.isInteger())
	      return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Splat),
	                               dl, VT);
	    return DAG.getConstant(Splat, dl, VT);
	  }

	  assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");

	  // Do the arithmetic in an integer type as wide as VT's scalar type.
	  EVT WorkVT = VT.getScalarType();
	  if (!WorkVT.isInteger())
	    WorkVT = EVT::getIntegerVT(*DAG.getContext(), WorkVT.getSizeInBits());

	  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, WorkVT, Value);
	  if (ScalarBits > 8) {
	    // Multiplying by 0x010101... copies the low byte into every byte of
	    // the wider integer.
	    APInt ByteRepeat = APInt::getSplat(ScalarBits, APInt(8, 0x01));
	    SDValue RepeatC = DAG.getConstant(ByteRepeat, dl, WorkVT);
	    Value = DAG.getNode(ISD::MUL, dl, WorkVT, Value, RepeatC);
	  }

	  // Reinterpret as the non-integer scalar type if that is what VT holds.
	  if (!VT.isInteger() && VT != Value.getValueType())
	    Value = DAG.getBitcast(VT.getScalarType(), Value);

	  // Still not VT: splat the scalar across a vector of type VT.
	  if (VT != Value.getValueType())
	    Value = DAG.getSplatBuildVector(VT, dl, Value);

	  return Value;
	}

	/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
	/// used when a memcpy is turned into a memset when the source is a constant
	/// string ptr.
	///
	/// \param VT    Type of the store being materialized.
	/// \param Slice Constant source bytes; a null Slice.Array means all zeros.
	/// \returns The constant to store, or an empty SDValue when the target
	/// prefers loading the constant over materializing it as an immediate.
	static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
	                                  const TargetLowering &TLI,
	                                  const ConstantDataArraySlice &Slice) {
	  // Handle vector with all elements zero.
	  if (Slice.Array == nullptr) {
	    if (VT.isInteger())
	      return DAG.getConstant(0, dl, VT);
	    else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
	      return DAG.getConstantFP(0.0, dl, VT);
	    else if (VT.isVector()) {
	      // Build an integer zero vector with the same element count and
	      // bitcast it to VT. Elements that are not f32 use i64.
	      unsigned NumElts = VT.getVectorNumElements();
	      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
	      return DAG.getNode(ISD::BITCAST, dl, VT,
	                         DAG.getConstant(0, dl,
	                                         EVT::getVectorVT(*DAG.getContext(),
	                                                          EltVT, NumElts)));
	    } else
	      llvm_unreachable("Expected type!");
	  }

	  assert(!VT.isVector() && "Can't handle vector type here!");
	  unsigned NumVTBits = VT.getSizeInBits();
	  unsigned NumVTBytes = NumVTBits / 8;
	  // Never read past the end of the slice; missing bytes stay zero.
	  unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));

	  // Assemble the bytes into an integer in the target's byte order.
	  APInt Val(NumVTBits, 0);
	  if (DAG.getDataLayout().isLittleEndian()) {
	    for (unsigned i = 0; i != NumBytes; ++i)
	      Val |= (uint64_t)(unsigned char)Slice[i] << i*8;
	  } else {
	    for (unsigned i = 0; i != NumBytes; ++i)
	      Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8;
	  }

	  // If the "cost" of materializing the integer immediate is less than the cost
	  // of a load, then it is cost effective to turn the load into the immediate.
	  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
	  if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
	    return DAG.getConstant(Val, dl, VT);
	  return SDValue(nullptr, 0);
	}

	/// Return an ADD of \p Base and the constant \p Offset, computed in the
	/// value type of \p Base.
	SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
	                                           const SDLoc &DL) {
	  const EVT BaseVT = Base.getValueType();
	  SDValue OffsetC = getConstant(Offset, DL, BaseVT);
	  return getNode(ISD::ADD, DL, BaseVT, Base, OffsetC);
	}

	/// Returns true if memcpy source is constant data.
	///
	/// Two address shapes are recognized: a bare GlobalAddress, and an ADD of a
	/// GlobalAddress (operand 0) and a Constant (operand 1). On a match the
	/// result comes from getConstantDataArrayInfo, which receives \p Slice.
	static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
	  GlobalAddressSDNode *GA = nullptr;
	  uint64_t ExtraOffset = 0;

	  switch (Src.getOpcode()) {
	  case ISD::GlobalAddress:
	    GA = cast<GlobalAddressSDNode>(Src);
	    break;
	  case ISD::ADD:
	    if (Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
	        Src.getOperand(1).getOpcode() == ISD::Constant) {
	      GA = cast<GlobalAddressSDNode>(Src.getOperand(0));
	      ExtraOffset = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
	    }
	    break;
	  default:
	    break;
	  }

	  // Anything else is not a recognizable constant source.
	  if (GA == nullptr)
	    return false;

	  // The byte offset into the global is the explicit addend plus the offset
	  // carried by the GlobalAddress node itself.
	  return getConstantDataArrayInfo(GA->getGlobal(), Slice, 8,
	                                  ExtraOffset + GA->getOffset());
	}

	/// Determines the optimal series of memory ops to replace the memset / memcpy.
	/// Return true if the number of memory ops is below the threshold (Limit).
	/// It returns the types of the sequence of memory ops to perform
	/// memset / memcpy by reference.
	///
	/// \param MemOps       Receives one value type per memory operation, in
	///                     order. It may hold a partial sequence when false is
	///                     returned.
	/// \param Limit        Maximum number of memory operations allowed.
	/// \param Size         Number of bytes to cover.
	/// \param DstAlign     Required destination alignment, or 0 if the
	///                     destination alignment may still be changed.
	/// \param SrcAlign     Inferred source alignment, or 0 if no load of the
	///                     source is needed.
	/// \param AllowOverlap Permit a final unaligned access that overlaps the
	///                     previous one instead of several smaller accesses.
	/// \param DstAS        Destination address space, used for the misaligned
	///                     access queries.
	/// \param SrcAS        Source address space (not consulted in this body).
	static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
	                                     unsigned Limit, uint64_t Size,
	                                     unsigned DstAlign, unsigned SrcAlign,
	                                     bool IsMemset,
	                                     bool ZeroMemset,
	                                     bool MemcpyStrSrc,
	                                     bool AllowOverlap,
	                                     unsigned DstAS, unsigned SrcAS,
	                                     SelectionDAG &DAG,
	                                     const TargetLowering &TLI) {
	  assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
	         "Expecting memcpy / memset source to meet alignment requirement!");
	  // If 'SrcAlign' is zero, that means the memory operation does not need to
	  // load the value, i.e. memset or memcpy from constant string. Otherwise,
	  // it's the inferred alignment of the source. 'DstAlign', on the other hand,
	  // is the specified alignment of the memory operation. If it is zero, that
	  // means it's possible to change the alignment of the destination.
	  // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	  // not need to be loaded.
	  EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
	                                   IsMemset, ZeroMemset, MemcpyStrSrc,
	                                   DAG.getMachineFunction());

	  // MVT::Other means the target expressed no preference.
	  if (VT == MVT::Other) {
	    // Use the largest integer type whose alignment constraints are satisfied.
	    // We only need to check DstAlign here as SrcAlign is always greater or
	    // equal to DstAlign (or zero).
	    VT = MVT::i64;
	    // Step down through the simple value types until the access is either
	    // naturally aligned or the target allows it misaligned.
	    while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
	           !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
	      VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
	    assert(VT.isInteger());

	    // Find the largest legal integer type.
	    MVT LVT = MVT::i64;
	    while (!TLI.isTypeLegal(LVT))
	      LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
	    assert(LVT.isInteger());

	    // If the type we've chosen is larger than the largest legal integer type
	    // then use that instead.
	    if (VT.bitsGT(LVT))
	      VT = LVT;
	  }

	  unsigned NumMemOps = 0;
	  while (Size != 0) {
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    // The current type is wider than what is left: shrink it, or reuse it
	    // as an overlapping access.
	    while (VTSize > Size) {
	      // For now, only use non-vector load / store's for the left-over pieces.
	      EVT NewVT = VT;
	      unsigned NewVTSize;

	      bool Found = false;
	      if (VT.isVector() || VT.isFloatingPoint()) {
	        NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
	        if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) &&
	            TLI.isSafeMemOpType(NewVT.getSimpleVT()))
	          Found = true;
	        else if (NewVT == MVT::i64 &&
	                 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
	                 TLI.isSafeMemOpType(MVT::f64)) {
	          // i64 is usually not legal on 32-bit targets, but f64 may be.
	          NewVT = MVT::f64;
	          Found = true;
	        }
	      }

	      if (!Found) {
	        // Walk down the simple value types until one is safe for memory
	        // ops; i8 is accepted unconditionally.
	        do {
	          NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
	          if (NewVT == MVT::i8)
	            break;
	        } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT()));
	      }
	      NewVTSize = NewVT.getSizeInBits() / 8;

	      // If the new VT cannot cover all of the remaining bits, then consider
	      // issuing a (or a pair of) unaligned and overlapping load / store.
	      // FIXME: Only does this for 64-bit or more since we don't have proper
	      // cost model for unaligned load / store.
	      bool Fast;
	      if (NumMemOps && AllowOverlap &&
	          VTSize >= 8 && NewVTSize < Size &&
	          TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast)
	        // Keep VT and treat it as covering exactly the remainder; this
	        // ends the inner loop and makes this the last operation.
	        VTSize = Size;
	      else {
	        VT = NewVT;
	        VTSize = NewVTSize;
	      }
	    }

	    if (++NumMemOps > Limit)
	      return false;

	    MemOps.push_back(VT);
	    Size -= VTSize;
	  }

	  return true;
	}

	/// Decide whether memory-intrinsic lowering for \p MF should favor size.
	/// On Darwin only MinSize counts; elsewhere any optimize-for-size setting
	/// does.
	static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
	  // On Darwin, -Os means optimize for size without hurting performance, so
	  // only really optimize for size when -Oz (MinSize) is used.
	  const bool OnDarwin = MF.getTarget().getTargetTriple().isOSDarwin();
	  return OnDarwin ? MF.getFunction().optForMinSize()
	                  : MF.getFunction().optForSize();
	}

	/// Lower a constant-size memcpy into an explicit sequence of loads and
	/// stores.
	///
	/// \param Chain        Incoming chain; returned unchanged when \p Src is
	///                     undef.
	/// \param Dst          Destination pointer.
	/// \param Src          Source pointer.
	/// \param Size         Number of bytes to copy.
	/// \param Align        Alignment passed to the generated stores; it may be
	///                     raised when \p Dst is a non-fixed stack object.
	/// \param isVol        Mark the generated memory operations volatile.
	/// \param AlwaysInline Ignore the target's store-count limit.
	/// \returns A TokenFactor over the generated chains, or an empty SDValue
	/// when the copy would need more operations than the limit allows.
	static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
	                                       SDValue Chain, SDValue Dst, SDValue Src,
	                                       uint64_t Size, unsigned Align,
	                                       bool isVol, bool AlwaysInline,
	                                       MachinePointerInfo DstPtrInfo,
	                                       MachinePointerInfo SrcPtrInfo) {
	  // Turn a memcpy of undef to nop.
	  if (Src.isUndef())
	    return Chain;

	  // Expand memcpy to a series of load and store ops if the size operand falls
	  // below a certain threshold.
	  // TODO: In the AlwaysInline case, if the size is big then generate a loop
	  // rather than maybe a humongous number of loads and stores.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();
	  LLVMContext &C = *DAG.getContext();
	  std::vector<EVT> MemOps;
	  bool DstAlignCanChange = false;
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool OptSize = shouldLowerMemFuncForSize(MF);
	  // A destination that is a non-fixed stack object can be re-aligned.
	  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
	  if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
	    DstAlignCanChange = true;
	  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
	  if (Align > SrcAlign)
	    SrcAlign = Align;
	  ConstantDataArraySlice Slice;
	  bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
	  bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
	  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);

	  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
	                                (DstAlignCanChange ? 0 : Align),
	                                (isZeroConstant ? 0 : SrcAlign),
	                                false, false, CopyFromConstant, true,
	                                DstPtrInfo.getAddrSpace(),
	                                SrcPtrInfo.getAddrSpace(),
	                                DAG, TLI))
	    return SDValue();

	  if (DstAlignCanChange) {
	    Type *Ty = MemOps[0].getTypeForEVT(C);
	    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);

	    // Don't promote to an alignment that would require dynamic stack
	    // realignment.
	    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
	    if (!TRI->needsStackRealignment(MF))
	      while (NewAlign > Align &&
	             DL.exceedsNaturalStackAlignment(NewAlign))
	        NewAlign /= 2;

	    if (NewAlign > Align) {
	      // Give the stack frame object a larger alignment if needed.
	      if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
	        MFI.setObjectAlignment(FI->getIndex(), NewAlign);
	      Align = NewAlign;
	    }
	  }

	  MachineMemOperand::Flags MMOFlags =
	      isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
	  SmallVector<SDValue, 8> OutChains;
	  unsigned NumMemOps = MemOps.size();
	  uint64_t SrcOff = 0, DstOff = 0;
	  for (unsigned i = 0; i != NumMemOps; ++i) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    SDValue Value, Store;

	    if (VTSize > Size) {
	      // Issuing an unaligned load / store pair that overlaps with the previous
	      // pair. Adjust the offset accordingly.
	      assert(i == NumMemOps-1 && i != 0);
	      SrcOff -= VTSize - Size;
	      DstOff -= VTSize - Size;
	    }

	    if (CopyFromConstant &&
	        (isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
	      // It's unlikely a store of a vector immediate can be done in a single
	      // instruction. It would require a load from a constantpool first.
	      // We only handle zero vectors here.
	      // FIXME: Handle other cases where store of vector immediate is done in
	      // a single instruction.
	      ConstantDataArraySlice SubSlice;
	      if (SrcOff < Slice.Length) {
	        SubSlice = Slice;
	        SubSlice.move(SrcOff);
	      } else {
	        // This is an out-of-bounds access and hence UB. Pretend we read zero.
	        SubSlice.Array = nullptr;
	        SubSlice.Offset = 0;
	        SubSlice.Length = VTSize;
	      }
	      Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
	      // An empty Value means no immediate was produced; fall through to
	      // the load / store pair below.
	      if (Value.getNode())
	        Store = DAG.getStore(Chain, dl, Value,
	                             DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	                             DstPtrInfo.getWithOffset(DstOff), Align,
	                             MMOFlags);
	    }

	    if (!Store.getNode()) {
	      // The type might not be legal for the target.  This should only happen
	      // if the type is smaller than a legal type, as on PPC, so the right
	      // thing to do is generate a LoadExt/StoreTrunc pair.  These simplify
	      // to Load/Store if NVT==VT.
	      // FIXME does the case above also need this?
	      EVT NVT = TLI.getTypeToTransformTo(C, VT);
	      assert(NVT.bitsGE(VT));

	      bool isDereferenceable =
	        SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
	      MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
	      if (isDereferenceable)
	        SrcMMOFlags |= MachineMemOperand::MODereferenceable;

	      Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
	                             DAG.getMemBasePlusOffset(Src, SrcOff, dl),
	                             SrcPtrInfo.getWithOffset(SrcOff), VT,
	                             MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
	      // The load's output chain is recorded as well as the store's.
	      OutChains.push_back(Value.getValue(1));
	      Store = DAG.getTruncStore(
	          Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	          DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
	    }
	    OutChains.push_back(Store);
	    SrcOff += VTSize;
	    DstOff += VTSize;
	    Size -= VTSize;
	  }

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}

	/// Lower a constant-size memmove into an explicit sequence of loads and
	/// stores.
	///
	/// All loads are emitted first and joined by a TokenFactor; every store is
	/// then chained on that TokenFactor, so no store precedes any load.
	///
	/// \param Chain        Incoming chain; returned unchanged when \p Src is
	///                     undef.
	/// \param Size         Number of bytes to move.
	/// \param Align        Alignment passed to the generated stores; it may be
	///                     raised when \p Dst is a non-fixed stack object.
	/// \param isVol        Mark the generated memory operations volatile.
	/// \param AlwaysInline Ignore the target's store-count limit.
	/// \returns A TokenFactor over the store chains, or an empty SDValue when
	/// the move would need more operations than the limit allows.
	static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
	                                        SDValue Chain, SDValue Dst, SDValue Src,
	                                        uint64_t Size, unsigned Align,
	                                        bool isVol, bool AlwaysInline,
	                                        MachinePointerInfo DstPtrInfo,
	                                        MachinePointerInfo SrcPtrInfo) {
	  // Turn a memmove of undef to nop.
	  if (Src.isUndef())
	    return Chain;

	  // Expand memmove to a series of load and store ops if the size operand falls
	  // below a certain threshold.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();
	  LLVMContext &C = *DAG.getContext();
	  std::vector<EVT> MemOps;
	  bool DstAlignCanChange = false;
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool OptSize = shouldLowerMemFuncForSize(MF);
	  // A destination that is a non-fixed stack object can be re-aligned.
	  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
	  if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
	    DstAlignCanChange = true;
	  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
	  if (Align > SrcAlign)
	    SrcAlign = Align;
	  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);

	  // Overlapping accesses are not requested here (AllowOverlap is false).
	  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
	                                (DstAlignCanChange ? 0 : Align), SrcAlign,
	                                false, false, false, false,
	                                DstPtrInfo.getAddrSpace(),
	                                SrcPtrInfo.getAddrSpace(),
	                                DAG, TLI))
	    return SDValue();

	  if (DstAlignCanChange) {
	    Type *Ty = MemOps[0].getTypeForEVT(C);
	    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
	    if (NewAlign > Align) {
	      // Give the stack frame object a larger alignment if needed.
	      if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
	        MFI.setObjectAlignment(FI->getIndex(), NewAlign);
	      Align = NewAlign;
	    }
	  }

	  MachineMemOperand::Flags MMOFlags =
	      isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
	  uint64_t SrcOff = 0, DstOff = 0;
	  SmallVector<SDValue, 8> LoadValues;
	  SmallVector<SDValue, 8> LoadChains;
	  SmallVector<SDValue, 8> OutChains;
	  unsigned NumMemOps = MemOps.size();

	  // Phase 1: load every piece of the source.
	  for (unsigned i = 0; i < NumMemOps; i++) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    SDValue Value;

	    bool isDereferenceable =
	      SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
	    MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
	    if (isDereferenceable)
	      SrcMMOFlags |= MachineMemOperand::MODereferenceable;

	    Value =
	        DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
	                    SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags);
	    LoadValues.push_back(Value);
	    LoadChains.push_back(Value.getValue(1));
	    SrcOff += VTSize;
	  }

	  // Phase 2: store the loaded pieces, each chained on all of the loads.
	  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
	  for (unsigned i = 0; i < NumMemOps; i++) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    SDValue Store;

	    Store = DAG.getStore(Chain, dl, LoadValues[i],
	                         DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	                         DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
	    OutChains.push_back(Store);
	    DstOff += VTSize;
	  }

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}

	/// \brief Lower the call to 'memset' intrinsic function into a series of store
	/// operations.
	///
	/// \param DAG Selection DAG where lowered code is placed.
	/// \param dl Link to corresponding IR location.
	/// \param Chain Control flow dependency.
	/// \param Dst Pointer to destination memory location.
	/// \param Src Value of byte to write into the memory.
	/// \param Size Number of bytes to write.
	/// \param Align Alignment of the destination in bytes.
	/// \param isVol True if destination is volatile.
	/// \param DstPtrInfo IR information on the memory pointer.
	/// \returns New head in the control flow, if lowering was successful, empty
	/// SDValue otherwise.
	///
	/// The function tries to replace 'llvm.memset' intrinsic with several store
	/// operations and value calculation code. This is usually profitable for small
	/// memory size.
	static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
	                               SDValue Chain, SDValue Dst, SDValue Src,
	                               uint64_t Size, unsigned Align, bool isVol,
	                               MachinePointerInfo DstPtrInfo) {
	  // Turn a memset of undef to nop.
	  if (Src.isUndef())
	    return Chain;

	  // Expand memset to a series of store ops if the size operand
	  // falls below a certain threshold.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  std::vector<EVT> MemOps;
	  bool DstAlignCanChange = false;
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool OptSize = shouldLowerMemFuncForSize(MF);
	  // A destination that is a non-fixed stack object can be re-aligned.
	  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
	  if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
	    DstAlignCanChange = true;
	  bool IsZeroVal =
	    isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
	  // There is no source pointer for memset, hence SrcAlign 0 and a dummy
	  // source address space.
	  if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
	                                Size, (DstAlignCanChange ? 0 : Align), 0,
	                                true, IsZeroVal, false, true,
	                                DstPtrInfo.getAddrSpace(), ~0u,
	                                DAG, TLI))
	    return SDValue();

	  if (DstAlignCanChange) {
	    Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
	    unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
	    if (NewAlign > Align) {
	      // Give the stack frame object a larger alignment if needed.
	      if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
	        MFI.setObjectAlignment(FI->getIndex(), NewAlign);
	      Align = NewAlign;
	    }
	  }

	  SmallVector<SDValue, 8> OutChains;
	  uint64_t DstOff = 0;
	  unsigned NumMemOps = MemOps.size();

	  // Find the largest store and generate the bit pattern for it.
	  EVT LargestVT = MemOps[0];
	  for (unsigned i = 1; i < NumMemOps; i++)
	    if (MemOps[i].bitsGT(LargestVT))
	      LargestVT = MemOps[i];
	  SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);

	  for (unsigned i = 0; i < NumMemOps; i++) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    if (VTSize > Size) {
	      // Issuing an unaligned store that overlaps with the previous
	      // store. Adjust the offset accordingly.
	      assert(i == NumMemOps-1 && i != 0);
	      DstOff -= VTSize - Size;
	    }

	    // If this store is smaller than the largest store see whether we can get
	    // the smaller value for free with a truncate.
	    SDValue Value = MemSetValue;
	    if (VT.bitsLT(LargestVT)) {
	      if (!LargestVT.isVector() && !VT.isVector() &&
	          TLI.isTruncateFree(LargestVT, VT))
	        Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
	      else
	        Value = getMemsetValue(Src, VT, DAG, dl);
	    }
	    assert(Value.getValueType() == VT && "Value with wrong type.");
	    SDValue Store = DAG.getStore(
	        Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	        DstPtrInfo.getWithOffset(DstOff), Align,
	        isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
	    OutChains.push_back(Store);
	    DstOff += VTSize;
	    Size -= VTSize;
	  }

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}

	/// Abort with a fatal error when a memory intrinsic that is about to be
	/// lowered to a library call has a pointer operand in an address space that
	/// the target cannot treat as address space 0.
	///
	/// \param TLI Target lowering hooks, queried via isNoopAddrSpaceCast().
	/// \param AS  Address space of the pointer operand being checked.
	static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
	                                            unsigned AS) {
	  // Lowering memcpy / memset / memmove intrinsics to calls is only valid if
	  // all pointer operands can be losslessly bitcasted to pointers of address
	  // space 0. Address space 0 itself trivially qualifies.
	  if (AS == 0)
	    return;

	  // A non-zero address space is accepted only when the target reports the
	  // cast to address space 0 as a no-op.
	  if (TLI->isNoopAddrSpaceCast(AS, 0))
	    return;

	  report_fatal_error("cannot lower memory intrinsic in address space " +
	                     Twine(AS));
	}

	/// Build the DAG nodes for a memcpy of \p Size bytes from \p Src to \p Dst.
	///
	/// Three lowering strategies are tried in order: an inline load/store
	/// sequence (constant sizes only), target-specific code obtained from
	/// \c TSI, and finally a call to the memcpy library routine. When
	/// \p AlwaysInline is set and the first two strategies decline, the
	/// load/store expansion is requested again with its "always inline"
	/// argument set to true.
	///
	/// \returns the output chain of whichever lowering was chosen; the incoming
	/// \p Chain is returned unchanged for a constant size of zero.
	SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
	                                SDValue Src, SDValue Size, unsigned Align,
	                                bool isVol, bool AlwaysInline, bool isTailCall,
	                                MachinePointerInfo DstPtrInfo,
	                                MachinePointerInfo SrcPtrInfo) {
	  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

	  // Check to see if we should lower the memcpy to loads and stores first.
	  // For cases within the target-specified limits, this is the best choice.
	  auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);
	  if (ConstantSize) {
	    // Memcpy with size zero? Just return the original chain.
	    if (ConstantSize->isNullValue())
	      return Chain;

	    SDValue Inlined = getMemcpyLoadsAndStores(
	        *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align,
	        isVol, false, DstPtrInfo, SrcPtrInfo);
	    if (Inlined.getNode())
	      return Inlined;
	  }

	  // Then check to see if we should lower the memcpy with target-specific
	  // code. If the target chooses to do this, this is the next best.
	  if (TSI) {
	    SDValue TargetCode = TSI->EmitTargetCodeForMemcpy(
	        *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
	        DstPtrInfo, SrcPtrInfo);
	    if (TargetCode.getNode())
	      return TargetCode;
	  }

	  // If we really need inline code and the target declined to provide it,
	  // use a (potentially long) sequence of loads and stores.
	  if (AlwaysInline) {
	    assert(ConstantSize && "AlwaysInline requires a constant size!");
	    return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
	                                   ConstantSize->getZExtValue(), Align, isVol,
	                                   true, DstPtrInfo, SrcPtrInfo);
	  }

	  // Both pointers are about to be handed to a libcall, so both address
	  // spaces must be usable as address space 0.
	  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
	  checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());

	  // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
	  // memcpy is not guaranteed to be safe. libc memcpys aren't required to
	  // respect volatile, so they may do things like read or write memory
	  // beyond the given memory regions. But fixing this isn't easy, and most
	  // people don't care.

	  // Emit a library call. All three arguments (dst, src, size) are passed
	  // with the pointer-sized integer type.
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  Entry.Ty = getDataLayout().getIntPtrType(*getContext());
	  for (SDValue Operand : {Dst, Src, Size}) {
	    Entry.Node = Operand;
	    Args.push_back(Entry);
	  }

	  // FIXME: pass in SDLoc
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl)
	      .setChain(Chain)
	      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
	                    Dst.getValueType().getTypeForEVT(*getContext()),
	                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
	                                      TLI->getPointerTy(getDataLayout())),
	                    std::move(Args))
	      .setDiscardResult()
	      .setTailCall(isTailCall);

	  // Only the second element of the lowering result is handed back.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Build the DAG nodes for a memmove of \p Size bytes from \p Src to \p Dst.
	///
	/// Lowering is attempted as an inline load/store sequence (constant sizes
	/// only), then as target-specific code from \c TSI, and finally as a call
	/// to the memmove library routine.
	///
	/// \returns the output chain of whichever lowering was chosen; the incoming
	/// \p Chain is returned unchanged for a constant size of zero.
	SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
	                                 SDValue Src, SDValue Size, unsigned Align,
	                                 bool isVol, bool isTailCall,
	                                 MachinePointerInfo DstPtrInfo,
	                                 MachinePointerInfo SrcPtrInfo) {
	  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

	  // Check to see if we should lower the memmove to loads and stores first.
	  // For cases within the target-specified limits, this is the best choice.
	  if (auto *ConstantSize = dyn_cast<ConstantSDNode>(Size)) {
	    // Memmove with size zero? Just return the original chain.
	    if (ConstantSize->isNullValue())
	      return Chain;

	    SDValue Inlined = getMemmoveLoadsAndStores(
	        *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align,
	        isVol, false, DstPtrInfo, SrcPtrInfo);
	    if (Inlined.getNode())
	      return Inlined;
	  }

	  // Then check to see if we should lower the memmove with target-specific
	  // code. If the target chooses to do this, this is the next best.
	  if (TSI) {
	    SDValue TargetCode = TSI->EmitTargetCodeForMemmove(
	        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo,
	        SrcPtrInfo);
	    if (TargetCode.getNode())
	      return TargetCode;
	  }

	  // Both pointers are about to be handed to a libcall, so both address
	  // spaces must be usable as address space 0.
	  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
	  checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());

	  // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
	  // not be safe. See memcpy above for more details.

	  // Emit a library call. All three arguments (dst, src, size) are passed
	  // with the pointer-sized integer type.
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  Entry.Ty = getDataLayout().getIntPtrType(*getContext());
	  for (SDValue Operand : {Dst, Src, Size}) {
	    Entry.Node = Operand;
	    Args.push_back(Entry);
	  }

	  // FIXME: pass in SDLoc
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl)
	      .setChain(Chain)
	      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
	                    Dst.getValueType().getTypeForEVT(*getContext()),
	                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
	                                      TLI->getPointerTy(getDataLayout())),
	                    std::move(Args))
	      .setDiscardResult()
	      .setTailCall(isTailCall);

	  // Only the second element of the lowering result is handed back.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Build the DAG nodes for a memset that fills \p Size bytes at \p Dst with
	/// the value \p Src.
	///
	/// Lowering is attempted as an inline store sequence (constant sizes only),
	/// then as target-specific code from \c TSI, and finally as a call to the
	/// memset library routine.
	///
	/// \returns the output chain of whichever lowering was chosen; the incoming
	/// \p Chain is returned unchanged for a constant size of zero.
	SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
	                                SDValue Src, SDValue Size, unsigned Align,
	                                bool isVol, bool isTailCall,
	                                MachinePointerInfo DstPtrInfo) {
	  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

	  // Check to see if we should lower the memset to stores first.
	  // For cases within the target-specified limits, this is the best choice.
	  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
	  if (ConstantSize) {
	    // Memset with size zero? Just return the original chain.
	    if (ConstantSize->isNullValue())
	      return Chain;

	    SDValue Result =
	        getMemsetStores(*this, dl, Chain, Dst, Src,
	                        ConstantSize->getZExtValue(), Align, isVol,
	                        DstPtrInfo);

	    if (Result.getNode())
	      return Result;
	  }

	  // Then check to see if we should lower the memset with target-specific
	  // code. If the target chooses to do this, this is the next best.
	  if (TSI) {
	    SDValue Result = TSI->EmitTargetCodeForMemset(
	        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
	    if (Result.getNode())
	      return Result;
	  }

	  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());

	  // Emit a library call. The destination and the size use the pointer-sized
	  // integer type; the fill value keeps the IR type of its own value type.
	  // The context accessor yields a pointer, hence the dereference, matching
	  // the other getContext() uses in this function.
	  Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  Entry.Node = Dst;
	  Entry.Ty = IntPtrTy;
	  Args.push_back(Entry);
	  Entry.Node = Src;
	  Entry.Ty = Src.getValueType().getTypeForEVT(*getContext());
	  Args.push_back(Entry);
	  Entry.Node = Size;
	  Entry.Ty = IntPtrTy;
	  Args.push_back(Entry);

	  // FIXME: pass in SDLoc
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl)
	      .setChain(Chain)
	      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
	                    Dst.getValueType().getTypeForEVT(*getContext()),
	                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
	                                      TLI->getPointerTy(getDataLayout())),
	                    std::move(Args))
	      .setDiscardResult()
	      .setTailCall(isTailCall);

	  // Only the second element of the lowering result is handed back.
	  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
	  return CallResult.second;
	}

	/// Return an AtomicSDNode for \p Opcode with result types \p VTList and
	/// operands \p Ops, reusing an existing node found through the CSE map.
	///
	/// When an existing node is found, refineAlignment(MMO) is called on it
	/// before it is returned. Result number 0 of the node is returned in both
	/// cases.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                SDVTList VTList, ArrayRef<SDValue> Ops,
	                                MachineMemOperand *MMO) {
	  // CSE profile: memory type, then the generic node identity, then the
	  // address space of the memory operand.
	  FoldingSetNodeID Profile;
	  Profile.AddInteger(MemVT.getRawBits());
	  AddNodeIDNode(Profile, Opcode, VTList, Ops);
	  Profile.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing) {
	    cast<AtomicSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  // No match: create the node and register it.
	  auto *Node = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(),
	                                       dl.getDebugLoc(), VTList, MemVT, MMO);
	  createOperands(Node, Ops);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  return SDValue(Node, 0);
	}

	/// Create an atomic compare-and-swap node, building its MachineMemOperand
	/// from \p PtrInfo, \p Alignment, the two orderings and \p SSID.
	///
	/// \param Opcode Must be ISD::ATOMIC_CMP_SWAP or
	///               ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS.
	/// \param Alignment 0 is replaced by getEVTAlignment(MemVT).
	/// \returns the node produced by the MachineMemOperand-based overload.
	SDValue SelectionDAG::getAtomicCmpSwap(
	    unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain,
	    SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
	    unsigned Alignment, AtomicOrdering SuccessOrdering,
	    AtomicOrdering FailureOrdering, SyncScope::ID SSID) {
	  assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
	         Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
	  assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");

	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(MemVT);

	  MachineFunction &MF = getMachineFunction();

	  // A compare-and-swap is flagged as both a load and a store.
	  // FIXME: Volatile isn't really correct; we should keep track of atomic
	  // orderings in the memoperand.
	  auto Flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
	               MachineMemOperand::MOStore;
	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
	                              AAMDNodes(), nullptr, SSID, SuccessOrdering,
	                              FailureOrdering);

	  return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO);
	}

	/// Create an atomic compare-and-swap node that uses the already-built
	/// memory operand \p MMO.
	///
	/// \param Opcode Must be ISD::ATOMIC_CMP_SWAP or
	///               ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS.
	/// \param Cmp,Swp Compare and swap values; their value types must match.
	SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
	                                       EVT MemVT, SDVTList VTs, SDValue Chain,
	                                       SDValue Ptr, SDValue Cmp, SDValue Swp,
	                                       MachineMemOperand *MMO) {
	  assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
	         Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
	  assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");

	  // Operand order: chain, pointer, compare value, swap value.
	  SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
	  return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
	}

	/// Create an atomic node with a value operand, building its
	/// MachineMemOperand from \p PtrVal, \p Alignment, \p Ordering and \p SSID.
	///
	/// \param Alignment 0 is replaced by getEVTAlignment(MemVT).
	/// \returns the node produced by the MachineMemOperand-based overload.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                SDValue Chain, SDValue Ptr, SDValue Val,
	                                const Value *PtrVal, unsigned Alignment,
	                                AtomicOrdering Ordering,
	                                SyncScope::ID SSID) {
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(MemVT);

	  MachineFunction &MF = getMachineFunction();
	  // An atomic store does not load. An atomic load does not store.
	  // (An atomicrmw obviously both loads and stores.)
	  // For now, atomics are considered to be volatile always, and they are
	  // chained as such.
	  // FIXME: Volatile isn't really correct; we should keep track of atomic
	  // orderings in the memoperand.
	  auto Flags = MachineMemOperand::MOVolatile;
	  if (Opcode != ISD::ATOMIC_STORE)
	    Flags |= MachineMemOperand::MOLoad;
	  if (Opcode != ISD::ATOMIC_LOAD)
	    Flags |= MachineMemOperand::MOStore;

	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
	                              MemVT.getStoreSize(), Alignment, AAMDNodes(),
	                              nullptr, SSID, Ordering);

	  return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO);
	}

	/// Create an atomic read-modify-write, swap or store node with operands
	/// (chain, pointer, value) and the memory operand \p MMO.
	///
	/// \param Opcode One of the ATOMIC_LOAD_* read-modify-write opcodes listed
	///               in the assertion below, ISD::ATOMIC_SWAP or
	///               ISD::ATOMIC_STORE.
	/// \returns a node whose result list is just MVT::Other for ATOMIC_STORE,
	///          and (type of \p Val, MVT::Other) for every other opcode.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                SDValue Chain, SDValue Ptr, SDValue Val,
	                                MachineMemOperand *MMO) {
	  assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
	          Opcode == ISD::ATOMIC_LOAD_SUB ||
	          Opcode == ISD::ATOMIC_LOAD_AND ||
	          Opcode == ISD::ATOMIC_LOAD_OR ||
	          Opcode == ISD::ATOMIC_LOAD_XOR ||
	          Opcode == ISD::ATOMIC_LOAD_NAND ||
	          Opcode == ISD::ATOMIC_LOAD_MIN ||
	          Opcode == ISD::ATOMIC_LOAD_MAX ||
	          Opcode == ISD::ATOMIC_LOAD_UMIN ||
	          Opcode == ISD::ATOMIC_LOAD_UMAX ||
	          Opcode == ISD::ATOMIC_SWAP ||
	          Opcode == ISD::ATOMIC_STORE) &&
	         "Invalid Atomic Op");

	  EVT VT = Val.getValueType();

	  // A store only produces a chain; everything else also yields a value.
	  SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other)
	                                             : getVTList(VT, MVT::Other);
	  SDValue Ops[] = {Chain, Ptr, Val};
	  return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
	}

	/// Create an atomic load node with operands (chain, pointer) and result
	/// types (\p VT, MVT::Other).
	///
	/// \param Opcode Must be ISD::ATOMIC_LOAD.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                EVT VT, SDValue Chain, SDValue Ptr,
	                                MachineMemOperand *MMO) {
	  assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");

	  SDVTList ResultTys = getVTList(VT, MVT::Other);
	  SDValue LoadOps[] = {Chain, Ptr};
	  return getAtomic(Opcode, dl, MemVT, ResultTys, LoadOps, MMO);
	}

	/// getMergeValues - Create a MERGE_VALUES node from the given operands.
	///
	/// A single operand is returned as-is without creating a node. Otherwise the
	/// node has one result per operand, each with that operand's value type.
	SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
	  if (Ops.size() == 1)
	    return Ops[0];

	  // Collect the result types in operand order.
	  SmallVector<EVT, 4> ResultTys;
	  ResultTys.reserve(Ops.size());
	  for (const SDValue &Op : Ops)
	    ResultTys.push_back(Op.getValueType());

	  return getNode(ISD::MERGE_VALUES, dl, getVTList(ResultTys), Ops);
	}

	/// Create a memory-intrinsic node, building its MachineMemOperand from
	/// \p PtrInfo, \p Flags, \p Size and \p Align.
	///
	/// \param Align 0 is replaced by getEVTAlignment(MemVT).
	/// \param Size  0 is replaced by the store size of \p MemVT.
	/// \returns the node produced by the MachineMemOperand-based overload.
	SDValue SelectionDAG::getMemIntrinsicNode(
	    unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
	    EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align,
	    MachineMemOperand::Flags Flags, unsigned Size) {
	  // Ensure that codegen never sees alignment 0
	  if (Align == 0)
	    Align = getEVTAlignment(MemVT);

	  // An unspecified size defaults to the full store size of the memory type.
	  if (Size == 0)
	    Size = MemVT.getStoreSize();

	  MachineMemOperand *MemOperand =
	      getMachineFunction().getMachineMemOperand(PtrInfo, Flags, Size, Align);

	  return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MemOperand);
	}

	/// Create (or reuse) a MemIntrinsicSDNode for \p Opcode with result types
	/// \p VTList, operands \p Ops and memory operand \p MMO.
	///
	/// Nodes whose last result type is MVT::Glue are never looked up in or
	/// added to the CSE map; all others are memoized. When an existing node is
	/// found, refineAlignment(MMO) is called on it before it is returned.
	///
	/// \param Opcode Must be one of the opcodes accepted by the assertion below.
	/// \returns result number 0 of the node.
	SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
	                                          SDVTList VTList,
	                                          ArrayRef<SDValue> Ops, EVT MemVT,
	                                          MachineMemOperand *MMO) {
	  assert((Opcode == ISD::INTRINSIC_VOID ||
	          Opcode == ISD::INTRINSIC_W_CHAIN ||
	          Opcode == ISD::PREFETCH ||
	          Opcode == ISD::LIFETIME_START ||
	          Opcode == ISD::LIFETIME_END ||
	          ((int)Opcode <= std::numeric_limits<int>::max() &&
	           (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
	         "Opcode is not a memory-accessing opcode!");

	  // Memoize the node unless it returns a flag.
	  MemIntrinsicSDNode *N;
	  if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
	    // CSE profile: generic node identity, the synthetic subclass data and
	    // the address space of the memory operand.
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, Opcode, VTList, Ops);
	    ID.AddInteger(getSyntheticNodeSubclassData<MemIntrinsicSDNode>(
	        Opcode, dl.getIROrder(), VTList, MemVT, MMO));
	    ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	    void *IP = nullptr;
	    if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
	      cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
	      return SDValue(E, 0);
	    }

	    N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
	                                      VTList, MemVT, MMO);
	    createOperands(N, Ops);

	    CSEMap.InsertNode(N, IP);
	  } else {
	    N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
	                                      VTList, MemVT, MMO);
	    createOperands(N, Ops);
	  }
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
	/// MachinePointerInfo record from it. This is particularly useful because the
	/// code generator has many cases where it doesn't bother passing in a
	/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
	///
	/// \param Info   Returned unchanged when \p Ptr matches neither pattern.
	/// \param Ptr    Address to inspect: a frame index, or an ISD::ADD of a frame
	///               index (operand 0) and a constant (operand 1).
	/// \param Offset Extra byte offset added on top of what \p Ptr encodes.
	static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
	                                           SelectionDAG &DAG, SDValue Ptr,
	                                           int64_t Offset = 0) {
	  // If this is FI+Offset, we can model it.
	  if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
	    return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
	                                             FI->getIndex(), Offset);

	  // If this is (FI+Offset1)+Offset2, we can model it.
	  if (Ptr.getOpcode() != ISD::ADD ||
	      !isa<ConstantSDNode>(Ptr.getOperand(1)) ||
	      !isa<FrameIndexSDNode>(Ptr.getOperand(0)))
	    return Info;

	  // Fold the constant addend of the ADD into the requested offset.
	  int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	  return MachinePointerInfo::getFixedStack(
	      DAG.getMachineFunction(), FI,
	      Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
	}

	/// InferPointerInfo - Variant taking the offset as a DAG value. A constant
	/// offset is forwarded as its sign-extended integer value, an undef offset
	/// is forwarded as "no offset", and for any other offset \p Info is returned
	/// unchanged.
	static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
	                                           SelectionDAG &DAG, SDValue Ptr,
	                                           SDValue OffsetOp) {
	  // Constant offset: delegate with the concrete value.
	  if (auto *ConstOffset = dyn_cast<ConstantSDNode>(OffsetOp))
	    return InferPointerInfo(Info, DAG, Ptr, ConstOffset->getSExtValue());

	  // Undef offset: delegate relying on the default offset of the integer
	  // overload.
	  if (OffsetOp.isUndef())
	    return InferPointerInfo(Info, DAG, Ptr);

	  // If the 'Offset' value isn't a constant, we can't handle this.
	  return Info;
	}

	/// Create a load node, building its MachineMemOperand from \p PtrInfo,
	/// \p MMOFlags, \p Alignment, \p AAInfo and \p Ranges.
	///
	/// \param Alignment 0 is replaced by getEVTAlignment(MemVT).
	/// \param MMOFlags  Must not contain MOStore; MOLoad is added here.
	/// \param PtrInfo   When its value is null, InferPointerInfo() is used to
	///                  derive one from \p Ptr and \p Offset.
	/// \returns the node produced by the MachineMemOperand-based overload.
	SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
	                              EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, SDValue Offset,
	                              MachinePointerInfo PtrInfo, EVT MemVT,
	                              unsigned Alignment,
	                              MachineMemOperand::Flags MMOFlags,
	                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(MemVT);

	  MMOFlags |= MachineMemOperand::MOLoad;
	  assert((MMOFlags & MachineMemOperand::MOStore) == 0);
	  // If we don't have a PtrInfo, infer the trivial frame index case to simplify
	  // clients.
	  if (PtrInfo.V.isNull())
	    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);

	  MachineFunction &MF = getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
	  return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
	}

	/// Create (or reuse) a LoadSDNode with operands (chain, pointer, offset) and
	/// the memory operand \p MMO.
	///
	/// \param AM      Addressing mode; anything other than ISD::UNINDEXED makes
	///                the node produce the updated pointer as an extra result.
	/// \param ExtType Forced to ISD::NON_EXTLOAD when \p VT equals \p MemVT.
	/// \param Offset  Must be undef for unindexed loads.
	/// \returns result number 0 of the node. When an existing node is found in
	///          the CSE map, refineAlignment(MMO) is called on it first.
	SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
	                              EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, SDValue Offset, EVT MemVT,
	                              MachineMemOperand *MMO) {
	  if (VT == MemVT) {
	    ExtType = ISD::NON_EXTLOAD;
	  } else if (ExtType == ISD::NON_EXTLOAD) {
	    // VT != MemVT here, so this assertion reports the misuse.
	    assert(VT == MemVT && "Non-extending load from different memory type!");
	  } else {
	    // Extending load.
	    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
	           "Should only be an extending load, not truncating!");
	    assert(VT.isInteger() == MemVT.isInteger() &&
	           "Cannot convert from FP to Int or Int -> FP!");
	    assert(VT.isVector() == MemVT.isVector() &&
	           "Cannot use an ext load to convert to or from a vector!");
	    assert((!VT.isVector() ||
	            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
	           "Cannot use an ext load to change the number of vector elements!");
	  }

	  bool Indexed = AM != ISD::UNINDEXED;
	  assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");

	  // Indexed loads additionally return a value of the pointer's type.
	  SDVTList VTs = Indexed ?
	    getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
	  SDValue Ops[] = { Chain, Ptr, Offset };
	  // CSE profile: generic node identity, memory type, synthetic subclass data
	  // and the address space of the memory operand.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, ISD::LOAD, VTs, Ops);
	  ID.AddInteger(MemVT.getRawBits());
	  ID.AddInteger(getSyntheticNodeSubclassData<LoadSDNode>(
	      dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO));
	  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
	    cast<LoadSDNode>(E)->refineAlignment(MMO);
	    return SDValue(E, 0);
	  }
	  auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
	                                  ExtType, MemVT, MMO);
	  createOperands(N, Ops);

	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  SDValue V(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Create an unindexed, non-extending load of type \p VT from \p Ptr,
	/// described by \p PtrInfo. The offset operand is an undef of the pointer's
	/// value type and the memory type equals \p VT.
	SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, MachinePointerInfo PtrInfo,
	                              unsigned Alignment,
	                              MachineMemOperand::Flags MMOFlags,
	                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
	  const EVT PtrVT = Ptr.getValueType();
	  SDValue NoOffset = getUNDEF(PtrVT);
	  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
	                 NoOffset, PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
	}

	/// Create an unindexed, non-extending load of type \p VT from \p Ptr using
	/// the already-built memory operand \p MMO. The offset operand is an undef
	/// of the pointer's value type and the memory type equals \p VT.
	SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, MachineMemOperand *MMO) {
	  const EVT PtrVT = Ptr.getValueType();
	  SDValue NoOffset = getUNDEF(PtrVT);
	  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
	                 NoOffset, VT, MMO);
	}

	/// Create an unindexed load of memory type \p MemVT producing a value of
	/// type \p VT with extension kind \p ExtType, described by \p PtrInfo. The
	/// offset operand is an undef of the pointer's value type.
	SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
	                                 EVT VT, SDValue Chain, SDValue Ptr,
	                                 MachinePointerInfo PtrInfo, EVT MemVT,
	                                 unsigned Alignment,
	                                 MachineMemOperand::Flags MMOFlags,
	                                 const AAMDNodes &AAInfo) {
	  const EVT PtrVT = Ptr.getValueType();
	  SDValue NoOffset = getUNDEF(PtrVT);
	  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset,
	                 PtrInfo, MemVT, Alignment, MMOFlags, AAInfo);
	}

	/// Create an unindexed load of memory type \p MemVT producing a value of
	/// type \p VT with extension kind \p ExtType, using the already-built memory
	/// operand \p MMO. The offset operand is an undef of the pointer's value
	/// type.
	SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
	                                 EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
	                                 MachineMemOperand *MMO) {
	  const EVT PtrVT = Ptr.getValueType();
	  SDValue NoOffset = getUNDEF(PtrVT);
	  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset, MemVT,
	                 MMO);
	}

	/// Create an indexed version of the load \p OrigLoad using addressing mode
	/// \p AM with the given \p Base and \p Offset.
	///
	/// The extension type, chain, pointer info, memory type, alignment and AA
	/// info are taken from the original load. \p OrigLoad must be a LoadSDNode
	/// whose offset operand is undef.
	SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
	                                     SDValue Base, SDValue Offset,
	                                     ISD::MemIndexedMode AM) {
	  LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
	  assert(LD->getOffset().isUndef() && "Load is already a indexed load!");
	  // Don't propagate the invariant or dereferenceable flags.
	  auto MMOFlags =
	      LD->getMemOperand()->getFlags() &
	      ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
	  return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
	                 LD->getChain(), Base, Offset, LD->getPointerInfo(),
	                 LD->getMemoryVT(), LD->getAlignment(), MMOFlags,
	                 LD->getAAInfo());
	}

	/// Create a store of \p Val to \p Ptr, building its MachineMemOperand from
	/// \p PtrInfo, \p MMOFlags, \p Alignment and \p AAInfo. The memory size is
	/// the store size of \p Val's value type.
	///
	/// \param Alignment 0 is replaced by the EVT alignment of \p Val's type.
	/// \param MMOFlags  Must not contain MOLoad; MOStore is added here.
	/// \param PtrInfo   When its value is null, InferPointerInfo() is used to
	///                  derive one from \p Ptr.
	/// \returns the node produced by the MachineMemOperand-based overload.
	SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                               SDValue Ptr, MachinePointerInfo PtrInfo,
	                               unsigned Alignment,
	                               MachineMemOperand::Flags MMOFlags,
	                               const AAMDNodes &AAInfo) {
	  assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(Val.getValueType());

	  MMOFlags |= MachineMemOperand::MOStore;
	  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);

	  if (PtrInfo.V.isNull())
	    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);

	  MachineFunction &MF = getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo);
	  return getStore(Chain, dl, Val, Ptr, MMO);
	}

	/// Create (or reuse) an unindexed, non-truncating StoreSDNode that stores
	/// \p Val to \p Ptr using the memory operand \p MMO.
	///
	/// The node's operands are (chain, value, pointer, undef offset) and its
	/// only result is the chain. When an existing node is found in the CSE map,
	/// refineAlignment(MMO) is called on it before it is returned.
	SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                               SDValue Ptr, MachineMemOperand *MMO) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");

	  const EVT StoredVT = Val.getValueType();
	  SDVTList ResultTys = getVTList(MVT::Other);
	  SDValue NoOffset = getUNDEF(Ptr.getValueType());
	  SDValue StoreOps[] = {Chain, Val, Ptr, NoOffset};

	  // CSE profile: generic node identity, stored type, synthetic subclass data
	  // and the address space of the memory operand.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::STORE, ResultTys, StoreOps);
	  Profile.AddInteger(StoredVT.getRawBits());
	  Profile.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
	      dl.getIROrder(), ResultTys, ISD::UNINDEXED, false, StoredVT, MMO));
	  Profile.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing) {
	    cast<StoreSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node =
	      newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), ResultTys,
	                             ISD::UNINDEXED, false, StoredVT, MMO);
	  createOperands(Node, StoreOps);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create a truncating store of \p Val to \p Ptr with memory type \p SVT,
	/// building its MachineMemOperand from \p PtrInfo, \p MMOFlags,
	/// \p Alignment and \p AAInfo. The memory size is the store size of \p SVT.
	///
	/// \param Alignment 0 is replaced by getEVTAlignment(SVT).
	/// \param MMOFlags  Must not contain MOLoad; MOStore is added here.
	/// \param PtrInfo   When its value is null, InferPointerInfo() is used to
	///                  derive one from \p Ptr.
	/// \returns the node produced by the MachineMemOperand-based overload.
	SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                                    SDValue Ptr, MachinePointerInfo PtrInfo,
	                                    EVT SVT, unsigned Alignment,
	                                    MachineMemOperand::Flags MMOFlags,
	                                    const AAMDNodes &AAInfo) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(SVT);

	  MMOFlags |= MachineMemOperand::MOStore;
	  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);

	  if (PtrInfo.V.isNull())
	    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);

	  MachineFunction &MF = getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
	  return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
	}

	/// Create (or reuse) an unindexed truncating StoreSDNode that stores \p Val
	/// to \p Ptr as memory type \p SVT using the memory operand \p MMO.
	///
	/// When \p SVT equals the type of \p Val this degenerates to a plain
	/// getStore(). Otherwise \p SVT must be a narrower type of the same
	/// integer/FP and scalar/vector kind with the same element count. When an
	/// existing node is found in the CSE map, refineAlignment(MMO) is called on
	/// it before it is returned.
	SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                                    SDValue Ptr, EVT SVT,
	                                    MachineMemOperand *MMO) {
	  EVT VT = Val.getValueType();

	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  // Nothing to truncate: emit an ordinary store.
	  if (VT == SVT)
	    return getStore(Chain, dl, Val, Ptr, MMO);

	  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
	         "Should only be a truncating store, not extending!");
	  assert(VT.isInteger() == SVT.isInteger() &&
	         "Can't do FP-INT conversion!");
	  assert(VT.isVector() == SVT.isVector() &&
	         "Cannot use trunc store to convert to or from a vector!");
	  assert((!VT.isVector() ||
	          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
	         "Cannot use trunc store to change the number of vector elements!");

	  SDVTList VTs = getVTList(MVT::Other);
	  SDValue Undef = getUNDEF(Ptr.getValueType());
	  SDValue Ops[] = { Chain, Val, Ptr, Undef };
	  // CSE profile: generic node identity, memory type, synthetic subclass data
	  // and the address space of the memory operand.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
	  ID.AddInteger(SVT.getRawBits());
	  ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
	      dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO));
	  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
	    cast<StoreSDNode>(E)->refineAlignment(MMO);
	    return SDValue(E, 0);
	  }
	  auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
	                                   ISD::UNINDEXED, true, SVT, MMO);
	  createOperands(N, Ops);

	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  SDValue V(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Create (or reuse) an indexed version of the store \p OrigStore using
	/// addressing mode \p AM with the given \p Base and \p Offset.
	///
	/// The chain, stored value, truncation flag, memory type and memory operand
	/// are taken from the original store, which must be a StoreSDNode whose
	/// offset operand is undef. The node's results are (type of \p Base,
	/// MVT::Other).
	SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
	                                      SDValue Base, SDValue Offset,
	                                      ISD::MemIndexedMode AM) {
	  StoreSDNode *Orig = cast<StoreSDNode>(OrigStore);
	  assert(Orig->getOffset().isUndef() && "Store is already a indexed store!");

	  SDVTList ResultTys = getVTList(Base.getValueType(), MVT::Other);
	  SDValue StoreOps[] = {Orig->getChain(), Orig->getValue(), Base, Offset};

	  // CSE profile: generic node identity plus the original store's memory
	  // type, raw subclass data and address space.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::STORE, ResultTys, StoreOps);
	  Profile.AddInteger(Orig->getMemoryVT().getRawBits());
	  Profile.AddInteger(Orig->getRawSubclassData());
	  Profile.AddInteger(Orig->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *Node = newSDNode<StoreSDNode>(
	      dl.getIROrder(), dl.getDebugLoc(), ResultTys, AM,
	      Orig->isTruncatingStore(), Orig->getMemoryVT(), Orig->getMemOperand());
	  createOperands(Node, StoreOps);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse) a MaskedLoadSDNode with operands (chain, pointer, mask,
	/// pass-through \p Src0) and results (\p VT, MVT::Other).
	///
	/// When an existing node is found in the CSE map, refineAlignment(MMO) is
	/// called on it before it is returned.
	SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
	                                    SDValue Ptr, SDValue Mask, SDValue Src0,
	                                    EVT MemVT, MachineMemOperand *MMO,
	                                    ISD::LoadExtType ExtTy, bool isExpanding) {
	  SDVTList ResultTys = getVTList(VT, MVT::Other);
	  SDValue LoadOps[] = {Chain, Ptr, Mask, Src0};

	  // CSE profile: generic node identity, result type, synthetic subclass data
	  // and the address space of the memory operand.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::MLOAD, ResultTys, LoadOps);
	  Profile.AddInteger(VT.getRawBits());
	  Profile.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
	      dl.getIROrder(), ResultTys, ExtTy, isExpanding, MemVT, MMO));
	  Profile.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing) {
	    cast<MaskedLoadSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node =
	      newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(),
	                                  ResultTys, ExtTy, isExpanding, MemVT, MMO);
	  createOperands(Node, LoadOps);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse) a MaskedStoreSDNode with operands (chain, pointer,
	/// mask, value) whose only result is the chain.
	///
	/// When an existing node is found in the CSE map, refineAlignment(MMO) is
	/// called on it before it is returned.
	SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
	                                     SDValue Val, SDValue Ptr, SDValue Mask,
	                                     EVT MemVT, MachineMemOperand *MMO,
	                                     bool IsTruncating, bool IsCompressing) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");

	  const EVT StoredVT = Val.getValueType();
	  SDVTList ResultTys = getVTList(MVT::Other);
	  SDValue StoreOps[] = {Chain, Ptr, Mask, Val};

	  // CSE profile: generic node identity, stored value type, synthetic
	  // subclass data and the address space of the memory operand.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::MSTORE, ResultTys, StoreOps);
	  Profile.AddInteger(StoredVT.getRawBits());
	  Profile.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
	      dl.getIROrder(), ResultTys, IsTruncating, IsCompressing, MemVT, MMO));
	  Profile.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing) {
	    cast<MaskedStoreSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node = newSDNode<MaskedStoreSDNode>(
	      dl.getIROrder(), dl.getDebugLoc(), ResultTys, IsTruncating,
	      IsCompressing, MemVT, MMO);
	  createOperands(Node, StoreOps);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse) a MaskedGatherSDNode with result types \p VTs, memory
	/// type \p VT and exactly five operands \p Ops.
	///
	/// For a newly created node the pass-through value must have the node's
	/// first result type, and the mask and index vectors must have as many
	/// elements as that result type. When an existing node is found in the CSE
	/// map, refineAlignment(MMO) is called on it before it is returned.
	SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
	                                      ArrayRef<SDValue> Ops,
	                                      MachineMemOperand *MMO) {
	  assert(Ops.size() == 5 && "Incompatible number of operands");

	  // CSE profile: generic node identity, memory type, synthetic subclass data
	  // and the address space of the memory operand.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::MGATHER, VTs, Ops);
	  Profile.AddInteger(VT.getRawBits());
	  Profile.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
	      dl.getIROrder(), VTs, VT, MMO));
	  Profile.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing) {
	    cast<MaskedGatherSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node = newSDNode<MaskedGatherSDNode>(dl.getIROrder(),
	                                             dl.getDebugLoc(), VTs, VT, MMO);
	  createOperands(Node, Ops);

	  // Sanity-check the operands now that they are attached to the node.
	  assert(Node->getValue().getValueType() == Node->getValueType(0) &&
	         "Incompatible type of the PassThru value in MaskedGatherSDNode");
	  assert(Node->getMask().getValueType().getVectorNumElements() ==
	             Node->getValueType(0).getVectorNumElements() &&
	         "Vector width mismatch between mask and data");
	  assert(Node->getIndex().getValueType().getVectorNumElements() ==
	             Node->getValueType(0).getVectorNumElements() &&
	         "Vector width mismatch between index and data");

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse) a MaskedScatterSDNode with result types \p VTs, memory
	/// type \p VT and exactly five operands \p Ops.
	///
	/// For a newly created node the mask and index vectors must have as many
	/// elements as the stored value's type. When an existing node is found in
	/// the CSE map, refineAlignment(MMO) is called on it before it is returned.
	SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
	                                       ArrayRef<SDValue> Ops,
	                                       MachineMemOperand *MMO) {
	  assert(Ops.size() == 5 && "Incompatible number of operands");

	  // CSE profile: generic node identity, memory type, synthetic subclass data
	  // and the address space of the memory operand.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::MSCATTER, VTs, Ops);
	  Profile.AddInteger(VT.getRawBits());
	  Profile.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
	      dl.getIROrder(), VTs, VT, MMO));
	  Profile.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing) {
	    cast<MaskedScatterSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node = newSDNode<MaskedScatterSDNode>(dl.getIROrder(),
	                                              dl.getDebugLoc(), VTs, VT, MMO);
	  createOperands(Node, Ops);

	  // Sanity-check the operands now that they are attached to the node.
	  assert(Node->getMask().getValueType().getVectorNumElements() ==
	             Node->getValue().getValueType().getVectorNumElements() &&
	         "Vector width mismatch between mask and data");
	  assert(Node->getIndex().getValueType().getVectorNumElements() ==
	             Node->getValue().getValueType().getVectorNumElements() &&
	         "Vector width mismatch between index and data");

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create an ISD::VAARG node with results (\p VT, MVT::Other). Its operands
	/// are the chain, \p Ptr, \p SV and \p Align encoded as an i32 target
	/// constant.
	SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
	                               SDValue Ptr, SDValue SV, unsigned Align) {
	  SDValue AlignOp = getTargetConstant(Align, dl, MVT::i32);
	  SDValue VAArgOps[] = {Chain, Ptr, SV, AlignOp};
	  SDVTList ResultTys = getVTList(VT, MVT::Other);
	  return getNode(ISD::VAARG, dl, ResultTys, VAArgOps);
	}

	/// Create a node from an array of SDUse operands. Up to three operands are
	/// dispatched to the fixed-arity getNode() overloads; longer lists are
	/// copied into SDValues and handed to the SDValue-array overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              ArrayRef<SDUse> Ops) {
	  const auto NumOps = Ops.size();
	  if (NumOps == 0)
	    return getNode(Opcode, DL, VT);
	  if (NumOps == 1)
	    return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
	  if (NumOps == 2)
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
	  if (NumOps == 3)
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);

	  // Copy from an SDUse array into an SDValue array for use with
	  // the regular getNode logic.
	  SmallVector<SDValue, 8> NewOps(Ops.begin(), Ops.end());
	  return getNode(Opcode, DL, VT, NewOps);
	}

	/// Create (or reuse) a single-result node of type \p VT for \p Opcode with
	/// an arbitrary number of operands.
	///
	/// Operand lists of length 0-3 are dispatched to the fixed-arity overloads
	/// (\p Flags is forwarded only for one and two operands). Longer lists get
	/// opcode-specific folding/checking and are then memoized in the CSE map,
	/// except for nodes of type MVT::Glue, which are always created fresh.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
	  const unsigned NumOps = Ops.size();
	  switch (NumOps) {
	  case 0:
	    return getNode(Opcode, DL, VT);
	  case 1:
	    return getNode(Opcode, DL, VT, Ops[0], Flags);
	  case 2:
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
	  case 3:
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
	  default:
	    break;
	  }

	  switch (Opcode) {
	  case ISD::CONCAT_VECTORS:
	    // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
	    if (SDValue Folded = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
	      return Folded;
	    break;
	  case ISD::SELECT_CC:
	    assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
	    assert(Ops[0].getValueType() == Ops[1].getValueType() &&
	           "LHS and RHS of condition must have same type!");
	    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
	           "True and False arms of SelectCC must have same type!");
	    assert(Ops[2].getValueType() == VT &&
	           "select_cc node must be of same type as true and false value!");
	    break;
	  case ISD::BR_CC:
	    assert(NumOps == 5 && "BR_CC takes 5 operands!");
	    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
	           "LHS/RHS of comparison should match types!");
	    break;
	  default:
	    break;
	  }

	  // Memoize nodes.
	  SDVTList VTs = getVTList(VT);
	  const bool Memoize = VT != MVT::Glue;
	  void *InsertPos = nullptr;

	  if (Memoize) {
	    FoldingSetNodeID Profile;
	    AddNodeIDNode(Profile, Opcode, VTs, Ops);

	    if (SDNode *Existing = FindNodeOrInsertPos(Profile, DL, InsertPos))
	      return SDValue(Existing, 0);
	  }

	  // Node creation is the same either way; only memoized nodes are recorded
	  // in the CSE map.
	  SDNode *Node =
	      newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	  createOperands(Node, Ops);
	  if (Memoize)
	    CSEMap.InsertNode(Node, InsertPos);

	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// getNode overload taking the result types as an array of EVTs; the
	/// array is converted to an SDVTList and forwarded.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
	                              ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(ResultTys);
	  return getNode(Opcode, DL, ResultVTs, Ops);
	}

	/// Create (or reuse) a node with the result types in VTList.
	///
	/// A single-entry VTList is forwarded to the single-result overload.
	/// Otherwise the node is memoized through the CSE map unless its last
	/// result type is MVT::Glue.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              ArrayRef<SDValue> Ops) {
	  if (VTList.NumVTs == 1)
	    return getNode(Opcode, DL, VTList.VTs[0], Ops);

	#if 0
	  switch (Opcode) {
	  // FIXME: figure out how to safely handle things like
	  // int foo(int x) { return 1 << (x & 255); }
	  // int bar() { return foo(256); }
	  case ISD::SRA_PARTS:
	  case ISD::SRL_PARTS:
	  case ISD::SHL_PARTS:
	    if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
	        cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
	      return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
	    else if (N3.getOpcode() == ISD::AND)
	      if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
	        // If the and is only masking out bits that cannot effect the shift,
	        // eliminate the and.
	        unsigned NumBits = VT.getScalarSizeInBits()*2;
	        if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
	          return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
	      }
	    break;
	  }
	#endif

	  // Memoize the node unless it returns a flag.
	  const bool Memoize = VTList.VTs[VTList.NumVTs-1] != MVT::Glue;
	  void *InsertPos = nullptr;
	  if (Memoize) {
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, Opcode, VTList, Ops);
	    if (SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos))
	      return SDValue(Existing, 0);
	  }

	  SDNode *Created =
	      newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
	  createOperands(Created, Ops);
	  if (Memoize)
	    CSEMap.InsertNode(Created, InsertPos);

	  InsertNode(Created);
	  SDValue Result(Created, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// getNode overload for a multi-result node with no operands.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
	                              SDVTList VTList) {
	  ArrayRef<SDValue> NoOperands = None;
	  return getNode(Opcode, DL, VTList, NoOperands);
	}

	/// getNode overload for a multi-result node with one operand.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1) {
	  SDValue Operands[] = { N1 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with two operands.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2) {
	  SDValue Operands[] = { N1, N2 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with three operands.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2, SDValue N3) {
	  SDValue Operands[] = { N1, N2, N3 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with four operands.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
	  SDValue Operands[] = { N1, N2, N3, N4 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with five operands.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4,
	                              SDValue N5) {
	  SDValue Operands[] = { N1, N2, N3, N4, N5 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// Return the SDVTList holding the single value type VT, backed by the
	/// storage returned by SDNode::getValueTypeList.
	SDVTList SelectionDAG::getVTList(EVT VT) {
	  const EVT *SingleVT = SDNode::getValueTypeList(VT);
	  return makeVTList(SingleVT, 1);
	}

	/// Return the uniqued SDVTList for the pair (VT1, VT2).
	///
	/// Lists are keyed in VTListMap by their length followed by the raw bits
	/// of each type; a new entry is allocated from Allocator on a miss.
	SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
	  FoldingSetNodeID ID;
	  ID.AddInteger(2U);
	  ID.AddInteger(VT1.getRawBits());
	  ID.AddInteger(VT2.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(ID, InsertPos))
	    return Cached->getSDVTList();

	  EVT *Types = Allocator.Allocate<EVT>(2);
	  Types[0] = VT1;
	  Types[1] = VT2;
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(ID.Intern(Allocator), Types, 2);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}

	/// Return the uniqued SDVTList for the triple (VT1, VT2, VT3).
	///
	/// Lists are keyed in VTListMap by their length followed by the raw bits
	/// of each type; a new entry is allocated from Allocator on a miss.
	SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
	  FoldingSetNodeID ID;
	  ID.AddInteger(3U);
	  ID.AddInteger(VT1.getRawBits());
	  ID.AddInteger(VT2.getRawBits());
	  ID.AddInteger(VT3.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(ID, InsertPos))
	    return Cached->getSDVTList();

	  EVT *Types = Allocator.Allocate<EVT>(3);
	  Types[0] = VT1;
	  Types[1] = VT2;
	  Types[2] = VT3;
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(ID.Intern(Allocator), Types, 3);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}

	/// Return the uniqued SDVTList for the four types (VT1, VT2, VT3, VT4).
	///
	/// Lists are keyed in VTListMap by their length followed by the raw bits
	/// of each type; a new entry is allocated from Allocator on a miss.
	SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
	  FoldingSetNodeID ID;
	  ID.AddInteger(4U);
	  ID.AddInteger(VT1.getRawBits());
	  ID.AddInteger(VT2.getRawBits());
	  ID.AddInteger(VT3.getRawBits());
	  ID.AddInteger(VT4.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(ID, InsertPos))
	    return Cached->getSDVTList();

	  EVT *Types = Allocator.Allocate<EVT>(4);
	  Types[0] = VT1;
	  Types[1] = VT2;
	  Types[2] = VT3;
	  Types[3] = VT4;
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(ID.Intern(Allocator), Types, 4);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}

	/// Return the uniqued SDVTList for an arbitrary sequence of value types.
	///
	/// The lookup key is the element count followed by the raw bits of each
	/// type; on a miss the types are copied into Allocator-owned storage.
	SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
	  const unsigned NumVTs = VTs.size();
	  FoldingSetNodeID ID;
	  ID.AddInteger(NumVTs);
	  for (const EVT &VT : VTs)
	    ID.AddInteger(VT.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(ID, InsertPos))
	    return Cached->getSDVTList();

	  EVT *Types = Allocator.Allocate<EVT>(NumVTs);
	  std::copy(VTs.begin(), VTs.end(), Types);
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(ID.Intern(Allocator), Types, NumVTs);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}


	/// UpdateNodeOperands - Mutate the specified node in-place to have the
	/// specified operands. If the resultant node already exists in the DAG,
	/// this does not modify the specified node, instead it returns the node that
	/// already exists. If the resultant node does not exist in the DAG, the
	/// input node is returned. As a degenerate case, if you specify the same
	/// input operands as the node already has, the input node is returned.
	///
	/// This overload handles nodes with exactly one operand.
	SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
	  assert(N->getNumOperands() == 1 && "Update with wrong number of operands");

	  // Check to see if there is no change.
	  if (Op == N->getOperand(0)) return N;

	  // See if the modified node already exists.
	  void *InsertPos = nullptr;
	  if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
	    return Existing;

	  // Nope it doesn't. Remove the node from its current place in the maps.
	  // If the node was not in the maps, clear InsertPos so it is not
	  // re-inserted below.
	  if (InsertPos)
	    if (!RemoveNodeFromCSEMaps(N))
	      InsertPos = nullptr;

	  // Now we update the operands.
	  N->OperandList[0].set(Op);

	  // If this gets put into a CSE map, add it.
	  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
	  return N;
	}

	/// Two-operand form of UpdateNodeOperands: mutate N in place to use Op1 and
	/// Op2, or return an already-existing node that has those operands. Returns
	/// N unchanged when both operands already match.
	SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
	  assert(N->getNumOperands() == 2 && "Update with wrong number of operands");

	  // Check to see if there is no change.
	  if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
	    return N; // No operands changed, just return the input node.

	  // See if the modified node already exists.
	  void *InsertPos = nullptr;
	  if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
	    return Existing;

	  // Nope it doesn't. Remove the node from its current place in the maps.
	  // If the node was not in the maps, clear InsertPos so it is not
	  // re-inserted below.
	  if (InsertPos)
	    if (!RemoveNodeFromCSEMaps(N))
	      InsertPos = nullptr;

	  // Now we update the operands, touching only the ones that differ.
	  if (N->OperandList[0] != Op1)
	    N->OperandList[0].set(Op1);
	  if (N->OperandList[1] != Op2)
	    N->OperandList[1].set(Op2);

	  // If this gets put into a CSE map, add it.
	  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
	  return N;
	}

	/// Three-operand convenience form; packs the operands into an array and
	/// defers to the ArrayRef overload.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
	  SDValue NewOperands[] = { Op1, Op2, Op3 };
	  return UpdateNodeOperands(N, NewOperands);
	}

	/// Four-operand convenience form; packs the operands into an array and
	/// defers to the ArrayRef overload.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                   SDValue Op3, SDValue Op4) {
	  SDValue NewOperands[] = { Op1, Op2, Op3, Op4 };
	  return UpdateNodeOperands(N, NewOperands);
	}

	/// Five-operand convenience form; packs the operands into an array and
	/// defers to the ArrayRef overload.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                   SDValue Op3, SDValue Op4, SDValue Op5) {
	  SDValue NewOperands[] = { Op1, Op2, Op3, Op4, Op5 };
	  return UpdateNodeOperands(N, NewOperands);
	}

	/// General form of UpdateNodeOperands: mutate N in place so that its
	/// operands equal Ops, or return an already-existing node with those
	/// operands. Ops must have exactly as many entries as N has operands.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
	  const unsigned NumOps = Ops.size();
	  assert(N->getNumOperands() == NumOps &&
	         "Update with wrong number of operands");

	  // Nothing to do when every operand already matches.
	  const bool Unchanged = std::equal(Ops.begin(), Ops.end(), N->op_begin());
	  if (Unchanged)
	    return N;

	  // Prefer an existing node that already looks like the result.
	  void *InsertPos = nullptr;
	  if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos))
	    return Existing;

	  // Take N out of the CSE maps before mutating it; if it was not there,
	  // forget the insertion position so it is not added afterwards.
	  if (InsertPos && !RemoveNodeFromCSEMaps(N))
	    InsertPos = nullptr;

	  // Rewrite only the operands that actually differ.
	  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
	    SDUse &Slot = N->OperandList[Idx];
	    if (Slot != Ops[Idx])
	      Slot.set(Ops[Idx]);
	  }

	  // Re-register the mutated node for CSE when applicable.
	  if (InsertPos)
	    CSEMap.InsertNode(N, InsertPos);
	  return N;
	}

	/// DropOperands - Release the operands and set this node to have
	/// zero operands.
	void SDNode::DropOperands() {
	// Unlike the code in MorphNodeTo that does this, we don't need to
	// watch for dead nodes here.
	for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
	SDUse &Use = *I++;
	Use.set(SDValue());
	}
	}

	/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
	/// machine opcode.
	///
	/// This form produces a single result of type VT and takes no operands.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT) {
	  SDVTList VTs = getVTList(VT);
	  return SelectNodeTo(N, MachineOpc, VTs, None);
	}

	/// SelectNodeTo form with one result type and one operand.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, SDValue Op1) {
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = { Op1 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with one result type and two operands.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, SDValue Op1,
	                                   SDValue Op2) {
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = { Op1, Op2 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with one result type and three operands.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, SDValue Op1,
	                                   SDValue Op2, SDValue Op3) {
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = { Op1, Op2, Op3 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with one result type and an operand list.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, ArrayRef<SDValue> Ops) {
	  SDVTList VTs = getVTList(VT);
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with two result types and an operand list.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2, ArrayRef<SDValue> Ops) {
	  SDVTList VTs = getVTList(VT1, VT2);
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with two result types and no operands.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2) {
	  SDVTList VTs = getVTList(VT1, VT2);
	  return SelectNodeTo(N, MachineOpc, VTs, None);
	}

	/// SelectNodeTo form with three result types and an operand list.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2, EVT VT3,
	                                   ArrayRef<SDValue> Ops) {
	  SDVTList VTs = getVTList(VT1, VT2, VT3);
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with two result types and two operands.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2,
	                                   SDValue Op1, SDValue Op2) {
	  SDVTList VTs = getVTList(VT1, VT2);
	  SDValue Ops[] = { Op1, Op2 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// Core SelectNodeTo: morph N into a node with the given machine opcode,
	/// result types and operands.
	///
	/// The opcode passed to MorphNodeTo is the bitwise complement of
	/// MachineOpc. If MorphNodeTo returns a different, already-existing node,
	/// all uses of N are redirected to it and N is removed. The returned
	/// node's NodeId is reset to -1 in either case.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   SDVTList VTs,ArrayRef<SDValue> Ops) {
	  SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
	  // Reset the NodeID to -1.
	  New->setNodeId(-1);
	  if (New != N) {
	    ReplaceAllUsesWith(N, New);
	    RemoveDeadNode(N);
	  }
	  return New;
	}

	/// UpdateSDLocOnMergeSDNode - If the opt level is -O0 then it throws away
	/// the line number information on the merged node since it is not possible to
	/// preserve the information that operation is associated with multiple lines.
	/// This will make the debugger work better at -O0, where there is a higher
	/// probability of having other instructions associated with that line.
	///
	/// For IROrder, we keep the smaller of the two.
	///
	/// Returns N after the update.
	SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) {
	  DebugLoc NLoc = N->getDebugLoc();
	  // Only drop the location when N has one and it differs from OLoc's.
	  if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) {
	    N->setDebugLoc(DebugLoc());
	  }
	  unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder());
	  N->setIROrder(Order);
	  return N;
	}

	/// MorphNodeTo - This mutates the specified node to have the specified
	/// return type, opcode, and operands.
	///
	/// Note that MorphNodeTo returns the resultant node. If there is already a
	/// node of the specified opcode and operands, it returns that node instead of
	/// the current one. Note that the SDLoc need not be the same.
	///
	/// Using MorphNodeTo is faster than creating a new node and swapping it in
	/// with ReplaceAllUsesWith both because it often avoids allocating a new
	/// node, and because it doesn't require CSE recalculation for any of
	/// the node's users.
	///
	/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
	/// As a consequence it isn't appropriate to use from within the DAG combiner or
	/// the legalizer which maintain worklists that would need to be updated when
	/// deleting things.
	SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
	                                  SDVTList VTs, ArrayRef<SDValue> Ops) {
	  // If an identical node already exists, use it. Nodes whose last result is
	  // glue are not looked up.
	  void *IP = nullptr;
	  if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, Opc, VTs, Ops);
	    if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
	      return UpdateSDLocOnMergeSDNode(ON, SDLoc(N));
	  }

	  // If N was not in the CSE maps, do not insert the morphed node either.
	  if (!RemoveNodeFromCSEMaps(N))
	    IP = nullptr;

	  // Start the morphing.
	  N->NodeType = Opc;
	  N->ValueList = VTs.VTs;
	  N->NumValues = VTs.NumVTs;

	  // Clear the operands list, updating used nodes to remove this from their
	  // use list. Keep track of any operands that become dead as a result.
	  SmallPtrSet<SDNode*, 16> DeadNodeSet;
	  for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
	    SDUse &Use = *I++;
	    SDNode *Used = Use.getNode();
	    Use.set(SDValue());
	    if (Used->use_empty())
	      DeadNodeSet.insert(Used);
	  }

	  // For MachineNode, initialize the memory references information.
	  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
	    MN->setMemRefs(nullptr, nullptr);

	  // Swap for an appropriately sized array from the recycler.
	  removeOperands(N);
	  createOperands(N, Ops);

	  // Delete any nodes that are still dead after adding the uses for the
	  // new operands.
	  if (!DeadNodeSet.empty()) {
	    SmallVector<SDNode *, 16> DeadNodes;
	    for (SDNode *Candidate : DeadNodeSet)
	      if (Candidate->use_empty())
	        DeadNodes.push_back(Candidate);
	    RemoveDeadNodes(DeadNodes);
	  }

	  if (IP)
	    CSEMap.InsertNode(N, IP); // Memoize the new node.
	  return N;
	}

	/// Turn a STRICT_* floating-point node into its non-strict counterpart.
	///
	/// The node's output chain (result 1) is replaced by its input chain
	/// (operand 0), then the node is morphed to the relaxed opcode with the
	/// value type of operand 1 and only its floating-point operands. Returns
	/// the resulting node, which may be a pre-existing equivalent node.
	SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
	  // Pick the relaxed opcode and note how many operands follow the chain:
	  // two unless the case says otherwise.
	  unsigned NewOpc;
	  unsigned NumFPOperands = 2;
	  switch (Node->getOpcode()) {
	  default:
	    llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
	  case ISD::STRICT_FADD:   NewOpc = ISD::FADD;   break;
	  case ISD::STRICT_FSUB:   NewOpc = ISD::FSUB;   break;
	  case ISD::STRICT_FMUL:   NewOpc = ISD::FMUL;   break;
	  case ISD::STRICT_FDIV:   NewOpc = ISD::FDIV;   break;
	  case ISD::STRICT_FREM:   NewOpc = ISD::FREM;   break;
	  case ISD::STRICT_FPOW:   NewOpc = ISD::FPOW;   break;
	  case ISD::STRICT_FPOWI:  NewOpc = ISD::FPOWI;  break;
	  case ISD::STRICT_FMA:    NewOpc = ISD::FMA;    NumFPOperands = 3; break;
	  case ISD::STRICT_FSQRT:  NewOpc = ISD::FSQRT;  NumFPOperands = 1; break;
	  case ISD::STRICT_FSIN:   NewOpc = ISD::FSIN;   NumFPOperands = 1; break;
	  case ISD::STRICT_FCOS:   NewOpc = ISD::FCOS;   NumFPOperands = 1; break;
	  case ISD::STRICT_FEXP:   NewOpc = ISD::FEXP;   NumFPOperands = 1; break;
	  case ISD::STRICT_FEXP2:  NewOpc = ISD::FEXP2;  NumFPOperands = 1; break;
	  case ISD::STRICT_FLOG:   NewOpc = ISD::FLOG;   NumFPOperands = 1; break;
	  case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; NumFPOperands = 1; break;
	  case ISD::STRICT_FLOG2:  NewOpc = ISD::FLOG2;  NumFPOperands = 1; break;
	  case ISD::STRICT_FRINT:  NewOpc = ISD::FRINT;  NumFPOperands = 1; break;
	  case ISD::STRICT_FNEARBYINT:
	    NewOpc = ISD::FNEARBYINT;
	    NumFPOperands = 1;
	    break;
	  }

	  // We're taking this node out of the chain, so we need to re-link things.
	  SDValue InputChain = Node->getOperand(0);
	  SDValue OutputChain = SDValue(Node, 1);
	  ReplaceAllUsesOfValueWith(OutputChain, InputChain);

	  SDVTList VTs = getVTList(Node->getOperand(1).getValueType());

	  // Gather operands 1..NumFPOperands; operand 0 (the chain) is dropped.
	  SmallVector<SDValue, 3> FPOperands;
	  for (unsigned Idx = 1; Idx <= NumFPOperands; ++Idx)
	    FPOperands.push_back(Node->getOperand(Idx));
	  SDNode *Res = MorphNodeTo(Node, NewOpc, VTs, FPOperands);

	  // MorphNodeTo either returns an existing equivalent node or updates Node
	  // in place.
	  if (Res != Node) {
	    // An existing node was returned: redirect users and drop the old one.
	    ReplaceAllUsesWith(Node, Res);
	    RemoveDeadNode(Node);
	    return Res;
	  }

	  // Updated in place: reset the node ID. To the isel, this should be just
	  // like a newly allocated machine node.
	  Res->setNodeId(-1);
	  return Res;
	}

	/// getMachineNode - These are used for target selectors to create a new node
	/// with specified return type(s), MachineInstr opcode, and operands.
	///
	/// Note that getMachineNode returns the resultant node. If there is already a
	/// node of the specified opcode and operands, it returns that node instead of
	/// the current one.
	///
	/// This form: one result type, no operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT) {
	  SDVTList ResultVTs = getVTList(VT);
	  return getMachineNode(Opcode, dl, ResultVTs, None);
	}

	/// getMachineNode form: one result type, one operand.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, SDValue Op1) {
	  SDVTList ResultVTs = getVTList(VT);
	  SDValue Operands[] = { Op1 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: one result type, two operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, SDValue Op1, SDValue Op2) {
	  SDVTList ResultVTs = getVTList(VT);
	  SDValue Operands[] = { Op1, Op2 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: one result type, three operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, SDValue Op1, SDValue Op2,
	                                            SDValue Op3) {
	  SDVTList ResultVTs = getVTList(VT);
	  SDValue Operands[] = { Op1, Op2, Op3 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: one result type, operand list.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(VT);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// getMachineNode form: two result types, two operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, SDValue Op1,
	                                            SDValue Op2) {
	  SDVTList ResultVTs = getVTList(VT1, VT2);
	  SDValue Operands[] = { Op1, Op2 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: two result types, three operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, SDValue Op1,
	                                            SDValue Op2, SDValue Op3) {
	  SDVTList ResultVTs = getVTList(VT1, VT2);
	  SDValue Operands[] = { Op1, Op2, Op3 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: two result types, operand list.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2,
	                                            ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(VT1, VT2);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// getMachineNode form: three result types, two operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, EVT VT3,
	                                            SDValue Op1, SDValue Op2) {
	  SDVTList ResultVTs = getVTList(VT1, VT2, VT3);
	  SDValue Operands[] = { Op1, Op2 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: three result types, three operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, EVT VT3,
	                                            SDValue Op1, SDValue Op2,
	                                            SDValue Op3) {
	  SDVTList ResultVTs = getVTList(VT1, VT2, VT3);
	  SDValue Operands[] = { Op1, Op2, Op3 };
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form: three result types, operand list.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, EVT VT3,
	                                            ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(VT1, VT2, VT3);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// getMachineNode form: result types given as an array, operand list.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            ArrayRef<EVT> ResultTys,
	                                            ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(ResultTys);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// Core getMachineNode: create (or reuse) a MachineSDNode.
	///
	/// The node is stored with the bitwise complement of Opcode. Unless the
	/// last result type is MVT::Glue, an identical existing node is reused
	/// (after merging its location info with DL) and new nodes are recorded in
	/// the CSE map.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
	                                            SDVTList VTs,
	                                            ArrayRef<SDValue> Ops) {
	  const bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
	  void *InsertPos = nullptr;

	  if (DoCSE) {
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, ~Opcode, VTs, Ops);
	    if (SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos))
	      return cast<MachineSDNode>(UpdateSDLocOnMergeSDNode(Existing, DL));
	  }

	  // No reusable node: allocate a new MachineSDNode.
	  MachineSDNode *Created = newSDNode<MachineSDNode>(
	      ~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	  createOperands(Created, Ops);

	  if (DoCSE)
	    CSEMap.InsertNode(Created, InsertPos);

	  InsertNode(Created);
	  return Created;
	}

	/// getTargetExtractSubreg - Convenience helper that builds a
	/// TargetOpcode::EXTRACT_SUBREG machine node from Operand and the
	/// subregister index (passed as an i32 target constant), returning its
	/// first result.
	SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
	                                             SDValue Operand) {
	  SDValue IndexOp = getTargetConstant(SRIdx, DL, MVT::i32);
	  SDNode *Extract =
	      getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, Operand, IndexOp);
	  return SDValue(Extract, 0);
	}

	/// getTargetInsertSubreg - Convenience helper that builds a
	/// TargetOpcode::INSERT_SUBREG machine node from Operand, Subreg and the
	/// subregister index (passed as an i32 target constant), returning its
	/// first result.
	SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
	                                            SDValue Operand, SDValue Subreg) {
	  SDValue IndexOp = getTargetConstant(SRIdx, DL, MVT::i32);
	  SDNode *Insert = getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
	                                  Operand, Subreg, IndexOp);
	  return SDValue(Insert, 0);
	}

	/// getNodeIfExists - Look up a node with the given opcode, result types and
	/// operands without creating one. Returns nullptr when no such node is
	/// registered or when the last result type is MVT::Glue. A node that is
	/// found has its flags intersected with Flags before being returned.
	SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
	                                      ArrayRef<SDValue> Ops,
	                                      const SDNodeFlags Flags) {
	  // Glue-producing nodes are never looked up.
	  if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue)
	    return nullptr;

	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opcode, VTList, Ops);
	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(ID, SDLoc(), InsertPos);
	  if (!Existing)
	    return nullptr;

	  Existing->intersectFlagsWith(Flags);
	  return Existing;
	}

	/// getDbgValue - Creates a SDDbgValue node.
	///
	/// SDNode variant: the debug value refers to result R of node N. The
	/// object is allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getDbgValue(DIVariable *Var, DIExpression *Expr,
	                                      SDNode *N, unsigned R, bool IsIndirect,
	                                      const DebugLoc &DL, unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc())
	      SDDbgValue(Var, Expr, N, R, IsIndirect, DL, O);
	}

	/// Constant variant: creates a SDDbgValue that refers to the constant
	/// Value C. The object is allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getConstantDbgValue(DIVariable *Var,
	                                              DIExpression *Expr,
	                                              const Value *C,
	                                              const DebugLoc &DL, unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, DL, O);
	}

	/// FrameIndex variant: creates a SDDbgValue that refers to frame index FI.
	/// The object is allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var,
	                                                DIExpression *Expr, unsigned FI,
	                                                const DebugLoc &DL,
	                                                unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, DL, O);
	}

	/// Clone the SDNODE-kind debug values attached to From onto To.
	///
	/// Only debug values whose result number matches From's are transferred.
	/// When SizeInBits is non-zero each cloned expression is narrowed to the
	/// fragment [OffsetInBits, OffsetInBits + SizeInBits); values for which
	/// that fragment cannot be formed are skipped. If InvalidateDbg is set the
	/// originals are marked invalidated once cloned. Nothing happens when From
	/// and To name the same value or the same node.
	void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
	                                     unsigned OffsetInBits, unsigned SizeInBits,
	                                     bool InvalidateDbg) {
	  SDNode *FromNode = From.getNode();
	  SDNode *ToNode = To.getNode();
	  assert(FromNode && ToNode && "Can't modify dbg values");

	  // PR35338
	  // TODO: assert(From != To && "Redundant dbg value transfer");
	  // TODO: assert(FromNode != ToNode && "Intranode dbg value transfer");
	  if (From == To || FromNode == ToNode)
	    return;

	  if (!FromNode->getHasDebugValue())
	    return;

	  SmallVector<SDDbgValue *, 2> ClonedDVs;
	  for (SDDbgValue *Dbg : GetDbgValues(FromNode)) {
	    if (Dbg->getKind() != SDDbgValue::SDNODE || Dbg->isInvalidated())
	      continue;

	    // TODO: assert(!Dbg->isInvalidated() && "Transfer of invalid dbg value");

	    // Just transfer the dbg value attached to From.
	    if (Dbg->getResNo() != From.getResNo())
	      continue;

	    DIVariable *Var = Dbg->getVariable();
	    auto *Expr = Dbg->getExpression();
	    // If a fragment is requested, update the expression.
	    if (SizeInBits) {
	      // When splitting a larger (e.g., sign-extended) value whose
	      // lower bits are described with an SDDbgValue, do not attempt
	      // to transfer the SDDbgValue to the upper bits.
	      if (auto FI = Expr->getFragmentInfo())
	        if (OffsetInBits + SizeInBits > FI->SizeInBits)
	          continue;
	      auto Fragment = DIExpression::createFragmentExpression(Expr, OffsetInBits,
	                                                             SizeInBits);
	      if (!Fragment)
	        continue;
	      Expr = *Fragment;
	    }
	    // Clone the SDDbgValue and move it to To.
	    SDDbgValue *Clone =
	        getDbgValue(Var, Expr, ToNode, To.getResNo(), Dbg->isIndirect(),
	                    Dbg->getDebugLoc(), Dbg->getOrder());
	    ClonedDVs.push_back(Clone);

	    if (InvalidateDbg)
	      Dbg->setIsInvalidated();
	  }

	  // Attach the clones only after the loop over From's debug values is done.
	  for (SDDbgValue *Dbg : ClonedDVs)
	    AddDbgValue(Dbg, ToNode, false);
	}

	/// Re-home the debug values attached to N onto its first operand.
	///
	/// Only ISD::ADD nodes whose second operand is a constant (and whose first
	/// is not) are handled: the constant is folded into a cloned DIExpression
	/// marked as a stack value, the clone is attached to operand 0, and the
	/// original debug value is invalidated.
	void SelectionDAG::salvageDebugInfo(SDNode &N) {
	  if (!N.getHasDebugValue())
	    return;

	  SmallVector<SDDbgValue *, 2> ClonedDVs;
	  for (auto DV : GetDbgValues(&N)) {
	    if (DV->isInvalidated())
	      continue;
	    // ADD is the only opcode that is salvaged here.
	    if (N.getOpcode() != ISD::ADD)
	      continue;

	    SDValue N0 = N.getOperand(0);
	    SDValue N1 = N.getOperand(1);
	    if (isConstantIntBuildVectorOrConstantInt(N0) ||
	        !isConstantIntBuildVectorOrConstantInt(N1))
	      continue;

	    uint64_t Offset = N.getConstantOperandVal(1);
	    // Rewrite an ADD constant node into a DIExpression. Since we are
	    // performing arithmetic to compute the variable's value in the
	    // DIExpression, we need to mark the expression with a
	    // DW_OP_stack_value.
	    auto *DIExpr = DV->getExpression();
	    DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset,
	                                   DIExpression::NoDeref,
	                                   DIExpression::WithStackValue);
	    SDDbgValue *Clone =
	        getDbgValue(DV->getVariable(), DIExpr, N0.getNode(), N0.getResNo(),
	                    DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
	    ClonedDVs.push_back(Clone);
	    DV->setIsInvalidated();
	    DEBUG(dbgs() << "SALVAGE: Rewriting"; N0.getNode()->dumprFull(this);
	          dbgs() << " into " << *DIExpr << '\n');
	  }

	  // Attach each clone to the node it was created for.
	  for (SDDbgValue *Dbg : ClonedDVs)
	    AddDbgValue(Dbg, Dbg->getSDNode(), false);
	}

	namespace {

	/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
	/// pointed to by a use iterator is deleted, increment the use iterator
	/// so that it doesn't dangle.
	///
	class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
	SDNode::use_iterator &UI;
	SDNode::use_iterator &UE;

	void NodeDeleted(SDNode N, SDNode E) override {
	// Increment the iterator as needed.
	while (UI != UE && N == *UI)
	++UI;
	}

	public:
	RAUWUpdateListener(SelectionDAG &d,
	SDNode::use_iterator &ui,
	SDNode::use_iterator &ue)
	: SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
	};

	} // end anonymous namespace

	/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG.
	///
	/// This version assumes From has a single result value.
	///
	void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
	  SDNode *From = FromN.getNode();
	  assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
	         "Cannot replace with this method!");
	  assert(From != To.getNode() && "Cannot replace uses of with self");

	  // Preserve Debug Values
	  transferDbgValues(FromN, To);

	  // Iterate over all the existing uses of From. New uses will be added
	  // to the beginning of the use list, which we avoid visiting.
	  // This specifically avoids visiting uses of From that arise while the
	  // replacement is happening, because any such uses would be the result
	  // of CSE: If an existing node looks like From after one of its operands
	  // is replaced by To, we don't want to replace of all its users with To
	  // too. See PR3018 for more info.
	  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
	  RAUWUpdateListener Listener(*this, UI, UE);
	  while (UI != UE) {
	    SDNode *User = *UI;

	    // This node is about to morph, remove its old self from the CSE maps.
	    RemoveNodeFromCSEMaps(User);

	    // A user can appear in a use list multiple times, and when this
	    // happens the uses are usually next to each other in the list.
	    // To help reduce the number of CSE recomputations, process all
	    // the uses of this user that we can find this way.
	    do {
	      SDUse &Use = UI.getUse();
	      // Advance before rewriting the use.
	      ++UI;
	      Use.set(To);
	    } while (UI != UE && *UI == User);

	    // Now that we have modified User, add it back to the CSE maps. If it
	    // already exists there, recursively merge the results together.
	    AddModifiedNodeToCSEMaps(User);
	  }

	  // If we just RAUW'd the root, take note.
	  if (FromN == getRoot())
	    setRoot(To);
	}

	/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG.
	///
	/// This version assumes that for each value of From, there is a
	/// corresponding value in To in the same position with the same type.
	///
	void SelectionDAG::ReplaceAllUsesWith(SDNode From, SDNode To) {
	#ifndef NDEBUG
	for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
	assert((!From->hasAnyUseOfValue(i) \|\|
	From->getValueType(i) == To->getValueType(i)) &&
	"Cannot use this version of ReplaceAllUsesWith!");
	#endif

	// Handle the trivial case.
	if (From == To)
	return;

	// Preserve Debug Info. Only do this if there's a use.
	for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
	if (From->hasAnyUseOfValue(i)) {
	assert((i < To->getNumValues()) && "Invalid To location");
	transferDbgValues(SDValue(From, i), SDValue(To, i));
	}

	// Iterate over just the existing users of From. See the comments in
	// the ReplaceAllUsesWith above.
	SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
	RAUWUpdateListener Listener(*this, UI, UE);
	while (UI != UE) {
	SDNode User = UI;

	// This node is about to morph, remove its old self from the CSE maps.
	RemoveNodeFromCSEMaps(User);

	// A user can appear in a use list multiple times, and when this
	// happens the uses are usually next to each other in the list.
	// To help reduce the number of CSE recomputations, process all
	// the uses of this user that we can find this way.
	do {
	SDUse &Use = UI.getUse();
	++UI;
	Use.setNode(To);
	} while (UI != UE && *UI == User);

	// Now that we have modified User, add it back to the CSE maps. If it
	// already exists there, recursively merge the results together.
	AddModifiedNodeToCSEMaps(User);
	}

	// If we just RAUW'd the root, take note.
	if (From == getRoot().getNode())
	setRoot(SDValue(To, getRoot().getResNo()));
	}

	/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG.
	///
	/// This version can replace From with any result values. To must match the
	/// number and types of values returned by From.
	void SelectionDAG::ReplaceAllUsesWith(SDNode From, const SDValue To) {
	if (From->getNumValues() == 1) // Handle the simple case efficiently.
	return ReplaceAllUsesWith(SDValue(From, 0), To[0]);

	// Preserve Debug Info.
	for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
	transferDbgValues(SDValue(From, i), *To);

	// Iterate over just the existing users of From. See the comments in
	// the ReplaceAllUsesWith above.
	SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
	RAUWUpdateListener Listener(*this, UI, UE);
	while (UI != UE) {
	SDNode User = UI;

	// This node is about to morph, remove its old self from the CSE maps.
	RemoveNodeFromCSEMaps(User);

	// A user can appear in a use list multiple times, and when this
	// happens the uses are usually next to each other in the list.
	// To help reduce the number of CSE recomputations, process all
	// the uses of this user that we can find this way.
	do {
	SDUse &Use = UI.getUse();
	const SDValue &ToOp = To[Use.getResNo()];
	++UI;
	Use.set(ToOp);
	} while (UI != UE && *UI == User);

	// Now that we have modified User, add it back to the CSE maps. If it
	// already exists there, recursively merge the results together.
	AddModifiedNodeToCSEMaps(User);
	}

	// If we just RAUW'd the root, take note.
	if (From == getRoot().getNode())
	setRoot(SDValue(To[getRoot().getResNo()]));
	}

	/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
	/// uses of other values produced by From.getNode() alone. Users that are
	/// modified are removed from and re-added to the CSE maps.
	void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
	  // Handle the really simple, really trivial case efficiently.
	  if (From == To) return;

	  // Handle the simple, trivial, case efficiently.
	  if (From.getNode()->getNumValues() == 1) {
	    ReplaceAllUsesWith(From, To);
	    return;
	  }

	  // Preserve Debug Info.
	  transferDbgValues(From, To);

	  // Iterate over just the existing users of From. See the comments in
	  // the ReplaceAllUsesWith above.
	  SDNode::use_iterator UI = From.getNode()->use_begin(),
	                       UE = From.getNode()->use_end();
	  RAUWUpdateListener Listener(*this, UI, UE);
	  while (UI != UE) {
	    SDNode *User = *UI;
	    bool UserRemovedFromCSEMaps = false;

	    // A user can appear in a use list multiple times, and when this
	    // happens the uses are usually next to each other in the list.
	    // To help reduce the number of CSE recomputations, process all
	    // the uses of this user that we can find this way.
	    do {
	      SDUse &Use = UI.getUse();

	      // Skip uses of different values from the same node.
	      if (Use.getResNo() != From.getResNo()) {
	        ++UI;
	        continue;
	      }

	      // If this node hasn't been modified yet, it's still in the CSE maps,
	      // so remove its old self from the CSE maps.
	      if (!UserRemovedFromCSEMaps) {
	        RemoveNodeFromCSEMaps(User);
	        UserRemovedFromCSEMaps = true;
	      }

	      ++UI;
	      Use.set(To);
	    } while (UI != UE && *UI == User);

	    // We are iterating over all uses of the From node, so if a use
	    // doesn't use the specific value, no changes are made.
	    if (!UserRemovedFromCSEMaps)
	      continue;

	    // Now that we have modified User, add it back to the CSE maps. If it
	    // already exists there, recursively merge the results together.
	    AddModifiedNodeToCSEMaps(User);
	  }

	  // If we just RAUW'd the root, take note.
	  if (From == getRoot())
	    setRoot(To);
	}

	namespace {

	/// Bookkeeping record built by SelectionDAG::ReplaceAllUsesOfValuesWith: one
	/// entry for every use that has to be rewritten.
	struct UseMemo {
	  SDNode *User;   // Node that owns the use.
	  unsigned Index; // Position of the replaced value in the From/To arrays.
	  SDUse *Use;     // The use slot to be redirected.
	};

	/// Order memos by the address of their user, so that sorting groups all
	/// records belonging to one user together.
	bool operator<(const UseMemo &L, const UseMemo &R) {
	  const intptr_t LeftKey = (intptr_t)L.User;
	  const intptr_t RightKey = (intptr_t)R.User;
	  return LeftKey < RightKey;
	}

	} // end anonymous namespace

	/// ReplaceAllUsesOfValuesWith - Replace any uses of From[i] with To[i] for
	/// each i in [0, Num), leaving uses of other values produced by the From
	/// nodes alone. The same value may appear in both the From and To list.
	void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
	                                              const SDValue *To,
	                                              unsigned Num){
	  // Handle the simple, trivial case efficiently.
	  if (Num == 1)
	    return ReplaceAllUsesOfValueWith(*From, *To);

	  // NOTE(review): only the debug values of the first From/To pair are
	  // transferred here; confirm whether the remaining pairs need the same.
	  transferDbgValues(*From, *To);

	  // Read up all the uses and make records of them. This helps
	  // processing new uses that are introduced during the
	  // replacement process.
	  SmallVector<UseMemo, 4> Uses;
	  for (unsigned i = 0; i != Num; ++i) {
	    unsigned FromResNo = From[i].getResNo();
	    SDNode *FromNode = From[i].getNode();
	    for (SDNode::use_iterator UI = FromNode->use_begin(),
	         E = FromNode->use_end(); UI != E; ++UI) {
	      SDUse &Use = UI.getUse();
	      if (Use.getResNo() == FromResNo) {
	        UseMemo Memo = { *UI, i, &Use };
	        Uses.push_back(Memo);
	      }
	    }
	  }

	  // Sort the uses, so that all the uses from a given User are together.
	  std::sort(Uses.begin(), Uses.end());

	  for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
	       UseIndex != UseIndexEnd; ) {
	    // We know that this user uses some value of From. If it is the right
	    // value, update it.
	    SDNode *User = Uses[UseIndex].User;

	    // This node is about to morph, remove its old self from the CSE maps.
	    RemoveNodeFromCSEMaps(User);

	    // The Uses array is sorted, so all the uses for a given User
	    // are next to each other in the list.
	    // To help reduce the number of CSE recomputations, process all
	    // the uses of this user that we can find this way.
	    do {
	      unsigned i = Uses[UseIndex].Index;
	      SDUse &Use = *Uses[UseIndex].Use;
	      ++UseIndex;

	      Use.set(To[i]);
	    } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);

	    // Now that we have modified User, add it back to the CSE maps. If it
	    // already exists there, recursively merge the results together.
	    AddModifiedNodeToCSEMaps(User);
	  }
	}

	/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
	/// based on their topological order, and reorder the AllNodes list to match.
	/// Returns the number of nodes that were numbered.
	unsigned SelectionDAG::AssignTopologicalOrder() {
	  unsigned DAGSize = 0;

	  // SortedPos tracks the progress of the algorithm. Nodes before it are
	  // sorted, nodes after it are unsorted. When the algorithm completes
	  // it is at the end of the list.
	  allnodes_iterator SortedPos = allnodes_begin();

	  // Visit all the nodes. Move nodes with no operands to the front of
	  // the list immediately. Annotate nodes that do have operands with their
	  // operand count. Before we do this, the Node Id fields of the nodes
	  // may contain arbitrary values. After, the Node Id fields for nodes
	  // before SortedPos will contain the topological sort index, and the
	  // Node Id fields for nodes At SortedPos and after will contain the
	  // count of outstanding operands.
	  for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
	    // Step the iterator first: N may be moved within the list below.
	    SDNode *N = &*I++;
	    checkForCycles(N, this);
	    unsigned Degree = N->getNumOperands();
	    if (Degree == 0) {
	      // A node with no operands, add it to the sorted prefix immediately.
	      N->setNodeId(DAGSize++);
	      allnodes_iterator Q(N);
	      if (Q != SortedPos)
	        SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
	      assert(SortedPos != AllNodes.end() && "Overran node list");
	      ++SortedPos;
	    } else {
	      // Temporarily use the Node Id as scratch space for the degree count.
	      N->setNodeId(Degree);
	    }
	  }

	  // Visit all the nodes. As we iterate, move nodes into sorted order,
	  // such that by the time the end is reached all nodes will be sorted.
	  for (SDNode &Node : allnodes()) {
	    SDNode *N = &Node;
	    checkForCycles(N, this);
	    // N is in sorted position, so all its uses have one less operand
	    // that needs to be sorted.
	    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	         UI != UE; ++UI) {
	      SDNode *P = *UI;
	      unsigned Degree = P->getNodeId();
	      assert(Degree != 0 && "Invalid node degree");
	      --Degree;
	      if (Degree == 0) {
	        // All of P's operands are sorted, so P may be sorted now.
	        P->setNodeId(DAGSize++);
	        if (P->getIterator() != SortedPos)
	          SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
	        assert(SortedPos != AllNodes.end() && "Overran node list");
	        ++SortedPos;
	      } else {
	        // Update P's outstanding operand count.
	        P->setNodeId(Degree);
	      }
	    }
	    // Reaching SortedPos means the current node was never placed in the
	    // sorted prefix; report it and abort.
	    if (Node.getIterator() == SortedPos) {
	#ifndef NDEBUG
	      allnodes_iterator I(N);
	      SDNode *S = &*++I;
	      dbgs() << "Overran sorted position:\n";
	      S->dumprFull(this); dbgs() << "\n";
	      dbgs() << "Checking if this is due to cycles\n";
	      checkForCycles(this, true);
	#endif
	      llvm_unreachable(nullptr);
	    }
	  }

	  assert(SortedPos == AllNodes.end() &&
	         "Topological sort incomplete!");
	  assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
	         "First node in topological sort is not the entry token!");
	  assert(AllNodes.front().getNodeId() == 0 &&
	         "First node in topological sort has non-zero id!");
	  assert(AllNodes.front().getNumOperands() == 0 &&
	         "First node in topological sort has operands!");
	  assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
	         "Last node in topologic sort has unexpected id!");
	  assert(AllNodes.back().use_empty() &&
	         "Last node in topologic sort has users!");
	  assert(DAGSize == allnodes_size() && "Node count mismatch!");
	  return DAGSize;
	}

	/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
	/// value is produced by SD.
	void SelectionDAG::AddDbgValue(SDDbgValue DB, SDNode SD, bool isParameter) {
	if (SD) {
	assert(DbgInfo->getSDDbgValues(SD).empty() \|\| SD->getHasDebugValue());
	SD->setHasDebugValue(true);
	}
	DbgInfo->add(DB, SD, isParameter);
	}

	/// Give NewMemOp the same memory-dependency position as OldLoad. When the
	/// old load's chain result (value 1) has users, they are redirected to a
	/// TokenFactor joining the old and the new chain; that TokenFactor is
	/// returned. Otherwise the new operation's chain is returned unchanged.
	SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
	                                                   SDValue NewMemOp) {
	  assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");

	  SDValue NewChain(NewMemOp.getNode(), 1);
	  // Nobody depends on the old chain: nothing has to be rewired.
	  if (!OldLoad->hasAnyUseOfValue(1))
	    return NewChain;

	  SDValue OldChain(OldLoad, 1);
	  SDValue Joined =
	      getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
	  ReplaceAllUsesOfValueWith(OldChain, Joined);
	  // The replacement above also visited the TokenFactor itself (it uses
	  // OldChain), so put its intended operands back.
	  UpdateNodeOperands(Joined.getNode(), OldChain, NewChain);
	  return Joined;
	}

	//===----------------------------------------------------------------------===//
	// SDNode Class
	//===----------------------------------------------------------------------===//

	/// True when V is an integer constant node whose value is null.
	bool llvm::isNullConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantSDNode>(V))
	    return C->isNullValue();
	  return false;
	}

	/// True when V is a floating-point constant node that is zero and not
	/// negative.
	bool llvm::isNullFPConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantFPSDNode>(V))
	    return C->isZero() && !C->isNegative();
	  return false;
	}

	/// True when V is an integer constant node with every bit set.
	bool llvm::isAllOnesConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantSDNode>(V))
	    return C->isAllOnesValue();
	  return false;
	}

	/// True when V is an integer constant node equal to one.
	bool llvm::isOneConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantSDNode>(V))
	    return C->isOne();
	  return false;
	}

	/// True when V is an XOR whose second operand is an all-ones constant.
	bool llvm::isBitwiseNot(SDValue V) {
	  if (V.getOpcode() != ISD::XOR)
	    return false;
	  return isAllOnesConstant(V.getOperand(1));
	}

	/// Return the integer constant behind N: either N itself, or the splatted
	/// element when N is a build vector splat without undef elements whose
	/// constant has the vector's scalar type. Returns null otherwise.
	ConstantSDNode *llvm::isConstOrConstSplat(SDValue N) {
	  if (auto *Scalar = dyn_cast<ConstantSDNode>(N))
	    return Scalar;

	  auto *BV = dyn_cast<BuildVectorSDNode>(N);
	  if (!BV)
	    return nullptr;

	  BitVector UndefElements;
	  ConstantSDNode *Splat = BV->getConstantSplatNode(&UndefElements);
	  if (!Splat)
	    return nullptr;

	  // FIXME: We blindly ignore splats which include undef which is overly
	  // pessimistic.
	  if (!UndefElements.none())
	    return nullptr;

	  // BuildVectors can truncate their operands. Ignore that case here.
	  if (Splat->getValueType(0) != N.getValueType().getScalarType())
	    return nullptr;

	  return Splat;
	}

	/// Return the floating-point constant behind N: either N itself, or the
	/// splatted element when N is a build vector splat without undef elements.
	/// Returns null otherwise.
	ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N) {
	  if (auto *Scalar = dyn_cast<ConstantFPSDNode>(N))
	    return Scalar;

	  auto *BV = dyn_cast<BuildVectorSDNode>(N);
	  if (!BV)
	    return nullptr;

	  BitVector UndefElements;
	  ConstantFPSDNode *Splat = BV->getConstantFPSplatNode(&UndefElements);
	  if (!Splat || !UndefElements.none())
	    return nullptr;

	  return Splat;
	}

	/// Drop the handle's operands on destruction.
	HandleSDNode::~HandleSDNode() { DropOperands(); }

	/// Build a global-address node with opcode Opc and a single result of type
	/// VT, recording the referenced global GA, the offset o and the target
	/// flags TF.
	GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
	const DebugLoc &DL,
	const GlobalValue *GA, EVT VT,
	int64_t o, unsigned char TF)
	: SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
	TheGlobal = GA;
	}

	/// Build an ISD::ADDRSPACECAST node with a single result of type VT,
	/// recording the source (SrcAS) and destination (DestAS) address spaces.
	AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
	EVT VT, unsigned SrcAS,
	unsigned DestAS)
	: SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
	SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}

	/// Build a memory node for the memory type memvt and memory operand mmo,
	/// caching the operand's volatile / non-temporal / dereferenceable /
	/// invariant properties in the node's bit flags.
	MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
	                     SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
	    : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
	  const MachineMemOperand &Operand = *MMO;
	  MemSDNodeBits.IsVolatile = Operand.isVolatile();
	  MemSDNodeBits.IsNonTemporal = Operand.isNonTemporal();
	  MemSDNodeBits.IsDereferenceable = Operand.isDereferenceable();
	  MemSDNodeBits.IsInvariant = Operand.isInvariant();

	  // The memory type only has to fit inside the operand: the MMO may describe
	  // a possible address range rather than the exact bytes that are touched.
	  assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
	}

	/// Profile - Add this node's identifying data to ID by delegating to
	/// AddNodeIDNode.
	void SDNode::Profile(FoldingSetNodeID &ID) const { AddNodeIDNode(ID, this); }

	namespace {

	struct EVTArray {
	std::vector<EVT> VTs;

	EVTArray() {
	VTs.reserve(MVT::LAST_VALUETYPE);
	for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
	VTs.push_back(MVT((MVT::SimpleValueType)i));
	}
	};

	} // end anonymous namespace

	// Storage backing SDNode::getValueTypeList.
	// Set of the extended value types handed out so far, ordered by
	// EVT::compareRawBits.
	static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
	// One EVT per simple value type (see EVTArray).
	static ManagedStatic<EVTArray> SimpleVTArray;
	// Taken by getValueTypeList before inserting into EVTs.
	static ManagedStatic<sys::SmartMutex<true>> VTMutex;

	/// getValueTypeList - Return a pointer to a stored copy of the specified
	/// value type. Simple types come from the SimpleVTArray table; extended
	/// types are inserted into the EVTs set while holding VTMutex.
	const EVT *SDNode::getValueTypeList(EVT VT) {
	  if (!VT.isExtended()) {
	    assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
	           "Value type out of range!");
	    return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
	  }

	  sys::SmartScopedLock<true> Lock(*VTMutex);
	  auto Inserted = EVTs->insert(VT);
	  return &(*Inserted.first);
	}

	/// hasNUsesOfValue - Return true if result number Value of this node has
	/// exactly NUses uses. Uses of the node's other results are not counted.
	bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
	  assert(Value < getNumValues() && "Bad value!");

	  // TODO: Only iterate over uses of a given value of the node
	  unsigned Remaining = NUses;
	  for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
	    if (UI.getUse().getResNo() != Value)
	      continue;
	    // One use more than the caller asked for: stop early.
	    if (Remaining == 0)
	      return false;
	    --Remaining;
	  }

	  // All requested uses accounted for?
	  return Remaining == 0;
	}

	/// hasAnyUseOfValue - Return true if result number Value of this node has at
	/// least one use. Uses of the node's other results are ignored.
	bool SDNode::hasAnyUseOfValue(unsigned Value) const {
	  assert(Value < getNumValues() && "Bad value!");

	  SDNode::use_iterator UI = use_begin(), E = use_end();
	  while (UI != E) {
	    if (UI.getUse().getResNo() == Value)
	      return true;
	    ++UI;
	  }

	  return false;
	}

	/// isOnlyUserOf - Return true if this node is the only user of N, i.e. N has
	/// at least one use and every use belongs to this node.
	bool SDNode::isOnlyUserOf(const SDNode *N) const {
	  bool Seen = false;
	  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
	    SDNode *User = *I;
	    if (User == this)
	      Seen = true;
	    else
	      return false;
	  }

	  return Seen;
	}

	/// Return true if the only users of N are contained in Nodes. N must have at
	/// least one use for the result to be true.
	bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
	  bool Seen = false;
	  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
	    SDNode *User = *I;
	    if (llvm::any_of(Nodes,
	                     [&User](const SDNode *Node) { return User == Node; }))
	      Seen = true;
	    else
	      return false;
	  }

	  return Seen;
	}

	/// isOperand - Return true if this node is an operand of N.
	bool SDValue::isOperandOf(const SDNode *N) const {
	for (const SDValue &Op : N->op_values())
	if (*this == Op)
	return true;
	return false;
	}

	bool SDNode::isOperandOf(const SDNode *N) const {
	for (const SDValue &Op : N->op_values())
	if (this == Op.getNode())
	return true;
	return false;
	}

	/// reachesChainWithoutSideEffects - Return true if this operand (which must
	/// be a chain) reaches the specified operand without crossing any
	/// side-effecting instructions on any chain path. In practice, this looks
	/// through token factors and non-volatile loads. In order to remain efficient,
	/// this only looks Depth nodes in, it does not do an exhaustive search.
	///
	/// Note that we only need to examine chains when we're searching for
	/// side-effects; SelectionDAG requires that all side-effects are represented
	/// by chains, even if another operand would force a specific ordering. This
	/// constraint is necessary to allow transformations like splitting loads.
	bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
	                                             unsigned Depth) const {
	  if (*this == Dest) return true;

	  // Don't search too deeply, we just want to be able to see through
	  // TokenFactor's etc.
	  if (Depth == 0) return false;

	  // If this is a token factor, all inputs to the TF happen in parallel.
	  if (getOpcode() == ISD::TokenFactor) {
	    // First, try a shallow search.
	    if (is_contained((*this)->ops(), Dest)) {
	      // We found the chain we want as an operand of this TokenFactor.
	      // Essentially, we reach the chain without side-effects if we could
	      // serialize the TokenFactor into a simple chain of operations with
	      // Dest as the last operation. This is automatically true if the
	      // chain has one use: there are no other ordering constraints.
	      // If the chain has more than one use, we give up: some other
	      // use of Dest might force a side-effect between Dest and the current
	      // node.
	      if (Dest.hasOneUse())
	        return true;
	    }
	    // Next, try a deep search: check whether every operand of the TokenFactor
	    // reaches Dest.
	    return llvm::all_of((*this)->ops(), [=](SDValue Op) {
	      return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
	    });
	  }

	  // Loads don't have side effects, look through them.
	  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
	    if (!Ld->isVolatile())
	      return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
	  }
	  return false;
	}

	bool SDNode::hasPredecessor(const SDNode *N) const {
	SmallPtrSet<const SDNode *, 32> Visited;
	SmallVector<const SDNode *, 16> Worklist;
	Worklist.push_back(this);
	return hasPredecessorHelper(N, Visited, Worklist);
	}

	void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
	this->Flags.intersectWith(Flags);
	}

	/// Scalarize the single-result vector operation N: emit one scalar operation
	/// per element and rebuild the result with a build vector of ResNE elements.
	/// ResNE == 0 means "as many elements as N has". If ResNE is larger than the
	/// number of unrolled elements the tail is filled with undef; if it is
	/// smaller only the first ResNE elements are computed.
	SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
	  assert(N->getNumValues() == 1 &&
	         "Can't unroll a vector with multiple results!");

	  const EVT VecTy = N->getValueType(0);
	  const EVT EltVT = VecTy.getVectorElementType();
	  unsigned NumToUnroll = VecTy.getVectorNumElements();
	  SDLoc dl(N);

	  const unsigned Opcode = N->getOpcode();
	  const unsigned NumOps = N->getNumOperands();
	  SmallVector<SDValue, 8> Scalars;
	  SmallVector<SDValue, 4> Operands(NumOps);

	  // If ResNE is 0, fully unroll the vector op; otherwise never unroll more
	  // elements than were requested.
	  if (ResNE == 0)
	    ResNE = NumToUnroll;
	  else if (NumToUnroll > ResNE)
	    NumToUnroll = ResNE;

	  unsigned Elt = 0;
	  for (; Elt != NumToUnroll; ++Elt) {
	    // Gather the scalar operands for element Elt.
	    for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) {
	      SDValue Operand = N->getOperand(OpNo);
	      EVT OperandVT = Operand.getValueType();
	      if (!OperandVT.isVector()) {
	        // A scalar operand; just use it as is.
	        Operands[OpNo] = Operand;
	        continue;
	      }
	      // A vector operand; extract a single element.
	      SDValue EltIdx =
	          getConstant(Elt, dl, TLI->getVectorIdxTy(getDataLayout()));
	      Operands[OpNo] =
	          getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                  OperandVT.getVectorElementType(), Operand, EltIdx);
	    }

	    SDValue Scalar;
	    switch (Opcode) {
	    case ISD::VSELECT:
	      Scalar = getNode(ISD::SELECT, dl, EltVT, Operands);
	      break;
	    case ISD::SHL:
	    case ISD::SRA:
	    case ISD::SRL:
	    case ISD::ROTL:
	    case ISD::ROTR: {
	      SDValue Amount =
	          getShiftAmountOperand(Operands[0].getValueType(), Operands[1]);
	      Scalar = getNode(Opcode, dl, EltVT, Operands[0], Amount);
	      break;
	    }
	    case ISD::SIGN_EXTEND_INREG:
	    case ISD::FP_ROUND_INREG: {
	      EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
	      Scalar = getNode(Opcode, dl, EltVT, Operands[0], getValueType(ExtVT));
	      break;
	    }
	    default:
	      Scalar = getNode(Opcode, dl, EltVT, Operands, N->getFlags());
	      break;
	    }
	    Scalars.push_back(Scalar);
	  }

	  // Pad with undef up to the requested element count.
	  for (; Elt < ResNE; ++Elt)
	    Scalars.push_back(getUNDEF(EltVT));

	  EVT ResultVT = EVT::getVectorVT(*getContext(), EltVT, ResNE);
	  return getBuildVector(ResultVT, dl, Scalars);
	}

	/// Return true if LD and Base are both non-volatile, non-indexed loads on
	/// the same chain, LD's value type is Bytes bytes wide, and the address of
	/// LD is exactly Dist * Bytes bytes away from the address of Base according
	/// to BaseIndexOffset::equalBaseIndex.
	bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
	LoadSDNode *Base,
	unsigned Bytes,
	int Dist) const {
	if (LD->isVolatile() \|\| Base->isVolatile())
	return false;
	if (LD->isIndexed() \|\| Base->isIndexed())
	return false;
	if (LD->getChain() != Base->getChain())
	return false;
	EVT VT = LD->getValueType(0);
	if (VT.getSizeInBits() / 8 != Bytes)
	return false;

	- SDValue Loc = LD->getOperand(1);
	- SDValue BaseLoc = Base->getOperand(1);
	-
	- auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
	- auto LocDecomp = BaseIndexOffset::match(Loc, *this);
	+ auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
	+ auto LocDecomp = BaseIndexOffset::match(LD, *this);

	// Offset receives LD's offset relative to Base when the two addresses can
	// be related.
	int64_t Offset = 0;
	if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
	return (Dist * Bytes == Offset);
	return false;
	}

	/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
	/// it cannot be inferred. Two address shapes are recognised: a global address
	/// plus constant offset, and a frame index optionally plus a constant offset.
	unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
	  // If this is a GlobalAddress + cst, return the alignment.
	  const GlobalValue *GV = nullptr;
	  int64_t GVOffset = 0;
	  if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
	    unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
	    KnownBits Known(PtrWidth);
	    llvm::computeKnownBits(GV, Known, getDataLayout());
	    unsigned AlignBits = Known.countMinTrailingZeros();
	    // Shift an unsigned one: the shift amount can be 31, which would move a
	    // signed one into the sign bit.
	    unsigned Align = AlignBits ? 1u << std::min(31U, AlignBits) : 0;
	    if (Align)
	      return MinAlign(Align, GVOffset);
	  }

	  // If this is a direct reference to a stack slot, use information about the
	  // stack slot's alignment. An explicit flag records whether a frame index
	  // was found, instead of reserving the signed-overflowing `1 << 31` as a
	  // sentinel index.
	  bool HasFrameIdx = false;
	  int FrameIdx = 0;
	  int64_t FrameOffset = 0;
	  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
	    FrameIdx = FI->getIndex();
	    HasFrameIdx = true;
	  } else if (isBaseWithConstantOffset(Ptr) &&
	             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
	    // Handle FI+Cst
	    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	    FrameOffset = Ptr.getConstantOperandVal(1);
	    HasFrameIdx = true;
	  }

	  if (HasFrameIdx) {
	    const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
	    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
	                                    FrameOffset);
	    return FIInfoAlign;
	  }

	  return 0;
	}

	/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
	/// which is split (or expanded) into two not necessarily identical pieces.
	std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
	// Currently all types are split in half.
	EVT LoVT, HiVT;
	if (!VT.isVector())
	LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
	else
	LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext());

	return std::make_pair(LoVT, HiVT);
	}

	/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
	/// low/high part.
	std::pair<SDValue, SDValue>
	SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
	const EVT &HiVT) {
	assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
	N.getValueType().getVectorNumElements() &&
	"More vector elements requested than available!");
	SDValue Lo, Hi;
	Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
	getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
	Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
	getConstant(LoVT.getVectorNumElements(), DL,
	TLI->getVectorIdxTy(getDataLayout())));
	return std::make_pair(Lo, Hi);
	}

	/// Append to Args one EXTRACT_VECTOR_ELT node for each of the Count elements
	/// of Op starting at index Start. A Count of 0 stands for the vector's
	/// element count.
	void SelectionDAG::ExtractVectorElements(SDValue Op,
	                                         SmallVectorImpl<SDValue> &Args,
	                                         unsigned Start, unsigned Count) {
	  const EVT VecVT = Op.getValueType();
	  const unsigned NumToExtract =
	      Count == 0 ? VecVT.getVectorNumElements() : Count;

	  const EVT EltVT = VecVT.getVectorElementType();
	  const EVT IdxTy = TLI->getVectorIdxTy(getDataLayout());
	  SDLoc SL(Op);
	  for (unsigned Idx = Start, End = Start + NumToExtract; Idx != End; ++Idx) {
	    SDValue IdxNode = getConstant(Idx, SL, IdxTy);
	    Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Op, IdxNode));
	  }
	}

	// getAddressSpace - Return the address space of the referenced global's
	// type.
	unsigned GlobalAddressSDNode::getAddressSpace() const {
	  const auto *GlobalTy = getGlobal()->getType();
	  return GlobalTy->getAddressSpace();
	}

	/// Return the type of the pooled value, taken from whichever entry kind
	/// (machine-specific or plain constant) this node holds.
	Type *ConstantPoolSDNode::getType() const {
	  return isMachineConstantPoolEntry() ? Val.MachineCPVal->getType()
	                                      : Val.ConstVal->getType();
	}

	/// Check whether this build vector is made only of constants and undefs
	/// and, if so, find the smallest element size (at least MinSplatBits and at
	/// least 8 bits) whose repetition reproduces the vector. On success
	/// SplatValue holds the repeated bits, SplatUndef the bits that come from
	/// undef elements, SplatBitSize the splat width and HasAnyUndefs whether any
	/// element was undef. Returns false if MinSplatBits exceeds the vector width
	/// or an operand is neither a constant nor undef.
	bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
	                                        unsigned &SplatBitSize,
	                                        bool &HasAnyUndefs,
	                                        unsigned MinSplatBits,
	                                        bool IsBigEndian) const {
	  EVT VT = getValueType(0);
	  assert(VT.isVector() && "Expected a vector type");
	  unsigned VecWidth = VT.getSizeInBits();
	  if (MinSplatBits > VecWidth)
	    return false;

	  // FIXME: The widths are based on this node's type, but build vectors can
	  // truncate their operands.
	  SplatValue = APInt(VecWidth, 0);
	  SplatUndef = APInt(VecWidth, 0);

	  // Get the bits. Bits with undefined values (when the corresponding element
	  // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
	  // in SplatValue. If any of the values are not constant, give up and return
	  // false.
	  unsigned int NumOps = getNumOperands();
	  assert(NumOps > 0 && "isConstantSplat has 0-size build vector");
	  unsigned EltWidth = VT.getScalarSizeInBits();

	  for (unsigned j = 0; j < NumOps; ++j) {
	    // On big-endian layouts the operands are consumed in reverse order.
	    unsigned i = IsBigEndian ? NumOps - 1 - j : j;
	    SDValue OpVal = getOperand(i);
	    unsigned BitPos = j * EltWidth;

	    if (OpVal.isUndef())
	      SplatUndef.setBits(BitPos, BitPos + EltWidth);
	    else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal))
	      SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
	    else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal))
	      SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
	    else
	      return false;
	  }

	  // The build_vector is all constants or undefs. Find the smallest element
	  // size that splats the vector.
	  HasAnyUndefs = (SplatUndef != 0);

	  // FIXME: This does not work for vectors with elements less than 8 bits.
	  while (VecWidth > 8) {
	    unsigned HalfSize = VecWidth / 2;
	    APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
	    APInt LowValue = SplatValue.trunc(HalfSize);
	    APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
	    APInt LowUndef = SplatUndef.trunc(HalfSize);

	    // If the two halves do not match (ignoring undef bits), stop here.
	    if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
	        MinSplatBits > HalfSize)
	      break;

	    // Merge the halves: a bit stays undef only if it is undef in both.
	    SplatValue = HighValue | LowValue;
	    SplatUndef = HighUndef & LowUndef;

	    VecWidth = HalfSize;
	  }

	  SplatBitSize = VecWidth;
	  return true;
	}

	/// Return the value shared by all non-undef operands, or an empty SDValue
	/// if two non-undef operands differ. When every operand is undef, operand 0
	/// is returned. If UndefElements is non-null it is reset to one bit per
	/// operand and the bits of the undef operands visited are set.
	SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
	  const unsigned NumOps = getNumOperands();
	  if (UndefElements) {
	    UndefElements->clear();
	    UndefElements->resize(NumOps);
	  }

	  SDValue Splatted;
	  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
	    SDValue Op = getOperand(Idx);
	    if (Op.isUndef()) {
	      if (UndefElements)
	        (*UndefElements)[Idx] = true;
	      continue;
	    }
	    if (!Splatted)
	      Splatted = Op; // First defined operand becomes the candidate.
	    else if (Splatted != Op)
	      return SDValue();
	  }

	  if (Splatted)
	    return Splatted;

	  assert(getOperand(0).isUndef() &&
	         "Can only have a splat without a constant for all undefs.");
	  return getOperand(0);
	}

	/// Return the splatted value as an integer constant node, or null if there
	/// is no splat value or it is not a ConstantSDNode.
	ConstantSDNode *
	BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
	  SDValue Splat = getSplatValue(UndefElements);
	  return dyn_cast_or_null<ConstantSDNode>(Splat);
	}

	/// Return the splatted value as a floating-point constant node, or null if
	/// there is no splat value or it is not a ConstantFPSDNode.
	ConstantFPSDNode *
	BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
	  SDValue Splat = getSplatValue(UndefElements);
	  return dyn_cast_or_null<ConstantFPSDNode>(Splat);
	}

	/// If this build vector splats a floating-point constant that converts
	/// exactly to an integer of BitWidth bits, return that integer's
	/// exactLogBase2(); return -1 when there is no FP splat or the conversion is
	/// not exact.
	/// NOTE(review): exactLogBase2() presumably yields -1 for values that are
	/// not a power of two - confirm against APInt.
	int32_t
	BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
	                                                   uint32_t BitWidth) const {
	  if (ConstantFPSDNode *CN =
	          dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
	    bool IsExact;
	    APSInt IntVal(BitWidth);
	    const APFloat &APF = CN->getValueAPF();
	    // Reject values whose conversion fails or drops a fractional part.
	    if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
	            APFloat::opOK ||
	        !IsExact)
	      return -1;

	    return IntVal.exactLogBase2();
	  }
	  return -1;
	}

	bool BuildVectorSDNode::isConstant() const {
	for (const SDValue &Op : op_values()) {
	unsigned Opc = Op.getOpcode();
	if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP)
	return false;
	}
	return true;
	}

	/// Return true if every non-negative entry among the first
	/// VT.getVectorNumElements() entries of Mask has the same value. Negative
	/// entries (undef) are ignored; at least one entry must be non-negative.
	bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
	  const unsigned NumElts = VT.getVectorNumElements();

	  // Find the first non-undef value in the shuffle mask.
	  unsigned Pos = 0;
	  while (Pos != NumElts && Mask[Pos] < 0)
	    ++Pos;

	  assert(Pos != NumElts && "VECTOR_SHUFFLE node with all undef indices!");

	  // Make sure all remaining elements are either undef or the same as the first
	  // non-undef value.
	  const int SplatIdx = Mask[Pos];
	  for (; Pos != NumElts; ++Pos) {
	    const int Entry = Mask[Pos];
	    if (Entry >= 0 && Entry != SplatIdx)
	      return false;
	  }
	  return true;
	}

	// \brief Returns N's node if it is a constant integer, a build vector of
	// constant integers, or a GlobalAddress for which the target reports offset
	// folding as legal; returns null otherwise.
	SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
	  SDNode *Node = N.getNode();
	  if (isa<ConstantSDNode>(N) || ISD::isBuildVectorOfConstantSDNodes(Node))
	    return Node;

	  // Treat a GlobalAddress supporting constant offset folding as a
	  // constant integer.
	  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N)) {
	    const bool Foldable = GA->getOpcode() == ISD::GlobalAddress &&
	                          TLI->isOffsetFoldingLegal(GA);
	    if (Foldable)
	      return GA;
	  }
	  return nullptr;
	}

	/// Returns N's node if it is a floating-point constant or a build vector of
	/// floating-point constants; returns null otherwise.
	SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
	  SDNode *Node = N.getNode();
	  const bool IsFPConstant =
	      isa<ConstantFPSDNode>(N) || ISD::isBuildVectorOfConstantFPSDNodes(Node);
	  return IsFPConstant ? Node : nullptr;
	}

	#ifndef NDEBUG
	/// Depth-first walk over the operands of N that aborts when a node is
	/// reached again while it is still on the current walk, i.e. on a cycle.
	/// Visited holds the nodes on the current path; Checked holds nodes whose
	/// operand trees were already fully verified.
	static void checkForCyclesHelper(const SDNode *N,
	                                 SmallPtrSetImpl<const SDNode*> &Visited,
	                                 SmallPtrSetImpl<const SDNode*> &Checked,
	                                 const llvm::SelectionDAG *DAG) {
	  // Already verified earlier: nothing to do.
	  if (Checked.count(N))
	    return;

	  // Seeing a node that is still on the active path means we looped back.
	  const bool FirstVisit = Visited.insert(N).second;
	  if (!FirstVisit) {
	    errs() << "Detected cycle in SelectionDAG\n";
	    dbgs() << "Offending node:\n";
	    N->dumprFull(DAG); dbgs() << "\n";
	    abort();
	  }

	  for (const SDValue &Operand : N->op_values())
	    checkForCyclesHelper(Operand.getNode(), Visited, Checked, DAG);

	  // N is done: remember it as verified and take it off the active path.
	  Checked.insert(N);
	  Visited.erase(N);
	}
	#endif

	/// Check the operand graph reachable from N for cycles. Does nothing when
	/// NDEBUG is defined. Otherwise the check runs when force is set, or
	/// unconditionally when EXPENSIVE_CHECKS is defined.
	void llvm::checkForCycles(const llvm::SDNode *N,
	                          const llvm::SelectionDAG *DAG,
	                          bool force) {
	#ifndef NDEBUG
	  bool ShouldCheck = force;
	#ifdef EXPENSIVE_CHECKS
	  ShouldCheck = true;
	#endif // EXPENSIVE_CHECKS
	  if (!ShouldCheck)
	    return;

	  assert(N && "Checking nonexistent SDNode");
	  SmallPtrSet<const SDNode*, 32> OnPath;
	  SmallPtrSet<const SDNode*, 32> Done;
	  checkForCyclesHelper(N, OnPath, Done, DAG);
	#endif // !NDEBUG
	}

	/// Check the whole DAG for cycles, starting from its root node.
	void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
	  const llvm::SDNode *RootNode = DAG->getRoot().getNode();
	  checkForCycles(RootNode, DAG, force);
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp (revision 328362)
	@@ -1,133 +1,152 @@
	//==- llvm/CodeGen/SelectionDAGAddressAnalysis.cpp - DAG Address Analysis --==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/Support/Casting.h"
	#include <cstdint>

	using namespace llvm;

	/// Reports whether this decomposed address and Other share a common base and
	/// index, so that they differ only by a constant.
	///
	/// Returns false immediately (leaving Off untouched) if either Base is a null
	/// SDValue. Otherwise Off is always overwritten: it starts as
	/// Other.Offset - Offset and, when the bases match as global addresses,
	/// constant-pool entries or fixed frame indexes, is further adjusted by the
	/// difference of the two bases' own offsets. Callers should presumably rely
	/// on Off only when true is returned.
	bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
	const SelectionDAG &DAG, int64_t &Off) {
	+ // Conservatively fail if a match failed.
	+ if (!Base.getNode() \|\| !Other.Base.getNode())
	+ return false;
	// Initial Offset difference.
	Off = Other.Offset - Offset;

	if ((Other.Index == Index) && (Other.IsIndexSignExt == IsIndexSignExt)) {
	// Trivial match.
	if (Other.Base == Base)
	return true;

	// Match GlobalAddresses
	if (auto *A = dyn_cast<GlobalAddressSDNode>(Base))
	if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base))
	if (A->getGlobal() == B->getGlobal()) {
	Off += B->getOffset() - A->getOffset();
	return true;
	}

	// Match Constants
	if (auto *A = dyn_cast<ConstantPoolSDNode>(Base))
	if (auto *B = dyn_cast<ConstantPoolSDNode>(Other.Base)) {
	// Both entries must be of the same kind (machine vs. plain constant)
	// before their values can be compared.
	bool IsMatch =
	A->isMachineConstantPoolEntry() == B->isMachineConstantPoolEntry();
	if (IsMatch) {
	if (A->isMachineConstantPoolEntry())
	IsMatch = A->getMachineCPVal() == B->getMachineCPVal();
	else
	IsMatch = A->getConstVal() == B->getConstVal();
	}
	if (IsMatch) {
	Off += B->getOffset() - A->getOffset();
	return true;
	}
	}

	const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();

	// Match non-equal FrameIndexes - If both frame indices are fixed
	// we know their relative offsets and can compare them. Otherwise
	// we must be conservative.
	if (auto *A = dyn_cast<FrameIndexSDNode>(Base))
	if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base))
	if (MFI.isFixedObjectIndex(A->getIndex()) &&
	MFI.isFixedObjectIndex(B->getIndex())) {
	Off += MFI.getObjectOffset(B->getIndex()) -
	MFI.getObjectOffset(A->getIndex());
	return true;
	}
	}
	return false;
	}

	/// Parses the address tree of N's base pointer into base, index and constant
	/// offset parts.
	///
	/// For PRE_INC / PRE_DEC addressing modes the constant offset operand of N is
	/// added to / subtracted from the result; if that operand is not a constant,
	/// a BaseIndexOffset holding null Base and Index values is returned.
	-BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
	+BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
	+ const SelectionDAG &DAG) {
	+ SDValue Ptr = N->getBasePtr();
	+
	// (((B + I*M) + c)) + c ...
	SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
	SDValue Index = SDValue();
	int64_t Offset = 0;
	bool IsIndexSignExt = false;
	+
	+ // pre-inc/pre-dec ops are components of EA.
	+ if (N->getAddressingMode() == ISD::PRE_INC) {
	+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
	+ Offset += C->getSExtValue();
	+ else // If unknown, give up now.
	+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
	+ } else if (N->getAddressingMode() == ISD::PRE_DEC) {
	+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
	+ Offset -= C->getSExtValue();
	+ else // If unknown, give up now.
	+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
	+ }

	// Consume constant adds & ors with appropriate masking.
	while (Base->getOpcode() == ISD::ADD \|\| Base->getOpcode() == ISD::OR) {
	if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
	// Only consider ORs which act as adds.
	if (Base->getOpcode() == ISD::OR &&
	!DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue()))
	break;
	Offset += C->getSExtValue();
	Base = Base->getOperand(0);
	continue;
	}
	break;
	}

	if (Base->getOpcode() == ISD::ADD) {
	// TODO: The following code appears to be needless as it just
	// bails on some Ptrs early, reducing the cases where we
	// find equivalence. We should be able to remove this.
	// Inside a loop the current BASE pointer is calculated using an ADD and a
	// MUL instruction. In this case Base is the actual BASE pointer.
	// (i64 add (i64 %array_ptr)
	// (i64 mul (i64 %induction_var)
	// (i64 %element_size)))
	if (Base->getOperand(1)->getOpcode() == ISD::MUL)
	return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);

	// Look at Base + Index + Offset cases.
	Index = Base->getOperand(1);
	SDValue PotentialBase = Base->getOperand(0);

	// Skip signextends.
	if (Index->getOpcode() == ISD::SIGN_EXTEND) {
	Index = Index->getOperand(0);
	IsIndexSignExt = true;
	}

	// Check if Index Offset pattern
	if (Index->getOpcode() != ISD::ADD \|\|
	!isa<ConstantSDNode>(Index->getOperand(1)))
	return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt);

	// The index is itself (X + constant): fold the constant into Offset and
	// re-evaluate the sign-extension flag on the remaining X.
	Offset += cast<ConstantSDNode>(Index->getOperand(1))->getSExtValue();
	Index = Index->getOperand(0);
	if (Index->getOpcode() == ISD::SIGN_EXTEND) {
	Index = Index->getOperand(0);
	IsIndexSignExt = true;
	} else
	IsIndexSignExt = false;
	Base = PotentialBase;
	}
	return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
	}
	Index: vendor/llvm/dist-release_60/lib/CodeGen/TargetLoweringBase.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/CodeGen/TargetLoweringBase.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/CodeGen/TargetLoweringBase.cpp (revision 328362)
	@@ -1,1800 +1,1809 @@
	//===- TargetLoweringBase.cpp - Implement the TargetLoweringBase class ----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements the TargetLoweringBase class.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetMachine.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <iterator>
	#include <string>
	#include <tuple>
	#include <utility>

	using namespace llvm;

	/// Hidden "jump-is-expensive" flag (default false). Its value seeds
	/// JumpIsExpensive in the constructor, and an explicit occurrence on the
	/// command line makes setJumpIsExpensive() ignore later requests.
	static cl::opt<bool> JumpIsExpensiveOverride(
	"jump-is-expensive", cl::init(false),
	cl::desc("Do not create extra branches to split comparison logic."),
	cl::Hidden);

	/// Hidden "min-jump-table-entries" flag; defaults to 4.
	static cl::opt<unsigned> MinimumJumpTableEntries
	("min-jump-table-entries", cl::init(4), cl::Hidden,
	cl::desc("Set minimum number of entries to use a jump table."));

	/// Hidden "max-jump-table-size" flag; the default 0 means no limit.
	static cl::opt<unsigned> MaximumJumpTableSize
	("max-jump-table-size", cl::init(0), cl::Hidden,
	cl::desc("Set maximum size of jump tables; zero for no limit."));

	/// Minimum jump table density for normal functions (hidden flag, default 10).
	static cl::opt<unsigned>
	JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
	cl::desc("Minimum density for building a jump table in "
	"a normal function"));

	/// Minimum jump table density for -Os or -Oz functions (hidden flag,
	/// default 40).
	static cl::opt<unsigned> OptsizeJumpTableDensity(
	"optsize-jump-table-density", cl::init(40), cl::Hidden,
	cl::desc("Minimum density for building a jump table in "
	"an optsize function"));

	// Decides, for a Darwin triple, whether the sincos_stret entry points may be
	// used. The caller must pass a Darwin triple (asserted).
	static bool darwinHasSinCos(const Triple &TT) {
	  assert(TT.isOSDarwin() && "should be called with darwin triple");

	  // 32-bit x86 is never considered.
	  if (TT.getArch() == Triple::x86)
	    return false;

	  if (TT.isMacOSX()) {
	    // sincos_stret first appears in macOS 10.9, and only 64-bit
	    // architectures qualify.
	    if (TT.isMacOSXVersionLT(10, 9))
	      return false;
	    return TT.isArch64Bit();
	  }

	  if (TT.isiOS()) {
	    // sincos_stret first appears in iOS 7.0.
	    const bool TooOld = TT.isOSVersionLT(7, 0);
	    return !TooOld;
	  }

	  // Every remaining Darwin flavour (WatchOS/TvOS, ...) is recent enough.
	  return true;
	}

	// Although this default value is arbitrary, it is not random. It is assumed
	// that a condition that evaluates the same way by a higher percentage than this
	// is best represented as control flow. Therefore, the default value N should be
	// set such that the win from N% correct executions is greater than the loss
	// from (100 - N)% mispredicted executions for the majority of intended targets.
	//
	// Exposed as the hidden "min-predictable-branch" option; the default is 99.
	static cl::opt<int> MinPercentageForPredictableBranch(
	"min-predictable-branch", cl::init(99),
	cl::desc("Minimum percentage (0-100) that a condition must be either true "
	"or false to assume that the condition is predictable"),
	cl::Hidden);

	/// Fills in the runtime-library call tables for the target triple TT.
	///
	/// Every (code, name) pair listed in RuntimeLibcalls.def is registered via
	/// setLibcallName, every libcall below RTLIB::UNKNOWN_LIBCALL gets
	/// CallingConv::C, and a few names and calling conventions are then
	/// overridden depending on the OS / environment / architecture of TT.
	void TargetLoweringBase::InitLibcalls(const Triple &TT) {
	#define HANDLE_LIBCALL(code, name) \
	setLibcallName(RTLIB::code, name);
	#include "llvm/CodeGen/RuntimeLibcalls.def"
	#undef HANDLE_LIBCALL
	// Initialize calling conventions to their default.
	for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
	setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);

	// A few names are different on particular architectures or environments.
	if (TT.isOSDarwin()) {
	// For f16/f32 conversions, Darwin uses the standard naming scheme, instead
	// of the gnueabi-style __gnu_*_ieee.
	// FIXME: What about other targets?
	setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
	setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");

	- // Darwin 10 and higher has an optimized __bzero.
	- if (!TT.isMacOSX() \|\| !TT.isMacOSXVersionLT(10, 6) \|\| TT.isArch64Bit()) {
	- setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero");
	+ // Some darwins have an optimized __bzero/bzero function.
	+ switch (TT.getArch()) {
	+ case Triple::x86:
	+ case Triple::x86_64:
	+ if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
	+ setLibcallName(RTLIB::BZERO, "__bzero");
	+ break;
	+ case Triple::aarch64:
	+ setLibcallName(RTLIB::BZERO, "bzero");
	+ break;
	+ default:
	+ break;
	}

	if (darwinHasSinCos(TT)) {
	setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
	setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
	if (TT.isWatchABI()) {
	setLibcallCallingConv(RTLIB::SINCOS_STRET_F32,
	CallingConv::ARM_AAPCS_VFP);
	setLibcallCallingConv(RTLIB::SINCOS_STRET_F64,
	CallingConv::ARM_AAPCS_VFP);
	}
	}
	} else {
	setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
	setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
	}

	if (TT.isGNUEnvironment() \|\| TT.isOSFuchsia()) {
	setLibcallName(RTLIB::SINCOS_F32, "sincosf");
	setLibcallName(RTLIB::SINCOS_F64, "sincos");
	setLibcallName(RTLIB::SINCOS_F80, "sincosl");
	setLibcallName(RTLIB::SINCOS_F128, "sincosl");
	setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");
	}

	// On OpenBSD the stack-protector failure libcall name is cleared.
	if (TT.isOSOpenBSD()) {
	setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
	}
	}

	/// getFPEXT - Look up the FPEXT_* libcall for an extension from OpVT to
	/// RetVT. Type pairs without an entry yield UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
	  // Source type f16: only f32 is a known destination.
	  if (OpVT == MVT::f16) {
	    if (RetVT == MVT::f32)
	      return FPEXT_F16_F32;
	    return UNKNOWN_LIBCALL;
	  }

	  // Source type f32.
	  if (OpVT == MVT::f32) {
	    if (RetVT == MVT::f64)
	      return FPEXT_F32_F64;
	    if (RetVT == MVT::f128)
	      return FPEXT_F32_F128;
	    if (RetVT == MVT::ppcf128)
	      return FPEXT_F32_PPCF128;
	    return UNKNOWN_LIBCALL;
	  }

	  // Source type f64.
	  if (OpVT == MVT::f64) {
	    if (RetVT == MVT::f128)
	      return FPEXT_F64_F128;
	    if (RetVT == MVT::ppcf128)
	      return FPEXT_F64_PPCF128;
	  }

	  return UNKNOWN_LIBCALL;
	}

	/// getFPROUND - Look up the FPROUND_* libcall for a rounding from OpVT to
	/// RetVT. Type pairs without an entry yield UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
	  // Destination type f16.
	  if (RetVT == MVT::f16) {
	    if (OpVT == MVT::f32)
	      return FPROUND_F32_F16;
	    if (OpVT == MVT::f64)
	      return FPROUND_F64_F16;
	    if (OpVT == MVT::f80)
	      return FPROUND_F80_F16;
	    if (OpVT == MVT::f128)
	      return FPROUND_F128_F16;
	    if (OpVT == MVT::ppcf128)
	      return FPROUND_PPCF128_F16;
	    return UNKNOWN_LIBCALL;
	  }

	  // Destination type f32.
	  if (RetVT == MVT::f32) {
	    if (OpVT == MVT::f64)
	      return FPROUND_F64_F32;
	    if (OpVT == MVT::f80)
	      return FPROUND_F80_F32;
	    if (OpVT == MVT::f128)
	      return FPROUND_F128_F32;
	    if (OpVT == MVT::ppcf128)
	      return FPROUND_PPCF128_F32;
	    return UNKNOWN_LIBCALL;
	  }

	  // Destination type f64.
	  if (RetVT == MVT::f64) {
	    if (OpVT == MVT::f80)
	      return FPROUND_F80_F64;
	    if (OpVT == MVT::f128)
	      return FPROUND_F128_F64;
	    if (OpVT == MVT::ppcf128)
	      return FPROUND_PPCF128_F64;
	  }

	  return UNKNOWN_LIBCALL;
	}

	/// getFPTOSINT - Look up the FPTOSINT_* libcall converting the floating-point
	/// type OpVT to the integer type RetVT. Type pairs without an entry yield
	/// UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
	  if (OpVT == MVT::f32) {
	    if (RetVT == MVT::i32)
	      return FPTOSINT_F32_I32;
	    if (RetVT == MVT::i64)
	      return FPTOSINT_F32_I64;
	    if (RetVT == MVT::i128)
	      return FPTOSINT_F32_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::f64) {
	    if (RetVT == MVT::i32)
	      return FPTOSINT_F64_I32;
	    if (RetVT == MVT::i64)
	      return FPTOSINT_F64_I64;
	    if (RetVT == MVT::i128)
	      return FPTOSINT_F64_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::f80) {
	    if (RetVT == MVT::i32)
	      return FPTOSINT_F80_I32;
	    if (RetVT == MVT::i64)
	      return FPTOSINT_F80_I64;
	    if (RetVT == MVT::i128)
	      return FPTOSINT_F80_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::f128) {
	    if (RetVT == MVT::i32)
	      return FPTOSINT_F128_I32;
	    if (RetVT == MVT::i64)
	      return FPTOSINT_F128_I64;
	    if (RetVT == MVT::i128)
	      return FPTOSINT_F128_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::ppcf128) {
	    if (RetVT == MVT::i32)
	      return FPTOSINT_PPCF128_I32;
	    if (RetVT == MVT::i64)
	      return FPTOSINT_PPCF128_I64;
	    if (RetVT == MVT::i128)
	      return FPTOSINT_PPCF128_I128;
	  }

	  return UNKNOWN_LIBCALL;
	}

	/// getFPTOUINT - Look up the FPTOUINT_* libcall converting the floating-point
	/// type OpVT to the integer type RetVT. Type pairs without an entry yield
	/// UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
	  if (OpVT == MVT::f32) {
	    if (RetVT == MVT::i32)
	      return FPTOUINT_F32_I32;
	    if (RetVT == MVT::i64)
	      return FPTOUINT_F32_I64;
	    if (RetVT == MVT::i128)
	      return FPTOUINT_F32_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::f64) {
	    if (RetVT == MVT::i32)
	      return FPTOUINT_F64_I32;
	    if (RetVT == MVT::i64)
	      return FPTOUINT_F64_I64;
	    if (RetVT == MVT::i128)
	      return FPTOUINT_F64_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::f80) {
	    if (RetVT == MVT::i32)
	      return FPTOUINT_F80_I32;
	    if (RetVT == MVT::i64)
	      return FPTOUINT_F80_I64;
	    if (RetVT == MVT::i128)
	      return FPTOUINT_F80_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::f128) {
	    if (RetVT == MVT::i32)
	      return FPTOUINT_F128_I32;
	    if (RetVT == MVT::i64)
	      return FPTOUINT_F128_I64;
	    if (RetVT == MVT::i128)
	      return FPTOUINT_F128_I128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::ppcf128) {
	    if (RetVT == MVT::i32)
	      return FPTOUINT_PPCF128_I32;
	    if (RetVT == MVT::i64)
	      return FPTOUINT_PPCF128_I64;
	    if (RetVT == MVT::i128)
	      return FPTOUINT_PPCF128_I128;
	  }

	  return UNKNOWN_LIBCALL;
	}

	/// getSINTTOFP - Look up the SINTTOFP_* libcall converting the integer type
	/// OpVT to the floating-point type RetVT. Type pairs without an entry yield
	/// UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
	  if (OpVT == MVT::i32) {
	    if (RetVT == MVT::f32)
	      return SINTTOFP_I32_F32;
	    if (RetVT == MVT::f64)
	      return SINTTOFP_I32_F64;
	    if (RetVT == MVT::f80)
	      return SINTTOFP_I32_F80;
	    if (RetVT == MVT::f128)
	      return SINTTOFP_I32_F128;
	    if (RetVT == MVT::ppcf128)
	      return SINTTOFP_I32_PPCF128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::i64) {
	    if (RetVT == MVT::f32)
	      return SINTTOFP_I64_F32;
	    if (RetVT == MVT::f64)
	      return SINTTOFP_I64_F64;
	    if (RetVT == MVT::f80)
	      return SINTTOFP_I64_F80;
	    if (RetVT == MVT::f128)
	      return SINTTOFP_I64_F128;
	    if (RetVT == MVT::ppcf128)
	      return SINTTOFP_I64_PPCF128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::i128) {
	    if (RetVT == MVT::f32)
	      return SINTTOFP_I128_F32;
	    if (RetVT == MVT::f64)
	      return SINTTOFP_I128_F64;
	    if (RetVT == MVT::f80)
	      return SINTTOFP_I128_F80;
	    if (RetVT == MVT::f128)
	      return SINTTOFP_I128_F128;
	    if (RetVT == MVT::ppcf128)
	      return SINTTOFP_I128_PPCF128;
	  }

	  return UNKNOWN_LIBCALL;
	}

	/// getUINTTOFP - Look up the UINTTOFP_* libcall converting the integer type
	/// OpVT to the floating-point type RetVT. Type pairs without an entry yield
	/// UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
	  if (OpVT == MVT::i32) {
	    if (RetVT == MVT::f32)
	      return UINTTOFP_I32_F32;
	    if (RetVT == MVT::f64)
	      return UINTTOFP_I32_F64;
	    if (RetVT == MVT::f80)
	      return UINTTOFP_I32_F80;
	    if (RetVT == MVT::f128)
	      return UINTTOFP_I32_F128;
	    if (RetVT == MVT::ppcf128)
	      return UINTTOFP_I32_PPCF128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::i64) {
	    if (RetVT == MVT::f32)
	      return UINTTOFP_I64_F32;
	    if (RetVT == MVT::f64)
	      return UINTTOFP_I64_F64;
	    if (RetVT == MVT::f80)
	      return UINTTOFP_I64_F80;
	    if (RetVT == MVT::f128)
	      return UINTTOFP_I64_F128;
	    if (RetVT == MVT::ppcf128)
	      return UINTTOFP_I64_PPCF128;
	    return UNKNOWN_LIBCALL;
	  }

	  if (OpVT == MVT::i128) {
	    if (RetVT == MVT::f32)
	      return UINTTOFP_I128_F32;
	    if (RetVT == MVT::f64)
	      return UINTTOFP_I128_F64;
	    if (RetVT == MVT::f80)
	      return UINTTOFP_I128_F80;
	    if (RetVT == MVT::f128)
	      return UINTTOFP_I128_F128;
	    if (RetVT == MVT::ppcf128)
	      return UINTTOFP_I128_PPCF128;
	  }

	  return UNKNOWN_LIBCALL;
	}

	/// Maps an atomic opcode Opc and value type VT onto the matching SYNC_*
	/// libcall; the suffix _1/_2/_4/_8/_16 is chosen from i8/i16/i32/i64/i128.
	/// Any other opcode or type yields UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getSYNC(unsigned Opc, MVT VT) {
	// Emits one case of the opcode switch below: an inner switch over the
	// simple value type that picks the size-suffixed libcall.
	#define OP_TO_LIBCALL(Opcode, LibcallPrefix) \
	  case Opcode: \
	    switch (VT.SimpleTy) { \
	    case MVT::i8: \
	      return LibcallPrefix##_1; \
	    case MVT::i16: \
	      return LibcallPrefix##_2; \
	    case MVT::i32: \
	      return LibcallPrefix##_4; \
	    case MVT::i64: \
	      return LibcallPrefix##_8; \
	    case MVT::i128: \
	      return LibcallPrefix##_16; \
	    default: \
	      return UNKNOWN_LIBCALL; \
	    }

	  switch (Opc) {
	    OP_TO_LIBCALL(ISD::ATOMIC_SWAP, SYNC_LOCK_TEST_AND_SET)
	    OP_TO_LIBCALL(ISD::ATOMIC_CMP_SWAP, SYNC_VAL_COMPARE_AND_SWAP)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_ADD, SYNC_FETCH_AND_ADD)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_SUB, SYNC_FETCH_AND_SUB)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_AND, SYNC_FETCH_AND_AND)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_OR, SYNC_FETCH_AND_OR)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_XOR, SYNC_FETCH_AND_XOR)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_NAND, SYNC_FETCH_AND_NAND)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_MAX, SYNC_FETCH_AND_MAX)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_UMAX, SYNC_FETCH_AND_UMAX)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_MIN, SYNC_FETCH_AND_MIN)
	    OP_TO_LIBCALL(ISD::ATOMIC_LOAD_UMIN, SYNC_FETCH_AND_UMIN)
	  }

	#undef OP_TO_LIBCALL

	  // Opcode not listed above.
	  return UNKNOWN_LIBCALL;
	}

	/// Selects the MEMCPY_ELEMENT_UNORDERED_ATOMIC_<N> libcall for an element size
	/// of 1, 2, 4, 8 or 16; every other size yields UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
	  if (ElementSize == 1)
	    return MEMCPY_ELEMENT_UNORDERED_ATOMIC_1;
	  if (ElementSize == 2)
	    return MEMCPY_ELEMENT_UNORDERED_ATOMIC_2;
	  if (ElementSize == 4)
	    return MEMCPY_ELEMENT_UNORDERED_ATOMIC_4;
	  if (ElementSize == 8)
	    return MEMCPY_ELEMENT_UNORDERED_ATOMIC_8;
	  if (ElementSize == 16)
	    return MEMCPY_ELEMENT_UNORDERED_ATOMIC_16;
	  return UNKNOWN_LIBCALL;
	}

	/// Selects the MEMMOVE_ELEMENT_UNORDERED_ATOMIC_<N> libcall for an element
	/// size of 1, 2, 4, 8 or 16; every other size yields UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
	  if (ElementSize == 1)
	    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1;
	  if (ElementSize == 2)
	    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2;
	  if (ElementSize == 4)
	    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4;
	  if (ElementSize == 8)
	    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8;
	  if (ElementSize == 16)
	    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16;
	  return UNKNOWN_LIBCALL;
	}

	/// Selects the MEMSET_ELEMENT_UNORDERED_ATOMIC_<N> libcall for an element size
	/// of 1, 2, 4, 8 or 16; every other size yields UNKNOWN_LIBCALL.
	RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
	  if (ElementSize == 1)
	    return MEMSET_ELEMENT_UNORDERED_ATOMIC_1;
	  if (ElementSize == 2)
	    return MEMSET_ELEMENT_UNORDERED_ATOMIC_2;
	  if (ElementSize == 4)
	    return MEMSET_ELEMENT_UNORDERED_ATOMIC_4;
	  if (ElementSize == 8)
	    return MEMSET_ELEMENT_UNORDERED_ATOMIC_8;
	  if (ElementSize == 16)
	    return MEMSET_ELEMENT_UNORDERED_ATOMIC_16;
	  return UNKNOWN_LIBCALL;
	}

	/// InitCmpLibcallCCs - Set default comparison libcall CC.
	///
	/// \p CCs must point to at least RTLIB::UNKNOWN_LIBCALL elements. Every
	/// element is first set to ISD::SETCC_INVALID; the floating-point comparison
	/// libcalls listed below then receive their condition code.
	static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
	  // Assign element-wise. memset() would replicate the low byte of
	  // SETCC_INVALID into every byte of the array, which only yields
	  // SETCC_INVALID if ISD::CondCode is exactly one byte wide.
	  std::fill(CCs, CCs + RTLIB::UNKNOWN_LIBCALL, ISD::SETCC_INVALID);

	  // Ordered-equal.
	  CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
	  CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
	  CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
	  CCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
	  // Unordered-or-not-equal.
	  CCs[RTLIB::UNE_F32] = ISD::SETNE;
	  CCs[RTLIB::UNE_F64] = ISD::SETNE;
	  CCs[RTLIB::UNE_F128] = ISD::SETNE;
	  CCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
	  // Ordered greater-or-equal.
	  CCs[RTLIB::OGE_F32] = ISD::SETGE;
	  CCs[RTLIB::OGE_F64] = ISD::SETGE;
	  CCs[RTLIB::OGE_F128] = ISD::SETGE;
	  CCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
	  // Ordered less-than.
	  CCs[RTLIB::OLT_F32] = ISD::SETLT;
	  CCs[RTLIB::OLT_F64] = ISD::SETLT;
	  CCs[RTLIB::OLT_F128] = ISD::SETLT;
	  CCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
	  // Ordered less-or-equal.
	  CCs[RTLIB::OLE_F32] = ISD::SETLE;
	  CCs[RTLIB::OLE_F64] = ISD::SETLE;
	  CCs[RTLIB::OLE_F128] = ISD::SETLE;
	  CCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
	  // Ordered greater-than.
	  CCs[RTLIB::OGT_F32] = ISD::SETGT;
	  CCs[RTLIB::OGT_F64] = ISD::SETGT;
	  CCs[RTLIB::OGT_F128] = ISD::SETGT;
	  CCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
	  // Unordered.
	  CCs[RTLIB::UO_F32] = ISD::SETNE;
	  CCs[RTLIB::UO_F64] = ISD::SETNE;
	  CCs[RTLIB::UO_F128] = ISD::SETNE;
	  CCs[RTLIB::UO_PPCF128] = ISD::SETNE;
	  // Ordered.
	  CCs[RTLIB::O_F32] = ISD::SETEQ;
	  CCs[RTLIB::O_F64] = ISD::SETEQ;
	  CCs[RTLIB::O_F128] = ISD::SETEQ;
	  CCs[RTLIB::O_PPCF128] = ISD::SETEQ;
	}

	/// NOTE: The TargetMachine owns TLOF.
	///
	/// Resets the action tables, installs the default values for the lowering
	/// settings below, clears the libcall name table, and finally fills in
	/// libcall names and comparison condition codes for the target triple of tm.
	TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
	  initActions();

	  // Perform these initializations only once.

	  // Memset/memcpy/memmove store limits and memcmp load limits.
	  MaxLoadsPerMemcmp = 8;
	  MaxStoresPerMemmove = 8;
	  MaxStoresPerMemcpy = 8;
	  MaxStoresPerMemset = 8;
	  // The same limits for the OptSize variants.
	  MaxLoadsPerMemcmpOptSize = 4;
	  MaxStoresPerMemmoveOptSize = 4;
	  MaxStoresPerMemcpyOptSize = 4;
	  MaxStoresPerMemsetOptSize = 4;

	  // Boolean switches.
	  UseUnderscoreSetJmp = false;
	  UseUnderscoreLongJmp = false;
	  HasMultipleConditionRegisters = false;
	  HasExtractBitsInsn = false;
	  JumpIsExpensive = JumpIsExpensiveOverride; // Seeded from the command line.
	  PredictableSelectIsExpensive = false;
	  EnableExtLdPromotion = false;
	  HasFloatingPointExceptions = true;
	  SupportsUnalignedAtomics = false;

	  // Boolean content kinds and scheduling preference.
	  BooleanContents = UndefinedBooleanContent;
	  BooleanFloatContents = UndefinedBooleanContent;
	  BooleanVectorContents = UndefinedBooleanContent;
	  SchedPreferenceInfo = Sched::ILP;

	  // Registers, sizes and alignments.
	  StackPointerRegisterToSaveRestore = 0;
	  JumpBufSize = 0;
	  JumpBufAlignment = 0;
	  MinFunctionAlignment = 0;
	  PrefFunctionAlignment = 0;
	  PrefLoopAlignment = 0;
	  MinStackArgumentAlignment = 1;
	  GatherAllAliasesMaxDepth = 18;

	  // Atomics.
	  // TODO: the default will be switched to 0 in the next commit, along
	  // with the Target-specific changes necessary.
	  MaxAtomicSizeInBitsSupported = 1024;
	  MinCmpXchgSizeInBits = 0;

	  // Start from an empty name table, then let the triple-specific setup
	  // populate it.
	  std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
	            nullptr);
	  InitLibcalls(TM.getTargetTriple());
	  InitCmpLibcallCCs(CmpLibcallCCs);
	}

	/// Resets every legalization-action table to zero and then installs the
	/// target-independent default actions.
	///
	/// After the tables are cleared, a fixed set of operations is switched to
	/// Expand: per value type inside the loop over MVT::all_valuetypes(), and for
	/// a few specific (operation, type) pairs after it.
	void TargetLoweringBase::initActions() {
	// All operations default to being supported.
	memset(OpActions, 0, sizeof(OpActions));
	memset(LoadExtActions, 0, sizeof(LoadExtActions));
	memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
	memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
	memset(CondCodeActions, 0, sizeof(CondCodeActions));
	std::fill(std::begin(RegClassForVT), std::end(RegClassForVT), nullptr);
	std::fill(std::begin(TargetDAGCombineArray),
	std::end(TargetDAGCombineArray), 0);

	// Set default actions for various operations.
	for (MVT VT : MVT::all_valuetypes()) {
	// Default all indexed load / store to expand.
	for (unsigned IM = (unsigned)ISD::PRE_INC;
	IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
	setIndexedLoadAction(IM, VT, Expand);
	setIndexedStoreAction(IM, VT, Expand);
	}

	// Most backends expect to see the node which just returns the value loaded.
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);

	// These operations default to expand.
	setOperationAction(ISD::FGETSIGN, VT, Expand);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
	setOperationAction(ISD::FMINNUM, VT, Expand);
	setOperationAction(ISD::FMAXNUM, VT, Expand);
	setOperationAction(ISD::FMINNAN, VT, Expand);
	setOperationAction(ISD::FMAXNAN, VT, Expand);
	setOperationAction(ISD::FMAD, VT, Expand);
	setOperationAction(ISD::SMIN, VT, Expand);
	setOperationAction(ISD::SMAX, VT, Expand);
	setOperationAction(ISD::UMIN, VT, Expand);
	setOperationAction(ISD::UMAX, VT, Expand);
	setOperationAction(ISD::ABS, VT, Expand);

	// Overflow operations default to expand
	setOperationAction(ISD::SADDO, VT, Expand);
	setOperationAction(ISD::SSUBO, VT, Expand);
	setOperationAction(ISD::UADDO, VT, Expand);
	setOperationAction(ISD::USUBO, VT, Expand);
	setOperationAction(ISD::SMULO, VT, Expand);
	setOperationAction(ISD::UMULO, VT, Expand);

	// ADDCARRY operations default to expand
	setOperationAction(ISD::ADDCARRY, VT, Expand);
	setOperationAction(ISD::SUBCARRY, VT, Expand);
	setOperationAction(ISD::SETCCCARRY, VT, Expand);

	// These default to Expand so they will be expanded to CTLZ/CTTZ by default.
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);

	setOperationAction(ISD::BITREVERSE, VT, Expand);

	// These library functions default to expand.
	setOperationAction(ISD::FROUND, VT, Expand);
	setOperationAction(ISD::FPOWI, VT, Expand);

	// These operations default to expand for vector types.
	if (VT.isVector()) {
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
	}

	// For most targets @llvm.get.dynamic.area.offset just returns 0.
	setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand);
	}

	// Most targets ignore the @llvm.prefetch intrinsic.
	setOperationAction(ISD::PREFETCH, MVT::Other, Expand);

	// Most targets also ignore the @llvm.readcyclecounter intrinsic.
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Expand);

	// ConstantFP nodes default to expand. Targets can either change this to
	// Legal, in which case all fp constants are legal, or use isFPImmLegal()
	// to optimize expansions for certain constants.
	setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
	setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
	setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
	setOperationAction(ISD::ConstantFP, MVT::f80, Expand);
	setOperationAction(ISD::ConstantFP, MVT::f128, Expand);

	// These library functions default to expand.
	// (FROUND was already set to Expand for every value type in the loop above;
	// repeating it here for f32/f64/f128 is redundant but harmless.)
	for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
	setOperationAction(ISD::FLOG , VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP , VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FROUND, VT, Expand);
	}

	// Default ISD::TRAP to expand (which turns it into abort).
	setOperationAction(ISD::TRAP, MVT::Other, Expand);

	// On most systems, DEBUGTRAP and TRAP have no difference. The "Expand"
	// here is to inform DAG Legalizer to replace DEBUGTRAP with TRAP.
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
	}

	// Default scalar shift-amount type: an integer whose bit width is eight
	// times DL.getPointerSize(0). The EVT argument is ignored here.
	MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
	                                               EVT) const {
	  const unsigned ShiftAmountBits = 8 * DL.getPointerSize(0);
	  return MVT::getIntegerVT(ShiftAmountBits);
	}

	// Type used for the shift amount of a shift whose left-hand side has type
	// LHSTy: vectors use LHSTy itself, scalars defer to getScalarShiftAmountTy.
	EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy,
	                                         const DataLayout &DL) const {
	  assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
	  if (!LHSTy.isVector())
	    return getScalarShiftAmountTy(DL, LHSTy);
	  return LHSTy;
	}

	// Reports whether operation Op may trap. By default only the integer
	// division and remainder opcodes do. VT must be a legal type (asserted).
	bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
	  assert(isTypeLegal(VT));
	  switch (Op) {
	  case ISD::SDIV:
	  case ISD::UDIV:
	  case ISD::SREM:
	  case ISD::UREM:
	    return true;
	  default:
	    break;
	  }
	  return false;
	}

	// Records whether jumps are expensive, unless the "jump-is-expensive"
	// command-line option was given, in which case the option wins.
	void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
	  if (JumpIsExpensiveOverride.getNumOccurrences() != 0)
	    return; // Explicit command-line setting: ignore this request.
	  JumpIsExpensive = isExpensive;
	}

	/// Computes one legalization step for VT: the action to apply together with
	/// the type that results from it.
	///
	/// Simple types are answered from ValueTypeActions / TransformToType.
	/// Extended scalar integers are promoted to a rounded integer type or expanded
	/// to half their width. Extended vectors are scalarized (one element),
	/// widened, promoted element-wise, or split in half, tried in the order shown
	/// below.
	TargetLoweringBase::LegalizeKind
	TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
	// If this is a simple type, use the ComputeRegisterProp mechanism.
	if (VT.isSimple()) {
	MVT SVT = VT.getSimpleVT();
	assert((unsigned)SVT.SimpleTy < array_lengthof(TransformToType));
	MVT NVT = TransformToType[SVT.SimpleTy];
	LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);

	assert((LA == TypeLegal \|\| LA == TypeSoftenFloat \|\|
	ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger) &&
	"Promote may not follow Expand or Promote");

	// Splitting halves the element count; scalarizing yields the element type.
	if (LA == TypeSplitVector)
	return LegalizeKind(LA,
	EVT::getVectorVT(Context, SVT.getVectorElementType(),
	SVT.getVectorNumElements() / 2));
	if (LA == TypeScalarizeVector)
	return LegalizeKind(LA, SVT.getVectorElementType());
	return LegalizeKind(LA, NVT);
	}

	// Handle Extended Scalar Types.
	if (!VT.isVector()) {
	assert(VT.isInteger() && "Float types must be simple");
	unsigned BitSize = VT.getSizeInBits();
	// First promote to a power-of-two size, then expand if necessary.
	if (BitSize < 8 \|\| !isPowerOf2_32(BitSize)) {
	EVT NVT = VT.getRoundIntegerType(Context);
	assert(NVT != VT && "Unable to round integer VT");
	LegalizeKind NextStep = getTypeConversion(Context, NVT);
	// Avoid multi-step promotion.
	if (NextStep.first == TypePromoteInteger)
	return NextStep;
	// Return rounded integer type.
	return LegalizeKind(TypePromoteInteger, NVT);
	}

	return LegalizeKind(TypeExpandInteger,
	EVT::getIntegerVT(Context, VT.getSizeInBits() / 2));
	}

	// Handle vector types.
	unsigned NumElts = VT.getVectorNumElements();
	EVT EltVT = VT.getVectorElementType();

	// Vectors with only one element are always scalarized.
	if (NumElts == 1)
	return LegalizeKind(TypeScalarizeVector, EltVT);

	// Try to widen vector elements until the element type is a power of two and
	// promote it to a legal type later on, for example:
	// <3 x i8> -> <4 x i8> -> <4 x i32>
	if (EltVT.isInteger()) {
	// Vectors with a number of elements that is not a power of two are always
	// widened, for example <3 x i8> -> <4 x i8>.
	if (!VT.isPow2VectorType()) {
	NumElts = (unsigned)NextPowerOf2(NumElts);
	EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
	return LegalizeKind(TypeWidenVector, NVT);
	}

	// Examine the element type.
	LegalizeKind LK = getTypeConversion(Context, EltVT);

	// If type is to be expanded, split the vector.
	// <4 x i140> -> <2 x i140>
	if (LK.first == TypeExpandInteger)
	return LegalizeKind(TypeSplitVector,
	EVT::getVectorVT(Context, EltVT, NumElts / 2));

	// Promote the integer element types until a legal vector type is found
	// or until the element integer type is too big. If a legal type was not
	// found, fallback to the usual mechanism of widening/splitting the
	// vector.
	EVT OldEltVT = EltVT;
	while (true) {
	// Increase the bitwidth of the element to the next pow-of-two
	// (which is greater than 8 bits).
	EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits())
	.getRoundIntegerType(Context);

	// Stop trying when getting a non-simple element type.
	// Note that vector elements may be greater than legal vector element
	// types. Example: X86 XMM registers hold 64bit element on 32bit
	// systems.
	if (!EltVT.isSimple())
	break;

	// Build a new vector type and check if it is legal.
	MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
	// Found a legal promoted vector type.
	if (NVT != MVT() && ValueTypeActions.getTypeAction(NVT) == TypeLegal)
	return LegalizeKind(TypePromoteInteger,
	EVT::getVectorVT(Context, EltVT, NumElts));
	}

	// Reset the type to the unexpanded type if we did not find a legal vector
	// type with a promoted vector element type.
	EltVT = OldEltVT;
	}

	// Try to widen the vector until a legal type is found.
	// If there is no wider legal type, split the vector.
	while (true) {
	// Round up to the next power of 2.
	NumElts = (unsigned)NextPowerOf2(NumElts);

	// If there is no simple vector type with this many elements then there
	// cannot be a larger legal vector type. Note that this assumes that
	// there are no skipped intermediate vector types in the simple types.
	if (!EltVT.isSimple())
	break;
	MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
	if (LargerVector == MVT())
	break;

	// If this type is legal then widen the vector.
	if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
	return LegalizeKind(TypeWidenVector, LargerVector);
	}

	// Widen odd vectors to next power of two.
	if (!VT.isPow2VectorType()) {
	EVT NVT = VT.getPow2VectorType(Context);
	return LegalizeKind(TypeWidenVector, NVT);
	}

	// Vectors with illegal element types are expanded.
	// NumElts was modified by the widening loop above, so the original element
	// count is re-read from VT here.
	EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
	return LegalizeKind(TypeSplitVector, NVT);
	}

	/// Break a simple vector type down into legal pieces.
	///
	/// Outputs: \p NumIntermediates receives the number of pieces,
	/// \p IntermediateVT the type of each piece, and \p RegisterVT the register
	/// type \p TLI reports for a piece. The return value is the number of
	/// registers needed for the whole vector.
	static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
	                                          unsigned &NumIntermediates,
	                                          MVT &RegisterVT,
	                                          TargetLoweringBase *TLI) {
	  const MVT ScalarTy = VT.getVectorElementType();
	  unsigned LaneCount = VT.getVectorNumElements();
	  unsigned PieceCount = 1;

	  // FIXME: Non-power-of-2 element counts are not split into halves; they are
	  // broken straight down to one piece per element.
	  if (!isPowerOf2_32(LaneCount)) {
	    PieceCount = LaneCount;
	    LaneCount = 1;
	  }

	  // Halve the lane count (doubling the piece count) until the piece type is
	  // legal or only a single lane is left.
	  for (; LaneCount > 1 &&
	         !TLI->isTypeLegal(MVT::getVectorVT(ScalarTy, LaneCount));
	       LaneCount >>= 1)
	    PieceCount <<= 1;

	  NumIntermediates = PieceCount;

	  // Fall back to the bare element type when even the final vector type is
	  // not legal.
	  MVT PieceVT = MVT::getVectorVT(ScalarTy, LaneCount);
	  if (!TLI->isTypeLegal(PieceVT))
	    PieceVT = ScalarTy;
	  IntermediateVT = PieceVT;

	  // Round odd sizes up to a power of two (e.g. i33 -> 64 bits).
	  unsigned PieceBits = PieceVT.getSizeInBits();
	  if (!isPowerOf2_32(PieceBits))
	    PieceBits = NextPowerOf2(PieceBits);

	  const MVT RegVT = TLI->getRegisterType(PieceVT);
	  RegisterVT = RegVT;

	  // A piece wider than its register type is expanded over several registers
	  // (e.g. i64 -> i16); otherwise each piece occupies exactly one register.
	  const bool NeedsExpansion = EVT(RegVT).bitsLT(PieceVT);
	  return NeedsExpansion ? PieceCount * (PieceBits / RegVT.getSizeInBits())
	                        : PieceCount;
	}

	/// isLegalRC - Walk the value types listed for register class \p RC (the
	/// list is terminated by MVT::Other) and return true as soon as one of them
	/// is a legal type; return false if none is.
	bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI,
	                                   const TargetRegisterClass &RC) const {
	  auto TypeIt = TRI.legalclasstypes_begin(RC);
	  while (*TypeIt != MVT::Other) {
	    if (isTypeLegal(*TypeIt))
	      return true;
	    ++TypeIt;
	  }
	  return false;
	}

	/// Replace/modify any TargetFrameIndex operands with a target-dependent
	/// sequence of memory operands that is recognized by PrologEpilogInserter.
	///
	/// Each frame-index operand of \p InitialMI is expanded in turn. Every
	/// expansion builds a replacement instruction, inserts it in front of the
	/// current one and erases the current one.
	///
	/// \param InitialMI instruction whose frame-index operands are rewritten; it
	///                  is erased when at least one operand is rewritten.
	/// \param MBB       block the replacement instructions are inserted into.
	/// \returns \p MBB.
	MachineBasicBlock *
	TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
	                                   MachineBasicBlock *MBB) const {
	  MachineInstr *MI = &InitialMI;
	  MachineFunction &MF = *MI->getMF();
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  // We're handling multiple types of operands here:
	  // PATCHPOINT MetaArgs - live-in, read only, direct
	  // STATEPOINT Deopt Spill - live-through, read only, indirect
	  // STATEPOINT Deopt Alloca - live-through, read only, direct
	  // (We're currently conservative and mark the deopt slots read/write in
	  // practice.)
	  // STATEPOINT GC Spill - live-through, read/write, indirect
	  // STATEPOINT GC Alloca - live-through, read/write, direct
	  // The live-in vs live-through is handled already (the live through ones are
	  // all stack slots), but we need to handle the different type of stackmap
	  // operands and memory effects here.

	  // MI changes inside this loop as we grow operands.
	  for (unsigned OperIdx = 0; OperIdx != MI->getNumOperands(); ++OperIdx) {
	    MachineOperand &MO = MI->getOperand(OperIdx);
	    if (!MO.isFI())
	      continue;

	    // foldMemoryOperand builds a new MI after replacing a single FI operand
	    // with the canonical set of five x86 addressing-mode operands.
	    int FI = MO.getIndex();
	    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), MI->getDesc());

	    // Copy operands before the frame-index.
	    for (unsigned i = 0; i < OperIdx; ++i)
	      MIB.add(MI->getOperand(i));
	    // Add frame index operands recognized by stackmaps.cpp
	    if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
	      // indirect-mem-ref tag, size, #FI, offset.
	      // Used for spills inserted by StatepointLowering. This codepath is not
	      // used for patchpoints/stackmaps at all, for these spilling is done via
	      // foldMemoryOperand callback only.
	      assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");
	      MIB.addImm(StackMaps::IndirectMemRefOp);
	      MIB.addImm(MFI.getObjectSize(FI));
	      MIB.add(MI->getOperand(OperIdx));
	      MIB.addImm(0);
	    } else {
	      // direct-mem-ref tag, #FI, offset.
	      // Used by patchpoint, and direct alloca arguments to statepoints
	      MIB.addImm(StackMaps::DirectMemRefOp);
	      MIB.add(MI->getOperand(OperIdx));
	      MIB.addImm(0);
	    }
	    // Copy the operands after the frame index.
	    for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i)
	      MIB.add(MI->getOperand(i));

	    // Inherit previous memory operands.
	    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
	    assert(MIB->mayLoad() && "Folded a stackmap use to a non-load!");

	    // Add a new memory operand for this FI.
	    assert(MFI.getObjectOffset(FI) != -1);

	    // Every folded slot is a load; statepoint slots are additionally marked
	    // as stored-to and volatile.
	    auto Flags = MachineMemOperand::MOLoad;
	    if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
	      Flags |= MachineMemOperand::MOStore;
	      Flags |= MachineMemOperand::MOVolatile;
	    }
	    MachineMemOperand *MMO = MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, FI), Flags,
	        MF.getDataLayout().getPointerSize(), MFI.getObjectAlignment(FI));
	    MIB->addMemOperand(MF, MMO);

	    // Replace the instruction and update the operand index. The index is
	    // advanced past the operands that were just inserted (minus one because
	    // the loop header increments it again).
	    MBB->insert(MachineBasicBlock::iterator(MI), MIB);
	    OperIdx += (MIB->getNumOperands() - MI->getNumOperands()) - 1;
	    MI->eraseFromParent();
	    MI = MIB;
	  }
	  return MBB;
	}

	/// findRepresentativeClass - Return the largest legal super-reg register class
	/// of the register class for the specified type and its associated "cost".
	///
	/// \param TRI register info used to enumerate super-register classes and to
	///            query spill sizes.
	/// \param VT  simple value type whose register class is looked up.
	/// \returns {nullptr, 0} when \p VT has no register class; otherwise the
	///          chosen class paired with a cost of 1.
	// This function is in TargetLowering because it uses RegClassForVT which would
	// need to be moved to TargetRegisterInfo and would necessitate moving
	// isTypeLegal over as well - a massive change that would just require
	// TargetLowering having a TargetRegisterInfo class member that it would use.
	std::pair<const TargetRegisterClass *, uint8_t>
	TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
	                                            MVT VT) const {
	  const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
	  if (!RC)
	    return std::make_pair(RC, 0);

	  // Compute the set of all super-register classes.
	  BitVector SuperRegRC(TRI->getNumRegClasses());
	  for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI)
	    SuperRegRC.setBitsInMask(RCI.getMask());

	  // Find the first legal register class with the largest spill size.
	  const TargetRegisterClass *BestRC = RC;
	  for (unsigned i : SuperRegRC.set_bits()) {
	    const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
	    // We want the largest possible spill size. The strict comparison keeps
	    // the earliest candidate among classes with equal spill size.
	    if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC))
	      continue;
	    // isLegalRC takes both arguments by reference.
	    if (!isLegalRC(*TRI, *SuperRC))
	      continue;
	    BestRC = SuperRC;
	  }
	  return std::make_pair(BestRC, 1);
	}

	/// computeRegisterProperties - Once all of the register classes are added,
	/// this allows us to compute derived properties we expose.
	///
	/// Fills in, per simple value type: the number of registers
	/// (NumRegistersForVT), the register type (RegisterTypeForVT), the type the
	/// value is transformed to (TransformToType), the legalization action
	/// (ValueTypeActions) and the representative register class with its cost
	/// (RepRegClassForVT / RepRegClassCostForVT).
	///
	/// \param TRI register info passed on to findRepresentativeClass.
	void TargetLoweringBase::computeRegisterProperties(
	const TargetRegisterInfo *TRI) {
	static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
	"Too many value types for ValueTypeActions to hold!");

	// Everything defaults to needing one register.
	for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
	NumRegistersForVT[i] = 1;
	RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i;
	}
	// ...except isVoid, which doesn't need any registers.
	NumRegistersForVT[MVT::isVoid] = 0;

	// Find the largest integer register class.
	// Walk downward from the last integer value type until one with a register
	// class is found; the assert fires if the walk reaches i1 without a hit.
	unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
	for (; RegClassForVT[LargestIntReg] == nullptr; --LargestIntReg)
	assert(LargestIntReg != MVT::i1 && "No integer registers defined!");

	// Every integer value type larger than this largest register takes twice as
	// many registers to represent as the previous ValueType.
	for (unsigned ExpandedReg = LargestIntReg + 1;
	ExpandedReg <= MVT::LAST_INTEGER_VALUETYPE; ++ExpandedReg) {
	NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
	RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
	TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
	ValueTypeActions.setTypeAction((MVT::SimpleValueType)ExpandedReg,
	TypeExpandInteger);
	}

	// Inspect all of the ValueType's smaller than the largest integer
	// register to see which ones need promotion.
	// LegalIntReg tracks the most recently seen (i.e. nearest wider) legal
	// integer type while walking downward; illegal types are promoted to it.
	unsigned LegalIntReg = LargestIntReg;
	for (unsigned IntReg = LargestIntReg - 1;
	IntReg >= (unsigned)MVT::i1; --IntReg) {
	MVT IVT = (MVT::SimpleValueType)IntReg;
	if (isTypeLegal(IVT)) {
	LegalIntReg = IntReg;
	} else {
	RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
	(const MVT::SimpleValueType)LegalIntReg;
	ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
	}
	}

	// ppcf128 type is really two f64's.
	// If f64 is legal, expand into a pair of f64; otherwise soften to i128.
	if (!isTypeLegal(MVT::ppcf128)) {
	if (isTypeLegal(MVT::f64)) {
	NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
	RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
	TransformToType[MVT::ppcf128] = MVT::f64;
	ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
	} else {
	NumRegistersForVT[MVT::ppcf128] = NumRegistersForVT[MVT::i128];
	RegisterTypeForVT[MVT::ppcf128] = RegisterTypeForVT[MVT::i128];
	TransformToType[MVT::ppcf128] = MVT::i128;
	ValueTypeActions.setTypeAction(MVT::ppcf128, TypeSoftenFloat);
	}
	}

	// Decide how to handle f128. If the target does not have native f128 support,
	// expand it to i128 and we will be generating soft float library calls.
	if (!isTypeLegal(MVT::f128)) {
	NumRegistersForVT[MVT::f128] = NumRegistersForVT[MVT::i128];
	RegisterTypeForVT[MVT::f128] = RegisterTypeForVT[MVT::i128];
	TransformToType[MVT::f128] = MVT::i128;
	ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
	}

	// Decide how to handle f64. If the target does not have native f64 support,
	// expand it to i64 and we will be generating soft float library calls.
	if (!isTypeLegal(MVT::f64)) {
	NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
	RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
	TransformToType[MVT::f64] = MVT::i64;
	ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat);
	}

	// Decide how to handle f32. If the target does not have native f32 support,
	// expand it to i32 and we will be generating soft float library calls.
	if (!isTypeLegal(MVT::f32)) {
	NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
	RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
	TransformToType[MVT::f32] = MVT::i32;
	ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat);
	}

	// Decide how to handle f16. If the target does not have native f16 support,
	// promote it to f32, because there are no f16 library calls (except for
	// conversions).
	// Note: this copies the f32 entries, which the block above may have just
	// rewritten, so the f32 handling must stay ahead of this one.
	if (!isTypeLegal(MVT::f16)) {
	NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
	RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
	TransformToType[MVT::f16] = MVT::f32;
	ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
	}

	// Loop over all of the vector value types to see which need transformations.
	for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
	i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
	MVT VT = (MVT::SimpleValueType) i;
	// Legal vector types keep the defaults assigned at the top.
	if (isTypeLegal(VT))
	continue;

	MVT EltVT = VT.getVectorElementType();
	unsigned NElts = VT.getVectorNumElements();
	bool IsLegalWiderType = false;
	LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT);
	// The cases below deliberately fall through: promote, then widen, then
	// split/scalarize, each tried only if the previous one found no legal type.
	switch (PreferredAction) {
	case TypePromoteInteger:
	// Try to promote the elements of integer vectors. If no legal
	// promotion was found, fall through to the widen-vector method.
	// Only value types numbered after this one are considered.
	for (unsigned nVT = i + 1; nVT <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++nVT) {
	MVT SVT = (MVT::SimpleValueType) nVT;
	// Promote vectors of integers to vectors with the same number
	// of elements, with a wider element type.
	if (SVT.getScalarSizeInBits() > EltVT.getSizeInBits() &&
	SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)) {
	TransformToType[i] = SVT;
	RegisterTypeForVT[i] = SVT;
	NumRegistersForVT[i] = 1;
	ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
	IsLegalWiderType = true;
	break;
	}
	}
	if (IsLegalWiderType)
	break;
	LLVM_FALLTHROUGH;

	case TypeWidenVector:
	// Try to widen the vector.
	// Look for a later value type with the same element type, more elements,
	// and a legal action.
	for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
	MVT SVT = (MVT::SimpleValueType) nVT;
	if (SVT.getVectorElementType() == EltVT
	&& SVT.getVectorNumElements() > NElts && isTypeLegal(SVT)) {
	TransformToType[i] = SVT;
	RegisterTypeForVT[i] = SVT;
	NumRegistersForVT[i] = 1;
	ValueTypeActions.setTypeAction(VT, TypeWidenVector);
	IsLegalWiderType = true;
	break;
	}
	}
	if (IsLegalWiderType)
	break;
	LLVM_FALLTHROUGH;

	case TypeSplitVector:
	case TypeScalarizeVector: {
	// No legal wider type exists (or splitting/scalarizing was preferred):
	// break the vector down and record register count and register type.
	MVT IntermediateVT;
	MVT RegisterVT;
	unsigned NumIntermediates;
	NumRegistersForVT[i] = getVectorTypeBreakdownMVT(VT, IntermediateVT,
	NumIntermediates, RegisterVT, this);
	RegisterTypeForVT[i] = RegisterVT;

	MVT NVT = VT.getPow2VectorType();
	if (NVT == VT) {
	// Type is already a power of 2. The default action is to split.
	TransformToType[i] = MVT::Other;
	if (PreferredAction == TypeScalarizeVector)
	ValueTypeActions.setTypeAction(VT, TypeScalarizeVector);
	else if (PreferredAction == TypeSplitVector)
	ValueTypeActions.setTypeAction(VT, TypeSplitVector);
	else
	// Set type action according to the number of elements.
	ValueTypeActions.setTypeAction(VT, NElts == 1 ? TypeScalarizeVector
	: TypeSplitVector);
	} else {
	// Non-power-of-2 element count: widen to the power-of-2 vector type.
	TransformToType[i] = NVT;
	ValueTypeActions.setTypeAction(VT, TypeWidenVector);
	}
	break;
	}
	default:
	llvm_unreachable("Unknown vector legalization action!");
	}
	}

	// Determine the 'representative' register class for each value type.
	// A representative register class is the largest (meaning one which is
	// not a sub-register class / subreg register class) legal register class for
	// a group of value types. For example, on i386, i8, i16, and i32
	// representative would be GR32; while on x86_64 it's GR64.
	for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
	const TargetRegisterClass* RRC;
	uint8_t Cost;
	std::tie(RRC, Cost) = findRepresentativeClass(TRI, (MVT::SimpleValueType)i);
	RepRegClassForVT[i] = RRC;
	RepRegClassCostForVT[i] = Cost;
	}
	}

	/// Default SETCC result type: the pointer type for data layout \p DL.
	/// Vector operand types are not supported here and trip the assertion.
	EVT TargetLoweringBase::getSetCCResultType(const DataLayout &DL, LLVMContext &,
	                                           EVT VT) const {
	  assert(!VT.isVector() && "No default SetCC type for vectors!");
	  const MVT PointerVT = getPointerTy(DL);
	  return PointerVT.SimpleTy;
	}

	/// Return the value type produced by comparison libcalls. This base
	/// implementation always answers MVT::i32.
	MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const {
	  // The default value.
	  const MVT::SimpleValueType DefaultCmpTy = MVT::i32;
	  return DefaultCmpTy;
	}

	/// getVectorTypeBreakdown - Vector types are broken down into some number of
	/// legal first class types. For example, MVT::v8f32 maps to 2 MVT::v4f32
	/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
	/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
	///
	/// This method returns the number of registers needed, and the VT for each
	/// register. It also returns the VT and quantity of the intermediate values
	/// before they are promoted/expanded.
	///
	/// \param Context               context used to build extended value types.
	/// \param VT                    vector type to break down.
	/// \param[out] IntermediateVT   type of each intermediate value.
	/// \param[out] NumIntermediates number of intermediate values.
	/// \param[out] RegisterVT       register type of each register.
	/// \returns the number of registers needed for \p VT.
	unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
	                                                    EVT &IntermediateVT,
	                                                    unsigned &NumIntermediates,
	                                                    MVT &RegisterVT) const {
	  unsigned NumElts = VT.getVectorNumElements();

	  // If there is a wider vector type with the same element type as this one,
	  // or a promoted vector type that has the same number of elements which
	  // are wider, then we should convert to that legal vector type.
	  // This handles things like <2 x float> -> <4 x float> and
	  // <4 x i1> -> <4 x i32>.
	  LegalizeTypeAction TA = getTypeAction(Context, VT);
	  if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
	    EVT RegisterEVT = getTypeToTransformTo(Context, VT);
	    if (isTypeLegal(RegisterEVT)) {
	      IntermediateVT = RegisterEVT;
	      RegisterVT = RegisterEVT.getSimpleVT();
	      NumIntermediates = 1;
	      return 1;
	    }
	  }

	  // Figure out the right, legal destination reg to copy into.
	  EVT EltTy = VT.getVectorElementType();

	  unsigned NumVectorRegs = 1;

	  // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we
	  // could break down into LHS/RHS like LegalizeDAG does.
	  if (!isPowerOf2_32(NumElts)) {
	    NumVectorRegs = NumElts;
	    NumElts = 1;
	  }

	  // Divide the input until we get to a supported size. This will always
	  // end with a scalar if the target doesn't support vectors.
	  while (NumElts > 1 && !isTypeLegal(
	                            EVT::getVectorVT(Context, EltTy, NumElts))) {
	    NumElts >>= 1;
	    NumVectorRegs <<= 1;
	  }

	  NumIntermediates = NumVectorRegs;

	  // Fall back to the element type when the final vector type is not legal.
	  EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts);
	  if (!isTypeLegal(NewVT))
	    NewVT = EltTy;
	  IntermediateVT = NewVT;

	  MVT DestVT = getRegisterType(Context, NewVT);
	  RegisterVT = DestVT;
	  unsigned NewVTSize = NewVT.getSizeInBits();

	  // Convert sizes such as i33 to i64.
	  if (!isPowerOf2_32(NewVTSize))
	    NewVTSize = NextPowerOf2(NewVTSize);

	  if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
	    return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());

	  // Otherwise, promotion or legal types use the same number of registers as
	  // the vector decimated to the appropriate level.
	  return NumVectorRegs;
	}

	/// Get the EVTs and ArgFlags collections that represent the legalized return
	/// type of the given function. This does not require a DAG or a return value,
	/// and is suitable for use before any DAGs for the function are constructed.
	/// TODO: Move this out of TargetLowering.cpp.
	///
	/// \param ReturnType IR return type to describe.
	/// \param attr       attribute list; only the return-index attributes SExt,
	///                   ZExt and InReg are consulted.
	/// \param[out] Outs  receives one OutputArg per register part of every value
	///                   in \p ReturnType (entries are appended).
	/// \param TLI        lowering info used to compute value and register types.
	/// \param DL         data layout used to compute the value types.
	void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr,
	                         SmallVectorImpl<ISD::OutputArg> &Outs,
	                         const TargetLowering &TLI, const DataLayout &DL) {
	  SmallVector<EVT, 4> ValueVTs;
	  ComputeValueVTs(TLI, DL, ReturnType, ValueVTs);

	  // An empty ValueVTs simply yields no iterations and leaves Outs untouched.
	  // VT is taken by value because it may be widened below.
	  for (EVT VT : ValueVTs) {
	    ISD::NodeType ExtendKind = ISD::ANY_EXTEND;

	    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
	      ExtendKind = ISD::SIGN_EXTEND;
	    else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
	      ExtendKind = ISD::ZERO_EXTEND;

	    // FIXME: C calling convention requires the return type to be promoted to
	    // at least 32-bit. But this is not necessary for non-C calling
	    // conventions. The frontend should mark functions whose return values
	    // require promoting with signext or zeroext attributes.
	    if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
	      MVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32);
	      if (VT.bitsLT(MinVT))
	        VT = MinVT;
	    }

	    unsigned NumParts =
	        TLI.getNumRegistersForCallingConv(ReturnType->getContext(), VT);
	    MVT PartVT =
	        TLI.getRegisterTypeForCallingConv(ReturnType->getContext(), VT);

	    // 'inreg' on function refers to return value
	    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
	    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg))
	      Flags.setInReg();

	    // Propagate extension type if any
	    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
	      Flags.setSExt();
	    else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
	      Flags.setZExt();

	    for (unsigned i = 0; i < NumParts; ++i)
	      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isFixed=*/true, 0, 0));
	  }
	}

	/// getByValTypeAlignment - Desired alignment for ByVal aggregate function
	/// arguments in the caller parameter area, expressed as the alignment itself
	/// rather than its logarithm. This implementation forwards to the data
	/// layout's ABI alignment for \p Ty.
	unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty,
	                                                   const DataLayout &DL) const {
	  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
	  return ABIAlign;
	}

	/// Decide whether a memory access of type \p VT with the given alignment is
	/// allowed. An access that meets the data layout's ABI alignment is accepted
	/// and, when \p Fast is non-null, reported as fast; anything less aligned is
	/// delegated to allowsMisalignedMemoryAccesses.
	bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
	                                            const DataLayout &DL, EVT VT,
	                                            unsigned AddrSpace,
	                                            unsigned Alignment,
	                                            bool *Fast) const {
	  // TODO: While using the data layout works in practice, a better solution
	  // would be to implement this check directly (make this a virtual function).
	  // For example, the ABI alignment may change based on software platform while
	  // this function should only be affected by hardware implementation.
	  Type *MemTy = VT.getTypeForEVT(Context);
	  const bool MeetsABIAlign = Alignment >= DL.getABITypeAlignment(MemTy);

	  // Under-aligned: let the misaligned-access hook decide.
	  if (!MeetsABIAlign)
	    return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Fast);

	  // ABI-aligned accesses are assumed to be fast.
	  if (Fast)
	    *Fast = true;
	  return true;
	}

	/// Return MinPercentageForPredictableBranch expressed as a probability out
	/// of 100.
	BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
	  const BranchProbability Threshold(MinPercentageForPredictableBranch, 100);
	  return Threshold;
	}

	//===----------------------------------------------------------------------===//
	// TargetTransformInfo Helpers
	//===----------------------------------------------------------------------===//

	/// Map an IR instruction opcode to an ISD opcode.
	///
	/// \param Opcode numeric IR opcode, numbered as in llvm/IR/Instruction.def.
	/// \returns the ISD opcode listed for \p Opcode in the table below, or 0 for
	///          the opcodes the table maps to nothing. An opcode that matches no
	///          case reaches llvm_unreachable.
	int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
	// Build a local enum whose enumerators carry the opcode names and numbers
	// from Instruction.def, so the switch below can use the bare names.
	enum InstructionOpcodes {
	#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
	#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
	#include "llvm/IR/Instruction.def"
	};
	switch (static_cast<InstructionOpcodes>(Opcode)) {
	// Terminators and exception-handling pads: no ISD opcode.
	case Ret: return 0;
	case Br: return 0;
	case Switch: return 0;
	case IndirectBr: return 0;
	case Invoke: return 0;
	case Resume: return 0;
	case Unreachable: return 0;
	case CleanupRet: return 0;
	case CatchRet: return 0;
	case CatchPad: return 0;
	case CatchSwitch: return 0;
	case CleanupPad: return 0;
	// Binary arithmetic, shifts and bitwise logic.
	case Add: return ISD::ADD;
	case FAdd: return ISD::FADD;
	case Sub: return ISD::SUB;
	case FSub: return ISD::FSUB;
	case Mul: return ISD::MUL;
	case FMul: return ISD::FMUL;
	case UDiv: return ISD::UDIV;
	case SDiv: return ISD::SDIV;
	case FDiv: return ISD::FDIV;
	case URem: return ISD::UREM;
	case SRem: return ISD::SREM;
	case FRem: return ISD::FREM;
	case Shl: return ISD::SHL;
	case LShr: return ISD::SRL;
	case AShr: return ISD::SRA;
	case And: return ISD::AND;
	case Or: return ISD::OR;
	case Xor: return ISD::XOR;
	// Memory instructions: only Load and Store have a mapping.
	case Alloca: return 0;
	case Load: return ISD::LOAD;
	case Store: return ISD::STORE;
	case GetElementPtr: return 0;
	case Fence: return 0;
	case AtomicCmpXchg: return 0;
	case AtomicRMW: return 0;
	// Casts. PtrToInt and IntToPtr are both mapped to BITCAST.
	case Trunc: return ISD::TRUNCATE;
	case ZExt: return ISD::ZERO_EXTEND;
	case SExt: return ISD::SIGN_EXTEND;
	case FPToUI: return ISD::FP_TO_UINT;
	case FPToSI: return ISD::FP_TO_SINT;
	case UIToFP: return ISD::UINT_TO_FP;
	case SIToFP: return ISD::SINT_TO_FP;
	case FPTrunc: return ISD::FP_ROUND;
	case FPExt: return ISD::FP_EXTEND;
	case PtrToInt: return ISD::BITCAST;
	case IntToPtr: return ISD::BITCAST;
	case BitCast: return ISD::BITCAST;
	case AddrSpaceCast: return ISD::ADDRSPACECAST;
	// Comparisons and the remaining instructions. Both compare kinds map to
	// SETCC; ExtractValue and InsertValue both map to MERGE_VALUES.
	case ICmp: return ISD::SETCC;
	case FCmp: return ISD::SETCC;
	case PHI: return 0;
	case Call: return 0;
	case Select: return ISD::SELECT;
	case UserOp1: return 0;
	case UserOp2: return 0;
	case VAArg: return 0;
	case ExtractElement: return ISD::EXTRACT_VECTOR_ELT;
	case InsertElement: return ISD::INSERT_VECTOR_ELT;
	case ShuffleVector: return ISD::VECTOR_SHUFFLE;
	case ExtractValue: return ISD::MERGE_VALUES;
	case InsertValue: return ISD::MERGE_VALUES;
	case LandingPad: return 0;
	}

	llvm_unreachable("Unknown instruction type encountered!");
	}

	/// Estimate the cost of legalizing \p Ty.
	///
	/// The type is repeatedly converted via getTypeConversion until it becomes
	/// legal or stops changing. The cost starts at 1 and doubles for every
	/// vector split or integer expansion on the way.
	///
	/// \param DL data layout used to obtain the value type of \p Ty.
	/// \param Ty IR type to legalize.
	/// \returns the accumulated cost paired with the simple value type reached.
	std::pair<int, MVT>
	TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL,
	                                            Type *Ty) const {
	  LLVMContext &C = Ty->getContext();
	  EVT MTy = getValueType(DL, Ty);

	  int Cost = 1;
	  // We keep legalizing the type until we find a legal kind. We assume that
	  // the only operation that costs anything is the split. After splitting
	  // we need to handle two types.
	  while (true) {
	    LegalizeKind LK = getTypeConversion(C, MTy);

	    if (LK.first == TypeLegal)
	      return std::make_pair(Cost, MTy.getSimpleVT());

	    if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger)
	      Cost *= 2;

	    // Do not loop with f128 type.
	    // A conversion that maps the type to itself would never terminate.
	    if (MTy == LK.second)
	      return std::make_pair(Cost, MTy.getSimpleVT());

	    // Keep legalizing the type.
	    MTy = LK.second;
	  }
	}

	/// Return the global variable "__safestack_unsafe_stack_ptr" of the module
	/// that \p IRB is currently inserting into, declaring it if it is missing.
	///
	/// An existing variable must have i8* value type and its thread-local-ness
	/// must match \p UseTLS; otherwise a fatal error is reported.
	Value *TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilder<> &IRB,
	                                                             bool UseTLS) const {
	  // compiler-rt provides a variable with a magic name. Targets that do not
	  // link with compiler-rt may also provide such a variable.
	  auto *EnclosingFn = IRB.GetInsertBlock()->getParent();
	  Module *Mod = EnclosingFn->getParent();
	  const char *VarName = "__safestack_unsafe_stack_ptr";
	  auto *Existing =
	      dyn_cast_or_null<GlobalVariable>(Mod->getNamedValue(VarName));

	  Type *Int8PtrTy = Type::getInt8PtrTy(Mod->getContext());

	  if (Existing) {
	    // The variable exists, check its type and attributes.
	    if (Existing->getValueType() != Int8PtrTy)
	      report_fatal_error(Twine(VarName) + " must have void* type");
	    if (UseTLS != Existing->isThreadLocal())
	      report_fatal_error(Twine(VarName) + " must " +
	                         (UseTLS ? "" : "not ") + "be thread-local");
	    return Existing;
	  }

	  // The global variable is not defined yet, define it ourselves.
	  // We use the initial-exec TLS model because we do not support the
	  // variable living anywhere other than in the main executable.
	  auto TLSModel = GlobalValue::NotThreadLocal;
	  if (UseTLS)
	    TLSModel = GlobalValue::InitialExecTLSModel;
	  return new GlobalVariable(*Mod, Int8PtrTy, false,
	                            GlobalValue::ExternalLinkage, nullptr, VarName,
	                            nullptr, TLSModel);
	}

	/// Return the location of the unsafe stack pointer. On Android this is a call
	/// to "__safestack_pointer_address"; everywhere else it is the default
	/// thread-local variable.
	Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  const bool IsAndroid = TM.getTargetTriple().isAndroid();
	  if (IsAndroid) {
	    // Android provides a libc function to retrieve the address of the current
	    // thread's unsafe stack pointer.
	    auto *EnclosingFn = IRB.GetInsertBlock()->getParent();
	    Module *Mod = EnclosingFn->getParent();
	    Type *Int8PtrTy = Type::getInt8PtrTy(Mod->getContext());
	    Value *Callee = Mod->getOrInsertFunction("__safestack_pointer_address",
	                                             Int8PtrTy->getPointerTo(0));
	    return IRB.CreateCall(Callee);
	  }

	  return getDefaultSafeStackPointerLocation(IRB, true);
	}

	//===----------------------------------------------------------------------===//
	// Loop Strength Reduction hooks
	//===----------------------------------------------------------------------===//

	/// isLegalAddressingMode - Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	///
	/// This default implementation inspects only \p AM; \p DL, \p Ty, \p AS and
	/// \p I are not consulted.
	bool TargetLoweringBase::isLegalAddressingMode(const DataLayout &DL,
	                                               const AddrMode &AM, Type *Ty,
	                                               unsigned AS, Instruction *I) const {
	  // The default implementation of this implements a conservative RISCy, r+r and
	  // r+i addr mode.

	  // Allows a sign-extended 16-bit immediate field.
	  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
	    return false;

	  // No global is ever allowed as a base.
	  if (AM.BaseGV)
	    return false;

	  // Only support r+r,
	  switch (AM.Scale) {
	  case 0: // "r+i" or just "i", depending on HasBaseReg.
	    break;
	  case 1:
	    if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
	      return false;
	    // Otherwise we have r+r or r+i.
	    break;
	  case 2:
	    if (AM.HasBaseReg || AM.BaseOffs) // 2r+r or 2r+i is not allowed.
	      return false;
	    // Allow 2*r as r+r.
	    break;
	  default: // Don't allow n * r
	    return false;
	  }

	  return true;
	}

	//===----------------------------------------------------------------------===//
	// Stack Protector
	//===----------------------------------------------------------------------===//

	// On OpenBSD, return the special guard variable "__guard_local" (declared in
	// the current module if needed). On every other OS return nullptr, so that
	// SelectionDAG handles SSP.
	Value *TargetLoweringBase::getIRStackGuard(IRBuilder<> &IRB) const {
	  const bool IsOpenBSD = getTargetMachine().getTargetTriple().isOSOpenBSD();
	  if (!IsOpenBSD)
	    return nullptr;

	  auto *EnclosingFn = IRB.GetInsertBlock()->getParent();
	  Module &Mod = *EnclosingFn->getParent();
	  PointerType *GuardTy = Type::getInt8PtrTy(Mod.getContext());
	  return Mod.getOrInsertGlobal("__guard_local", GuardTy);
	}

	// Declare the "standard" __stack_chk_guard global (of i8* type) in the
	// module; that is the only guard currently supported.
	// TODO: add LOAD_STACK_GUARD support.
	void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
	  Type *GuardTy = Type::getInt8PtrTy(M.getContext());
	  M.getOrInsertGlobal("__stack_chk_guard", GuardTy);
	}

	// Look up the "standard" __stack_chk_guard global in the module; that is
	// the only guard currently supported.
	// TODO: add LOAD_STACK_GUARD support.
	Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
	  const bool AllowInternal = true;
	  return M.getGlobalVariable("__stack_chk_guard", AllowInternal);
	}

	/// Return the stack guard check for the module. This base implementation
	/// has none and always returns nullptr.
	Value *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const {
	  Value *NoGuardCheck = nullptr;
	  return NoGuardCheck;
	}

	/// Return the current value of MinimumJumpTableEntries.
	unsigned TargetLoweringBase::getMinimumJumpTableEntries() const {
	  const unsigned Entries = MinimumJumpTableEntries;
	  return Entries;
	}

	/// Overwrite MinimumJumpTableEntries with \p Val.
	void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) {
	  MinimumJumpTableEntries = Val;
	}

	/// Return the jump table density setting: OptsizeJumpTableDensity when
	/// \p OptForSize is set, JumpTableDensity otherwise.
	unsigned TargetLoweringBase::getMinimumJumpTableDensity(bool OptForSize) const {
	  if (OptForSize)
	    return OptsizeJumpTableDensity;
	  return JumpTableDensity;
	}

	/// Return the current value of MaximumJumpTableSize.
	unsigned TargetLoweringBase::getMaximumJumpTableSize() const {
	  const unsigned MaxSize = MaximumJumpTableSize;
	  return MaxSize;
	}

	/// Overwrite MaximumJumpTableSize with \p Val.
	void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {
	  MaximumJumpTableSize = Val;
	}

	//===----------------------------------------------------------------------===//
	// Reciprocal Estimates
	//===----------------------------------------------------------------------===//

	/// Fetch the string value of the "reciprocal-estimates" function attribute
	/// of the function behind \p MF; that string overrides the target defaults.
	static StringRef getRecipEstimateForFunc(MachineFunction &MF) {
	  const auto RecipAttr =
	      MF.getFunction().getFnAttribute("reciprocal-estimates");
	  return RecipAttr.getValueAsString();
	}

	/// Build the name of a reciprocal operation for the given type, in the form
	/// used by the front-end's "-mrecip" flag (assuming those strings have been
	/// passed through in an attribute string): an optional "vec-" prefix, then
	/// "sqrt" or "div", then a size suffix "d" (f64) or "f" (f32).
	/// For example, "vec-divf" for a division of a vXf32.
	static std::string getReciprocalOpName(bool IsSqrt, EVT VT) {
	  std::string OpName;
	  if (VT.isVector())
	    OpName = "vec-";

	  OpName.append(IsSqrt ? "sqrt" : "div");

	  // TODO: Handle "half" or other float types?
	  if (VT.getScalarType() == MVT::f64)
	    return OpName + "d";

	  assert(VT.getScalarType() == MVT::f32 &&
	         "Unexpected FP type for reciprocal estimate");
	  return OpName + "f";
	}

	/// Look for a customized refinement step count of the form ":N" in \p In.
	///
	/// \p Position is always set to the index of the first ':' (StringRef::npos
	/// if there is none). Returns false when no ':' is present. When the text
	/// after the ':' is exactly one decimal digit, \p Value receives that digit
	/// and true is returned; any other suffix is a fatal error.
	static bool parseRefinementStep(StringRef In, size_t &Position,
	                                uint8_t &Value) {
	  Position = In.find(':');
	  if (Position == StringRef::npos)
	    return false;

	  // Allow exactly one numeric character for the additional refinement
	  // step parameter.
	  const StringRef StepText = In.substr(Position + 1);
	  const bool IsSingleDigit =
	      StepText.size() == 1 && StepText[0] >= '0' && StepText[0] <= '9';
	  if (!IsSingleDigit)
	    report_fatal_error("Invalid refinement step for -recip.");

	  Value = StepText[0] - '0';
	  return true;
	}

	/// For the input attribute string, return one of the ReciprocalEstimate enum
	/// status values (enabled, disabled, or not specified) for this operation on
	/// the specified data type.
	///
	/// \param IsSqrt   true for a square-root estimate, false for a division.
	/// \param VT       value type of the operation.
	/// \param Override comma-separated attribute string; each entry may carry a
	///                 ":N" refinement-step suffix and a leading '!' to disable.
	/// \returns Enabled, Disabled or Unspecified from
	///          TargetLoweringBase::ReciprocalEstimate.
	static int getOpEnabled(bool IsSqrt, EVT VT, StringRef Override) {
	  if (Override.empty())
	    return TargetLoweringBase::ReciprocalEstimate::Unspecified;

	  SmallVector<StringRef, 4> OverrideVector;
	  SplitString(Override, OverrideVector, ",");
	  unsigned NumArgs = OverrideVector.size();

	  // Check if "all", "none", or "default" was specified.
	  if (NumArgs == 1) {
	    // Look for an optional setting of the number of refinement steps needed
	    // for this type of reciprocal operation.
	    size_t RefPos;
	    uint8_t RefSteps;
	    if (parseRefinementStep(Override, RefPos, RefSteps)) {
	      // Split the string for further processing.
	      Override = Override.substr(0, RefPos);
	    }

	    // All reciprocal types are enabled.
	    if (Override == "all")
	      return TargetLoweringBase::ReciprocalEstimate::Enabled;

	    // All reciprocal types are disabled.
	    if (Override == "none")
	      return TargetLoweringBase::ReciprocalEstimate::Disabled;

	    // Target defaults for enablement are used.
	    if (Override == "default")
	      return TargetLoweringBase::ReciprocalEstimate::Unspecified;
	  }

	  // The attribute string may omit the size suffix ('f'/'d').
	  std::string VTName = getReciprocalOpName(IsSqrt, VT);
	  std::string VTNameNoSize = VTName;
	  VTNameNoSize.pop_back();
	  static const char DisabledPrefix = '!';

	  for (StringRef RecipType : OverrideVector) {
	    size_t RefPos;
	    uint8_t RefSteps;
	    if (parseRefinementStep(RecipType, RefPos, RefSteps))
	      RecipType = RecipType.substr(0, RefPos);

	    // Stripping the ":N" suffix can leave nothing behind (e.g. the entry
	    // ":2"). Such an entry names no operation, so skip it rather than index
	    // into an empty string.
	    if (RecipType.empty())
	      continue;

	    // Ignore the disablement token for string matching.
	    bool IsDisabled = RecipType[0] == DisabledPrefix;
	    if (IsDisabled)
	      RecipType = RecipType.substr(1);

	    if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize))
	      return IsDisabled ? TargetLoweringBase::ReciprocalEstimate::Disabled
	                        : TargetLoweringBase::ReciprocalEstimate::Enabled;
	  }

	  return TargetLoweringBase::ReciprocalEstimate::Unspecified;
	}

	/// For the input attribute string, return the customized refinement step count
	/// for this operation on the specified data type. If the step count does not
	/// exist, return the ReciprocalEstimate enum value for unspecified.
	static int getOpRefinementSteps(bool IsSqrt, EVT VT, StringRef Override) {
	if (Override.empty())
	return TargetLoweringBase::ReciprocalEstimate::Unspecified;

	SmallVector<StringRef, 4> OverrideVector;
	SplitString(Override, OverrideVector, ",");
	unsigned NumArgs = OverrideVector.size();

	// Check if "all", "default", or "none" was specified.
	if (NumArgs == 1) {
	// Look for an optional setting of the number of refinement steps needed
	// for this type of reciprocal operation.
	size_t RefPos;
	uint8_t RefSteps;
	if (!parseRefinementStep(Override, RefPos, RefSteps))
	return TargetLoweringBase::ReciprocalEstimate::Unspecified;

	// Split the string for further processing.
	Override = Override.substr(0, RefPos);
	assert(Override != "none" &&
	"Disabled reciprocals, but specifed refinement steps?");

	// If this is a general override, return the specified number of steps.
	if (Override == "all" \|\| Override == "default")
	return RefSteps;
	}

	// The attribute string may omit the size suffix ('f'/'d').
	std::string VTName = getReciprocalOpName(IsSqrt, VT);
	std::string VTNameNoSize = VTName;
	VTNameNoSize.pop_back();

	for (StringRef RecipType : OverrideVector) {
	size_t RefPos;
	uint8_t RefSteps;
	if (!parseRefinementStep(RecipType, RefPos, RefSteps))
	continue;

	RecipType = RecipType.substr(0, RefPos);
	if (RecipType.equals(VTName) \|\| RecipType.equals(VTNameNoSize))
	return RefSteps;
	}

	return TargetLoweringBase::ReciprocalEstimate::Unspecified;
	}

	/// Return the getOpEnabled() status of the square-root estimate for \p VT,
	/// using the override string that getRecipEstimateForFunc() yields for \p MF.
	int TargetLoweringBase::getRecipEstimateSqrtEnabled(EVT VT,
	                                                    MachineFunction &MF) const {
	  const bool IsSqrt = true;
	  return getOpEnabled(IsSqrt, VT, getRecipEstimateForFunc(MF));
	}

	/// Return the getOpEnabled() status of the non-sqrt (division) estimate for
	/// \p VT, using the override string that getRecipEstimateForFunc() yields for
	/// \p MF.
	int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT,
	                                                   MachineFunction &MF) const {
	  const bool IsSqrt = false;
	  return getOpEnabled(IsSqrt, VT, getRecipEstimateForFunc(MF));
	}

	/// Return the getOpRefinementSteps() result for the square-root estimate of
	/// \p VT, using the override string that getRecipEstimateForFunc() yields for
	/// \p MF.
	int TargetLoweringBase::getSqrtRefinementSteps(EVT VT,
	                                               MachineFunction &MF) const {
	  const bool IsSqrt = true;
	  return getOpRefinementSteps(IsSqrt, VT, getRecipEstimateForFunc(MF));
	}

	/// Return the getOpRefinementSteps() result for the non-sqrt (division)
	/// estimate of \p VT, using the override string that
	/// getRecipEstimateForFunc() yields for \p MF.
	int TargetLoweringBase::getDivRefinementSteps(EVT VT,
	                                              MachineFunction &MF) const {
	  const bool IsSqrt = false;
	  return getOpRefinementSteps(IsSqrt, VT, getRecipEstimateForFunc(MF));
	}

	/// Freeze the reserved registers of \p MF through its register info object.
	void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const {
	  auto &RegInfo = MF.getRegInfo();
	  RegInfo.freezeReservedRegs(MF);
	}
	Index: vendor/llvm/dist-release_60/lib/Linker/IRMover.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Linker/IRMover.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Linker/IRMover.cpp (revision 328362)
	@@ -1,1459 +1,1464 @@
	//===- lib/Linker/IRMover.cpp ---------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Linker/IRMover.h"
	#include "LinkDiagnosticInfo.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/DiagnosticPrinter.h"
	#include "llvm/IR/GVMaterializer.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/TypeFinder.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Transforms/Utils/Cloning.h"
	#include <utility>
	using namespace llvm;

	//===----------------------------------------------------------------------===//
	// TypeMap implementation.
	//===----------------------------------------------------------------------===//

	namespace {
	/// Type remapper used by the linker: it records which destination type each
	/// source type maps to (MappedTypes) and answers remapType() queries through
	/// get().
	///
	/// NOTE(review): several declarations below read `Type X` / `StructType X`
	/// although the out-of-line bodies dereference those parameters with `->`;
	/// the pointer `*` characters appear to have been lost in this rendering of
	/// the file - confirm against the upstream source.
	class TypeMapTy : public ValueMapTypeRemapper {
	/// This is a mapping from a source type to a destination type to use.
	DenseMap<Type , Type > MappedTypes;

	/// When checking to see if two subgraphs are isomorphic, we speculatively
	/// add types to MappedTypes, but keep track of them here in case we need to
	/// roll back.
	SmallVector<Type *, 16> SpeculativeTypes;

	/// Opaque destination structs that areTypesIsomorphic() tentatively paired
	/// with a source definition during the current addTypeMapping() call. If the
	/// check fails, addTypeMapping() erases them from DstResolvedOpaqueTypes.
	SmallVector<StructType *, 16> SpeculativeDstOpaqueTypes;

	/// This is a list of non-opaque structs in the source module that are mapped
	/// to an opaque struct in the destination module.
	SmallVector<StructType *, 16> SrcDefinitionsToResolve;

	/// This is the set of opaque types in the destination module that are
	/// getting a body from the source module.
	SmallPtrSet<StructType *, 16> DstResolvedOpaqueTypes;

	public:
	/// Store a reference to \p DstStructTypesSet; all other state starts empty.
	TypeMapTy(IRMover::IdentifiedStructTypeSet &DstStructTypesSet)
	: DstStructTypesSet(DstStructTypesSet) {}

	/// Set of identified struct types that get(), finishType() and
	/// linkDefinedTypeBodies() query and update.
	IRMover::IdentifiedStructTypeSet &DstStructTypesSet;
	/// Indicate that the specified type in the destination module is conceptually
	/// equivalent to the specified type in the source module.
	void addTypeMapping(Type DstTy, Type SrcTy);

	/// Produce a body for an opaque type in the dest module from a type
	/// definition in the source module.
	void linkDefinedTypeBodies();

	/// Return the mapped type to use for the specified input type from the
	/// source module.
	Type get(Type SrcTy);
	/// Recursive form of get(); \p Visited holds the identified structs that are
	/// currently being remapped so that recursive types can be detected.
	Type get(Type SrcTy, SmallPtrSet<StructType *, 8> &Visited);

	/// Give \p DTy the body \p ETypes (with \p STy's packedness), move \p STy's
	/// name over to it if it has one, and register \p DTy as non-opaque in
	/// DstStructTypesSet.
	void finishType(StructType DTy, StructType STy, ArrayRef<Type *> ETypes);

	/// Convenience overload: remap a function type and cast the result back to
	/// FunctionType.
	FunctionType get(FunctionType T) {
	return cast<FunctionType>(get((Type *)T));
	}

	private:
	/// ValueMapTypeRemapper hook; simply forwards to get().
	Type remapType(Type SrcTy) override { return get(SrcTy); }

	/// Recursive worker of addTypeMapping(): returns true if the two types are
	/// isomorphic, recording speculative mappings as it goes.
	bool areTypesIsomorphic(Type DstTy, Type SrcTy);
	};
	}

	/// Try to establish that \p SrcTy maps onto \p DstTy.
	///
	/// Runs areTypesIsomorphic(); on failure every speculative mapping made
	/// during the attempt is rolled back, on success the speculatively mapped
	/// named structs have their names cleared. Either way the speculative lists
	/// are empty again on return.
	void TypeMapTy::addTypeMapping(Type DstTy, Type SrcTy) {
	// No speculation may be left over from an earlier call.
	assert(SpeculativeTypes.empty());
	assert(SpeculativeDstOpaqueTypes.empty());

	// Check to see if these types are recursively isomorphic and establish a
	// mapping between them if so.
	if (!areTypesIsomorphic(DstTy, SrcTy)) {
	// Oops, they aren't isomorphic. Just discard this request by rolling out
	// any speculative mappings we've established.
	for (Type *Ty : SpeculativeTypes)
	MappedTypes.erase(Ty);

	// areTypesIsomorphic() pushes one SrcDefinitionsToResolve entry for every
	// SpeculativeDstOpaqueTypes entry, so drop that many from the tail.
	SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() -
	SpeculativeDstOpaqueTypes.size());
	for (StructType *Ty : SpeculativeDstOpaqueTypes)
	DstResolvedOpaqueTypes.erase(Ty);
	} else {
	// The mapping holds: clear the name of every named struct among the
	// speculatively mapped (source) types.
	for (Type *Ty : SpeculativeTypes)
	if (auto *STy = dyn_cast<StructType>(Ty))
	if (STy->hasName())
	STy->setName("");
	}
	SpeculativeTypes.clear();
	SpeculativeDstOpaqueTypes.clear();
	}

	/// Recursively walk this pair of types, returning true if they are isomorphic,
	/// false if they are not.
	///
	/// Side effects: may add entries to MappedTypes, SpeculativeTypes,
	/// SpeculativeDstOpaqueTypes, SrcDefinitionsToResolve and
	/// DstResolvedOpaqueTypes. The caller (addTypeMapping) undoes the speculative
	/// ones when false is returned.
	bool TypeMapTy::areTypesIsomorphic(Type DstTy, Type SrcTy) {
	// Two types with differing kinds are clearly not isomorphic.
	if (DstTy->getTypeID() != SrcTy->getTypeID())
	return false;

	// If we have an entry in the MappedTypes table, then we have our answer.
	// (operator[] creates a null entry when none exists; it is filled in below.)
	Type *&Entry = MappedTypes[SrcTy];
	if (Entry)
	return Entry == DstTy;

	// Two identical types are clearly isomorphic. Remember this
	// non-speculatively.
	if (DstTy == SrcTy) {
	Entry = DstTy;
	return true;
	}

	// Okay, we have two types with identical kinds that we haven't seen before.

	// If this is an opaque struct type, special case it.
	if (StructType *SSTy = dyn_cast<StructType>(SrcTy)) {
	// Mapping an opaque type to any struct, just keep the dest struct.
	if (SSTy->isOpaque()) {
	Entry = DstTy;
	SpeculativeTypes.push_back(SrcTy);
	return true;
	}

	// Mapping a non-opaque source type to an opaque dest. If this is the first
	// type that we're mapping onto this destination type then we succeed. Keep
	// the dest, but fill it in later. If this is the second (different) type
	// that we're trying to map onto the same opaque type then we fail.
	if (cast<StructType>(DstTy)->isOpaque()) {
	// We can only map one source type onto the opaque destination type.
	if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second)
	return false;
	// Remember the pairing so linkDefinedTypeBodies() can supply the body, and
	// so addTypeMapping() can roll it back if the overall check fails.
	SrcDefinitionsToResolve.push_back(SSTy);
	SpeculativeTypes.push_back(SrcTy);
	SpeculativeDstOpaqueTypes.push_back(cast<StructType>(DstTy));
	Entry = DstTy;
	return true;
	}
	}

	// If the number of subtypes disagree between the two types, then we fail.
	if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes())
	return false;

	// Fail if any of the extra properties (e.g. array size) of the type disagree.
	// The DstTy == SrcTy case was handled above, so two integer types reaching
	// this point are distinct types.
	if (isa<IntegerType>(DstTy))
	return false; // bitwidth disagrees.
	if (PointerType *PT = dyn_cast<PointerType>(DstTy)) {
	if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace())
	return false;

	} else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) {
	if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg())
	return false;
	} else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) {
	StructType *SSTy = cast<StructType>(SrcTy);
	if (DSTy->isLiteral() != SSTy->isLiteral() \|\|
	DSTy->isPacked() != SSTy->isPacked())
	return false;
	} else if (auto *DSeqTy = dyn_cast<SequentialType>(DstTy)) {
	if (DSeqTy->getNumElements() !=
	cast<SequentialType>(SrcTy)->getNumElements())
	return false;
	}

	// Otherwise, we speculate that these two types will line up and recursively
	// check the subelements.
	Entry = DstTy;
	SpeculativeTypes.push_back(SrcTy);

	for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I)
	if (!areTypesIsomorphic(DstTy->getContainedType(I),
	SrcTy->getContainedType(I)))
	return false;

	// If everything seems to have lined up, then everything is great.
	return true;
	}

	void TypeMapTy::linkDefinedTypeBodies() {
	SmallVector<Type *, 16> Elements;
	for (StructType *SrcSTy : SrcDefinitionsToResolve) {
	StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]);
	assert(DstSTy->isOpaque());

	// Map the body of the source type over to a new body for the dest type.
	Elements.resize(SrcSTy->getNumElements());
	for (unsigned I = 0, E = Elements.size(); I != E; ++I)
	Elements[I] = get(SrcSTy->getElementType(I));

	DstSTy->setBody(Elements, SrcSTy->isPacked());
	DstStructTypesSet.switchToNonOpaque(DstSTy);
	}
	SrcDefinitionsToResolve.clear();
	DstResolvedOpaqueTypes.clear();
	}

	/// Complete the destination struct \p DTy from the source struct \p STy.
	///
	/// Sets \p DTy's body to \p ETypes using \p STy's packedness, transfers
	/// \p STy's name to \p DTy when \p STy is named, and records \p DTy as a
	/// non-opaque type in DstStructTypesSet.
	void TypeMapTy::finishType(StructType DTy, StructType STy,
	ArrayRef<Type *> ETypes) {
	DTy->setBody(ETypes, STy->isPacked());

	// Steal STy's name.
	if (STy->hasName()) {
	// Copy the name first: STy is renamed before DTy receives the name.
	SmallString<16> TmpName = STy->getName();
	STy->setName("");
	DTy->setName(TmpName);
	}

	DstStructTypesSet.addNonOpaque(DTy);
	}

	/// Return the mapped type for \p Ty. Entry point that starts the recursive
	/// overload with an empty set of visited struct types.
	Type TypeMapTy::get(Type Ty) {
	SmallPtrSet<StructType *, 8> Visited;
	return get(Ty, Visited);
	}

	/// Return the mapped type for \p Ty, computing and caching it in MappedTypes
	/// when it is not known yet.
	///
	/// \p Visited holds the identified (non-literal) structs whose remapping is in
	/// progress further up the recursion. Meeting one of them again yields a
	/// freshly created struct as a stand-in, which is completed through
	/// finishType() once the outer call has remapped all element types.
	///
	/// NOTE(review): `dyn_cast<StructType>(Entry)` below is applied to a `Type **`
	/// as rendered; a dereference `*` appears to have been lost - confirm against
	/// the upstream source.
	Type TypeMapTy::get(Type Ty, SmallPtrSet<StructType *, 8> &Visited) {
	// If we already have an entry for this type, return it.
	Type **Entry = &MappedTypes[Ty];
	if (*Entry)
	return *Entry;

	// These are types that LLVM itself will unique.
	bool IsUniqued = !isa<StructType>(Ty) \|\| cast<StructType>(Ty)->isLiteral();

	#ifndef NDEBUG
	// Debug-only sanity check: no other type may already be mapped to Ty.
	if (!IsUniqued) {
	for (auto &Pair : MappedTypes) {
	assert(!(Pair.first != Ty && Pair.second == Ty) &&
	"mapping to a source type");
	}
	}
	#endif

	// Ty is an identified struct that is already being remapped higher up: map it
	// to a new struct now; the outer call fills it in (see finishType below).
	if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
	StructType *DTy = StructType::create(Ty->getContext());
	return *Entry = DTy;
	}

	// If this is not a recursive type, then just map all of the elements and
	// then rebuild the type from inside out.
	SmallVector<Type *, 4> ElementTypes;

	// If there are no element types to map, then the type is itself. This is
	// true for the anonymous {} struct, things like 'float', integers, etc.
	if (Ty->getNumContainedTypes() == 0 && IsUniqued)
	return *Entry = Ty;

	// Remap all of the elements, keeping track of whether any of them change.
	bool AnyChange = false;
	ElementTypes.resize(Ty->getNumContainedTypes());
	for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) {
	ElementTypes[I] = get(Ty->getContainedType(I), Visited);
	AnyChange \|= ElementTypes[I] != Ty->getContainedType(I);
	}

	// If we found our type while recursively processing stuff, just use it.
	// The entry is looked up again because the recursive calls above may have
	// inserted into MappedTypes.
	Entry = &MappedTypes[Ty];
	if (*Entry) {
	if (auto DTy = dyn_cast<StructType>(Entry)) {
	// A still-opaque mapped struct gets its body from the element types just
	// computed.
	if (DTy->isOpaque()) {
	auto *STy = cast<StructType>(Ty);
	finishType(DTy, STy, ElementTypes);
	}
	}
	return *Entry;
	}

	// If all of the element types mapped directly over and the type is not
	// a named struct, then the type is usable as-is.
	if (!AnyChange && IsUniqued)
	return *Entry = Ty;

	// Otherwise, rebuild a modified type.
	switch (Ty->getTypeID()) {
	default:
	llvm_unreachable("unknown derived type to remap");
	case Type::ArrayTyID:
	return *Entry = ArrayType::get(ElementTypes[0],
	cast<ArrayType>(Ty)->getNumElements());
	case Type::VectorTyID:
	return *Entry = VectorType::get(ElementTypes[0],
	cast<VectorType>(Ty)->getNumElements());
	case Type::PointerTyID:
	return *Entry = PointerType::get(ElementTypes[0],
	cast<PointerType>(Ty)->getAddressSpace());
	case Type::FunctionTyID:
	// Contained type 0 is used as the return type, the rest as parameters.
	return *Entry = FunctionType::get(ElementTypes[0],
	makeArrayRef(ElementTypes).slice(1),
	cast<FunctionType>(Ty)->isVarArg());
	case Type::StructTyID: {
	auto *STy = cast<StructType>(Ty);
	bool IsPacked = STy->isPacked();
	// Literal structs are rebuilt directly from the remapped elements.
	if (IsUniqued)
	return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);

	// If the type is opaque, we can just use it directly.
	if (STy->isOpaque()) {
	DstStructTypesSet.addOpaque(STy);
	return *Entry = Ty;
	}

	// Reuse a non-opaque struct from DstStructTypesSet that has the same
	// element types and packedness; the source struct's name is cleared.
	if (StructType *OldT =
	DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) {
	STy->setName("");
	return *Entry = OldT;
	}

	// Nothing changed: the source struct itself is used and registered.
	if (!AnyChange) {
	DstStructTypesSet.addNonOpaque(STy);
	return *Entry = Ty;
	}

	// Elements changed: create a new struct and complete it from STy.
	StructType *DTy = StructType::create(Ty->getContext());
	finishType(DTy, STy, ElementTypes);
	return *Entry = DTy;
	}
	}
	}

	/// Construct a diagnostic of kind DK_Linker with the given \p Severity that
	/// keeps a reference to the message \p Msg.
	LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity,
	                                       const Twine &Msg)
	    : DiagnosticInfo(DK_Linker, Severity),
	      Msg(Msg) {
	}
	/// Write the stored message to the diagnostic printer \p DP.
	void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const {
	  DP << Msg;
	}

	//===----------------------------------------------------------------------===//
	// IRLinker implementation.
	//===----------------------------------------------------------------------===//

	namespace {
	class IRLinker;

	/// Creates prototypes for functions that are lazily linked on the fly. This
	/// speeds up linking for modules with many lazily linked functions of which
	/// few get used.
	///
	/// materialize() forwards to IRLinker::materialize() with ForAlias = false.
	class GlobalValueMaterializer final : public ValueMaterializer {
	// The linker that performs the actual materialization.
	IRLinker &TheIRLinker;

	public:
	GlobalValueMaterializer(IRLinker &TheIRLinker) : TheIRLinker(TheIRLinker) {}
	Value materialize(Value V) override;
	};

	/// Materializer that IRLinker registers together with AliasValueMap as its
	/// alternate mapping context. materialize() forwards to
	/// IRLinker::materialize() with ForAlias = true.
	class LocalValueMaterializer final : public ValueMaterializer {
	// The linker that performs the actual materialization.
	IRLinker &TheIRLinker;

	public:
	LocalValueMaterializer(IRLinker &TheIRLinker) : TheIRLinker(TheIRLinker) {}
	Value materialize(Value V) override;
	};

	/// Type of the Metadata map in \a ValueToValueMapTy.
	typedef DenseMap<const Metadata *, TrackingMDRef> MDMapT;

	/// This is responsible for keeping track of the state used for moving data
	/// from SrcM to DstM.
	///
	/// NOTE(review): several declarations below read `GlobalValue X`,
	/// `Expected<Constant >`, `Value X` and the constructor passes `this` to
	/// members that take `IRLinker &`; the pointer/dereference `*` characters
	/// appear to have been lost in this rendering - confirm against the upstream
	/// source.
	class IRLinker {
	Module &DstM;
	std::unique_ptr<Module> SrcM;

	/// See IRMover::move().
	std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor;

	/// Source-to-destination type mapping; also handed to Mapper as its type
	/// remapper.
	TypeMapTy TypeMap;
	/// Materializer of the primary mapping context (ForAlias = false).
	GlobalValueMaterializer GValMaterializer;
	/// Materializer of the alias mapping context (ForAlias = true).
	LocalValueMaterializer LValMaterializer;

	/// A metadata map that's shared between IRLinker instances.
	MDMapT &SharedMDs;

	/// Mapping of values from what they used to be in Src, to what they are now
	/// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead
	/// due to the use of Value handles which the Linker doesn't actually need,
	/// but this allows us to reuse the ValueMapper code.
	ValueToValueMapTy ValueMap;
	/// Value map of the alternate mapping context registered as AliasMCID.
	ValueToValueMapTy AliasValueMap;

	/// Globals that have been requested for linking, and the subset of them
	/// still waiting to be processed (see maybeAdd()).
	DenseSet<GlobalValue *> ValuesToLink;
	std::vector<GlobalValue *> Worklist;

	/// Queue \p GV on the worklist unless it is already in ValuesToLink.
	void maybeAdd(GlobalValue *GV) {
	if (ValuesToLink.insert(GV).second)
	Worklist.push_back(GV);
	}

	/// Whether we are importing globals for ThinLTO, as opposed to linking the
	/// source module. If this flag is set, it means that we can rely on some
	/// other object file to define any non-GlobalValue entities defined by the
	/// source module. This currently causes us to not link retained types in
	/// debug info metadata and module inline asm.
	bool IsPerformingImport;

	/// Set to true when all global value body linking is complete (including
	/// lazy linking). Used to prevent metadata linking from creating new
	/// references.
	bool DoneLinkingBodies = false;

	/// The Error encountered during materialization. We use an Optional here to
	/// avoid needing to manage an unconsumed success value.
	Optional<Error> FoundError;
	/// Record \p E in FoundError if it represents a failure; a success value is
	/// ignored.
	void setError(Error E) {
	if (E)
	FoundError = std::move(E);
	}

	/// Most of the errors produced by this module are inconvertible StringErrors.
	/// This convenience function lets us return one of those more easily.
	Error stringErr(const Twine &T) {
	return make_error<StringError>(T, inconvertibleErrorCode());
	}

	/// Entry point for mapping values and alternate context for mapping aliases.
	ValueMapper Mapper;
	unsigned AliasMCID;

	/// Handles cloning of a global value from the source module into
	/// the destination module, including setting the attributes and visibility.
	GlobalValue copyGlobalValueProto(const GlobalValue SGV, bool ForDefinition);

	/// Emit \p Message as a DS_Warning linker diagnostic on SrcM's context.
	void emitWarning(const Twine &Message) {
	SrcM->getContext().diagnose(LinkDiagnosticInfo(DS_Warning, Message));
	}

	/// Given a global in the source module, return the global in the
	/// destination module that is being linked to, if any.
	GlobalValue getLinkedToGlobal(const GlobalValue SrcGV) {
	// If the source has no name it can't link. If it has local linkage,
	// there is no name match-up going on.
	if (!SrcGV->hasName() \|\| SrcGV->hasLocalLinkage())
	return nullptr;

	// Otherwise see if we have a match in the destination module's symtab.
	GlobalValue *DGV = DstM.getNamedValue(SrcGV->getName());
	if (!DGV)
	return nullptr;

	// If we found a global with the same name in the dest module, but it has
	// internal linkage, we are really not doing any linkage here.
	if (DGV->hasLocalLinkage())
	return nullptr;

	// Otherwise, we do in fact link to the destination global.
	return DGV;
	}

	void computeTypeMapping();

	Expected<Constant > linkAppendingVarProto(GlobalVariable DstGV,
	const GlobalVariable *SrcGV);

	/// Given the GlobalValue \p SGV in the source module, and the matching
	/// GlobalValue \p DGV (if any), return true if the linker will pull \p SGV
	/// into the destination module.
	///
	/// Note this code may call the client-provided \p AddLazyFor.
	bool shouldLink(GlobalValue *DGV, GlobalValue &SGV);
	Expected<Constant > linkGlobalValueProto(GlobalValue GV, bool ForAlias);

	Error linkModuleFlagsMetadata();

	void linkGlobalVariable(GlobalVariable &Dst, GlobalVariable &Src);
	Error linkFunctionBody(Function &Dst, Function &Src);
	void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
	Error linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);

	/// Functions that take care of cloning a specific global value type
	/// into the destination module.
	GlobalVariable copyGlobalVariableProto(const GlobalVariable SGVar);
	Function copyFunctionProto(const Function SF);
	GlobalValue copyGlobalAliasProto(const GlobalAlias SGA);

	/// When importing for ThinLTO, prevent importing of types listed on
	/// the DICompileUnit that we don't need a copy of in the importing
	/// module.
	void prepareCompileUnitsForImport();
	void linkNamedMDNodes();

	public:
	/// Set up a linker that moves \p SrcM into \p DstM.
	///
	/// Takes ownership of \p SrcM and \p AddLazyFor, moves the contents of
	/// \p SharedMDs into ValueMap's metadata map, queues every entry of
	/// \p ValuesToLink through maybeAdd(), and calls
	/// prepareCompileUnitsForImport() when \p IsPerformingImport is set.
	IRLinker(Module &DstM, MDMapT &SharedMDs,
	IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM,
	ArrayRef<GlobalValue *> ValuesToLink,
	std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor,
	bool IsPerformingImport)
	: DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)),
	TypeMap(Set), GValMaterializer(this), LValMaterializer(this),
	SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport),
	Mapper(ValueMap, RF_MoveDistinctMDs \| RF_IgnoreMissingLocals, &TypeMap,
	&GValMaterializer),
	AliasMCID(Mapper.registerAlternateMappingContext(AliasValueMap,
	&LValMaterializer)) {
	ValueMap.getMDMap() = std::move(SharedMDs);
	for (GlobalValue *GV : ValuesToLink)
	maybeAdd(GV);
	if (IsPerformingImport)
	prepareCompileUnitsForImport();
	}
	/// Hand the metadata map back to the shared map it was taken from.
	~IRLinker() { SharedMDs = std::move(*ValueMap.getMDMap()); }

	Error run();
	/// Shared implementation behind both materializers; \p ForAlias selects the
	/// alias behaviour.
	Value materialize(Value V, bool ForAlias);
	};
	}

	/// Make sure \p GV ends up being called \p Name.
	///
	/// The symbol table automatically renames a global whose name collides with
	/// an existing one. That is fine for most clients but not for the linker, so
	/// if another global in \p GV's module currently owns \p Name, \p GV takes
	/// the name over and the other global is the one that gets auto-renamed.
	/// Globals with local linkage, and globals already carrying \p Name, are left
	/// untouched.
	static void forceRenaming(GlobalValue *GV, StringRef Name) {
	  // A local-linkage global does not force its name.
	  if (GV->hasLocalLinkage())
	    return;
	  // Nothing to do when the name is already right.
	  if (GV->getName() == Name)
	    return;

	  GlobalValue *Clash = GV->getParent()->getNamedValue(Name);
	  if (!Clash) {
	    // The name is free: simply force it back.
	    GV->setName(Name);
	    return;
	  }

	  // Another global owns the name: take it, then let the symbol table pick a
	  // fresh name for the previous owner.
	  GV->takeName(Clash);
	  Clash->setName(Name);
	  assert(Clash->getName() != Name && "forceRenaming didn't work");
	}

	/// Forward to IRLinker::materialize() for the regular (non-alias) context.
	Value GlobalValueMaterializer::materialize(Value SGV) {
	return TheIRLinker.materialize(SGV, false);
	}

	/// Forward to IRLinker::materialize() for the alias context (ForAlias = true).
	Value LocalValueMaterializer::materialize(Value SGV) {
	return TheIRLinker.materialize(SGV, true);
	}

	/// Materialize the destination counterpart of the source value \p V.
	///
	/// Returns nullptr when \p V is not a GlobalValue, when linking the prototype
	/// fails (the error is stored via setError()), or when the prototype is null.
	/// Otherwise returns the linked prototype and, unless a body already exists,
	/// links the body when \p ForAlias is set or shouldLink() agrees.
	///
	/// NOTE(review): `dyn_cast<GlobalValue>(NewProto)` and
	/// `linkGlobalValueBody(New, SGV)` below lack dereferences as rendered; the
	/// `*` characters appear to have been lost - confirm against the upstream
	/// source.
	Value IRLinker::materialize(Value V, bool ForAlias) {
	auto *SGV = dyn_cast<GlobalValue>(V);
	if (!SGV)
	return nullptr;

	Expected<Constant *> NewProto = linkGlobalValueProto(SGV, ForAlias);
	if (!NewProto) {
	setError(NewProto.takeError());
	return nullptr;
	}
	if (!*NewProto)
	return nullptr;

	// The prototype may be a constant that is not itself a GlobalValue; return
	// it unchanged in that case.
	GlobalValue New = dyn_cast<GlobalValue>(NewProto);
	if (!New)
	return *NewProto;

	// If we already created the body, just return.
	if (auto *F = dyn_cast<Function>(New)) {
	if (!F->isDeclaration())
	return New;
	} else if (auto *V = dyn_cast<GlobalVariable>(New)) {
	if (V->hasInitializer() \|\| V->hasAppendingLinkage())
	return New;
	} else {
	auto *A = cast<GlobalAlias>(New);
	if (A->getAliasee())
	return New;
	}

	// When linking a global for an alias, it will always be linked. However we
	// need to check if it was not already scheduled to satisfy a reference from a
	// regular global value initializer. We know it has been scheduled if the
	// "New" GlobalValue that is mapped here for the alias is the same as the one
	// already mapped. If there is an entry in the ValueMap but the value is
	// different, it means that the value already had a definition in the
	// destination module (linkonce for instance), but we need a new definition
	// for the alias ("New" will be different).
	if (ForAlias && ValueMap.lookup(SGV) == New)
	return New;

	if (ForAlias \|\| shouldLink(New, *SGV))
	setError(linkGlobalValueBody(New, SGV));

	return New;
	}

	/// Create, in the destination module, a declaration that mirrors the source
	/// global variable \p SGVar: same name, constness, thread-local mode, address
	/// space, alignment and attributes, with the value type remapped through
	/// TypeMap. The new variable gets external linkage and no initializer.
	GlobalVariable IRLinker::copyGlobalVariableProto(const GlobalVariable SGVar) {
	// No linking to be performed or linking from the source: simply create an
	// identical version of the symbol over in the dest module... the
	// initializer will be filled in later by LinkGlobalInits.
	GlobalVariable *NewDGV =
	new GlobalVariable(DstM, TypeMap.get(SGVar->getValueType()),
	SGVar->isConstant(), GlobalValue::ExternalLinkage,
	/init/ nullptr, SGVar->getName(),
	/insertbefore/ nullptr, SGVar->getThreadLocalMode(),
	SGVar->getType()->getAddressSpace());
	NewDGV->setAlignment(SGVar->getAlignment());
	NewDGV->copyAttributesFrom(SGVar);
	return NewDGV;
	}

	/// Create, in the destination module, a function declaration that mirrors the
	/// source function \p SF: same name and attributes, function type remapped
	/// through TypeMap, external linkage.
	Function IRLinker::copyFunctionProto(const Function SF) {
	// If there is no linkage to be performed or we are linking from the source,
	// bring SF over.
	auto *F =
	Function::Create(TypeMap.get(SF->getFunctionType()),
	GlobalValue::ExternalLinkage, SF->getName(), &DstM);
	F->copyAttributesFrom(SF);
	return F;
	}

	/// Set up prototypes for any aliases that come over from the source module.
	///
	/// Creates an alias in the destination module with \p SGA's name, address
	/// space and attributes, its value type remapped through TypeMap, and
	/// external linkage.
	GlobalValue IRLinker::copyGlobalAliasProto(const GlobalAlias SGA) {
	// If there is no linkage to be performed or we're linking from the source,
	// bring over SGA.
	auto *Ty = TypeMap.get(SGA->getValueType());
	auto *GA =
	GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
	GlobalValue::ExternalLinkage, SGA->getName(), &DstM);
	GA->copyAttributesFrom(SGA);
	return GA;
	}

	/// Create the destination-module prototype for the source global \p SGV.
	///
	/// Global variables and functions are cloned with the dedicated helpers. Any
	/// other global is handled as an alias: with \p ForDefinition it is cloned as
	/// an alias, otherwise a plain function or variable declaration of the
	/// remapped value type stands in for it.
	///
	/// The result gets \p SGV's linkage when \p ForDefinition is set, or
	/// external-weak linkage when \p SGV has it; otherwise it keeps the external
	/// linkage it was created with.
	GlobalValue IRLinker::copyGlobalValueProto(const GlobalValue SGV,
	bool ForDefinition) {
	GlobalValue *NewGV;
	if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
	NewGV = copyGlobalVariableProto(SGVar);
	} else if (auto *SF = dyn_cast<Function>(SGV)) {
	NewGV = copyFunctionProto(SF);
	} else {
	// Neither a variable nor a function (the cast below requires an alias).
	if (ForDefinition)
	NewGV = copyGlobalAliasProto(cast<GlobalAlias>(SGV));
	else if (SGV->getValueType()->isFunctionTy())
	NewGV =
	Function::Create(cast<FunctionType>(TypeMap.get(SGV->getValueType())),
	GlobalValue::ExternalLinkage, SGV->getName(), &DstM);
	else
	NewGV = new GlobalVariable(
	DstM, TypeMap.get(SGV->getValueType()),
	/isConstant/ false, GlobalValue::ExternalLinkage,
	/init/ nullptr, SGV->getName(),
	/insertbefore/ nullptr, SGV->getThreadLocalMode(),
	SGV->getType()->getAddressSpace());
	}

	if (ForDefinition)
	NewGV->setLinkage(SGV->getLinkage());
	else if (SGV->hasExternalWeakLinkage())
	NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);

	if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
	// Metadata for global variables and function declarations is copied eagerly.
	if (isa<GlobalVariable>(SGV) \|\| SGV->isDeclaration())
	NewGO->copyMetadata(cast<GlobalObject>(SGV), 0);
	}

	// Remove these copied constants in case this stays a declaration, since
	// they point to the source module. If the def is linked the values will
	// be mapped in during linkFunctionBody.
	if (auto *NewF = dyn_cast<Function>(NewGV)) {
	NewF->setPersonalityFn(nullptr);
	NewF->setPrefixData(nullptr);
	NewF->setPrologueData(nullptr);
	}

	return NewGV;
	}

	/// Loop over all of the linked values to compute type mappings. For example,
	/// if we link "extern Foo x" and "Foo x = NULL", then we have two struct
	/// types 'Foo' but one got renamed when the module was loaded into the same
	/// LLVMContext.
	///
	/// Mappings are gathered from source global variables, functions and aliases
	/// that link to a destination global, then from identified struct types
	/// matched by name; finally the resolved opaque destination types get their
	/// bodies via TypeMap.linkDefinedTypeBodies().
	void IRLinker::computeTypeMapping() {
	// Global variables.
	for (GlobalValue &SGV : SrcM->globals()) {
	GlobalValue *DGV = getLinkedToGlobal(&SGV);
	if (!DGV)
	continue;

	// Unless both sides have appending linkage, map the globals' types directly.
	if (!DGV->hasAppendingLinkage() \|\| !SGV.hasAppendingLinkage()) {
	TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
	continue;
	}

	// Unify the element type of appending arrays.
	ArrayType *DAT = cast<ArrayType>(DGV->getValueType());
	ArrayType *SAT = cast<ArrayType>(SGV.getValueType());
	TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType());
	}

	// Functions (iterating a Module yields its functions).
	for (GlobalValue &SGV : *SrcM)
	if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
	TypeMap.addTypeMapping(DGV->getType(), SGV.getType());

	// Aliases.
	for (GlobalValue &SGV : SrcM->aliases())
	if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
	TypeMap.addTypeMapping(DGV->getType(), SGV.getType());

	// Incorporate types by name, scanning all the types in the source module.
	// At this point, the destination module may have a type "%foo = { i32 }" for
	// example. When the source module got loaded into the same LLVMContext, if
	// it had the same type, it would have been renamed to "%foo.42 = { i32 }".
	std::vector<StructType *> Types = SrcM->getIdentifiedStructTypes();
	for (StructType *ST : Types) {
	if (!ST->hasName())
	continue;

	if (TypeMap.DstStructTypesSet.hasType(ST)) {
	// This is actually a type from the destination module.
	// getIdentifiedStructTypes() can have found it by walking debug info
	// metadata nodes, some of which get linked by name when ODR Type Uniquing
	// is enabled on the Context, from the source to the destination module.
	continue;
	}

	// Check to see if there is a dot in the name followed by a digit.
	// The back() != '.' test guarantees that index DotPos + 1 is in range.
	size_t DotPos = ST->getName().rfind('.');
	if (DotPos == 0 \|\| DotPos == StringRef::npos \|\|
	ST->getName().back() == '.' \|\|
	!isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1])))
	continue;

	// Check to see if the destination module has a struct with the prefix name.
	StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos));
	if (!DST)
	continue;

	// Don't use it if this actually came from the source module. They're in
	// the same LLVMContext after all. Also don't use it unless the type is
	// actually used in the destination module. This can happen in situations
	// like this:
	//
	//      Module A                         Module B
	//      --------                         --------
	//   %Z = type { %A }                 %B = type { %C.1 }
	//   %A = type { %B.1, [7 x i8] }     %C.1 = type { i8* }
	//   %B.1 = type { %C }               %A.2 = type { %B.3, [5 x i8] }
	//   %C = type { i8* }                %B.3 = type { %C.1 }
	//
	// When we link Module B with Module A, the '%B' in Module B is
	// used. However, that would then use '%C.1'. But when we process '%C.1',
	// we prefer to take the '%C' version. So we are then left with both
	// '%C.1' and '%C' being used for the same types. This leads to some
	// variables using one type and some using the other.
	if (TypeMap.DstStructTypesSet.hasType(DST))
	TypeMap.addTypeMapping(DST, ST);
	}

	// Now that we have discovered all of the type equivalences, get a body for
	// any 'opaque' types in the dest module that are now resolved.
	TypeMap.linkDefinedTypeBodies();
	}

	/// Append every element of the array-typed constant \p C to \p Dest, in index
	/// order. \p C must have an ArrayType (enforced by cast<>).
	static void getArrayElements(const Constant *C,
	                             SmallVectorImpl<Constant *> &Dest) {
	  auto *ArrTy = cast<ArrayType>(C->getType());
	  for (unsigned Idx = 0, End = ArrTy->getNumElements(); Idx != End; ++Idx)
	    Dest.push_back(C->getAggregateElement(Idx));
	}

	/// Link the appending global variable \p SrcGV into the destination module.
	///
	/// If \p DstGV is non-null it is the existing destination variable. Both must
	/// have appending linkage and agree on element type, constness, alignment,
	/// visibility, unnamed_addr and section; otherwise an error is returned.
	/// A new global sized for the combined element count is created, the merge of
	/// the initializers is scheduled on the mapper, and \p DstGV (if any) has its
	/// uses replaced by the new global before being erased.
	///
	/// \returns the new global bitcast to the mapped type of \p SrcGV, or an
	/// error describing the mismatch.
	Expected<Constant *>
	IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
	                                const GlobalVariable *SrcGV) {
	  Type *EltTy = cast<ArrayType>(TypeMap.get(SrcGV->getValueType()))
	                    ->getElementType();

	  // FIXME: This upgrade is done during linking to support the C API. Once the
	  // old form is deprecated, we should move this upgrade to
	  // llvm::UpgradeGlobalVariable() and simplify the logic here and in
	  // Mapper::mapAppendingVariable() in ValueMapper.cpp.
	  StringRef Name = SrcGV->getName();
	  bool IsNewStructor = false;
	  bool IsOldStructor = false;
	  if (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") {
	    // A three-field element struct is the new form; anything else is treated
	    // as the old form and upgraded below.
	    if (cast<StructType>(EltTy)->getNumElements() == 3)
	      IsNewStructor = true;
	    else
	      IsOldStructor = true;
	  }

	  PointerType *VoidPtrTy = Type::getInt8Ty(SrcGV->getContext())->getPointerTo();
	  if (IsOldStructor) {
	    // Rebuild the element type with a third, i8* field appended.
	    auto &ST = *cast<StructType>(EltTy);
	    Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
	    EltTy = StructType::get(SrcGV->getContext(), Tys, false);
	  }

	  uint64_t DstNumElements = 0;
	  if (DstGV) {
	    ArrayType *DstTy = cast<ArrayType>(DstGV->getValueType());
	    DstNumElements = DstTy->getNumElements();

	    if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage())
	      return stringErr(
	          "Linking globals named '" + SrcGV->getName() +
	          "': can only link appending global with another appending "
	          "global!");

	    // Check to see that the two arrays agree on type.
	    if (EltTy != DstTy->getElementType())
	      return stringErr("Appending variables with different element types!");
	    if (DstGV->isConstant() != SrcGV->isConstant())
	      return stringErr("Appending variables linked with different const'ness!");

	    if (DstGV->getAlignment() != SrcGV->getAlignment())
	      return stringErr(
	          "Appending variables with different alignment need to be linked!");

	    if (DstGV->getVisibility() != SrcGV->getVisibility())
	      return stringErr(
	          "Appending variables with different visibility need to be linked!");

	    if (DstGV->hasGlobalUnnamedAddr() != SrcGV->hasGlobalUnnamedAddr())
	      return stringErr(
	          "Appending variables with different unnamed_addr need to be linked!");

	    if (DstGV->getSection() != SrcGV->getSection())
	      return stringErr(
	          "Appending variables with different section name need to be linked!");
	  }

	  SmallVector<Constant *, 16> SrcElements;
	  getArrayElements(SrcGV->getInitializer(), SrcElements);

	  if (IsNewStructor) {
	    // Drop entries whose third field names a global that will not be linked.
	    // Entries whose third field is not a GlobalValue are always kept.
	    auto It = remove_if(SrcElements, [this](Constant *E) {
	      auto *Key =
	          dyn_cast<GlobalValue>(E->getAggregateElement(2)->stripPointerCasts());
	      if (!Key)
	        return false;
	      GlobalValue *DGV = getLinkedToGlobal(Key);
	      return !shouldLink(DGV, *Key);
	    });
	    SrcElements.erase(It, SrcElements.end());
	  }
	  uint64_t NewSize = DstNumElements + SrcElements.size();
	  ArrayType *NewType = ArrayType::get(EltTy, NewSize);

	  // Create the new global variable.
	  GlobalVariable *NG = new GlobalVariable(
	      DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(),
	      /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(),
	      SrcGV->getType()->getAddressSpace());

	  NG->copyAttributesFrom(SrcGV);
	  forceRenaming(NG, SrcGV->getName());

	  Constant *Ret = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType()));

	  // The initializer itself is filled in later by the mapper.
	  Mapper.scheduleMapAppendingVariable(*NG,
	                                      DstGV ? DstGV->getInitializer() : nullptr,
	                                      IsOldStructor, SrcElements);

	  // Replace any uses of the two global variables with uses of the new
	  // global.
	  if (DstGV) {
	    DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType()));
	    DstGV->eraseFromParent();
	  }

	  return Ret;
	}

	/// Decide whether the source global \p SGV should be linked in, given the
	/// destination global \p DGV it would be linked to (may be null).
	///
	/// \returns true if \p SGV is in ValuesToLink, has local linkage, or is added
	/// by the client's lazy callback; false if the destination already has a
	/// non-declaration, if \p SGV is only a declaration, or if body linking is
	/// already finished.
	bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) {
	  if (ValuesToLink.count(&SGV) || SGV.hasLocalLinkage())
	    return true;

	  if (DGV && !DGV->isDeclarationForLinker())
	    return false;

	  if (SGV.isDeclaration() || DoneLinkingBodies)
	    return false;

	  // Callback to the client to give a chance to lazily add the Global to the
	  // list of value to link.
	  bool LazilyAdded = false;
	  AddLazyFor(SGV, [this, &LazilyAdded](GlobalValue &GV) {
	    maybeAdd(&GV);
	    LazilyAdded = true;
	  });
	  return LazilyAdded;
	}

	/// Create or reuse the destination-module prototype for the source global
	/// \p SGV and return the constant that references to \p SGV should map to.
	///
	/// When \p ForAlias is set and \p SGV would not otherwise be linked, any
	/// existing destination global is ignored and the fresh copy is given
	/// internal linkage. Appending globals are delegated to
	/// linkAppendingVarProto().
	///
	/// \returns the mapped constant, nullptr if a new copy would be needed after
	/// body linking has finished, or an error from the appending-variable path.
	Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
	                                                    bool ForAlias) {
	  GlobalValue *DGV = getLinkedToGlobal(SGV);

	  bool ShouldLink = shouldLink(DGV, *SGV);

	  // just missing from map
	  if (ShouldLink) {
	    auto I = ValueMap.find(SGV);
	    if (I != ValueMap.end())
	      return cast<Constant>(I->second);

	    I = AliasValueMap.find(SGV);
	    if (I != AliasValueMap.end())
	      return cast<Constant>(I->second);
	  }

	  if (!ShouldLink && ForAlias)
	    DGV = nullptr;

	  // Handle the ultra special appending linkage case first.
	  assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage());
	  if (SGV->hasAppendingLinkage())
	    return linkAppendingVarProto(cast_or_null<GlobalVariable>(DGV),
	                                 cast<GlobalVariable>(SGV));

	  GlobalValue *NewGV;
	  if (DGV && !ShouldLink) {
	    NewGV = DGV;
	  } else {
	    // If we are done linking global value bodies (i.e. we are performing
	    // metadata linking), don't link in the global value due to this
	    // reference, simply map it to null.
	    if (DoneLinkingBodies)
	      return nullptr;

	    NewGV = copyGlobalValueProto(SGV, ShouldLink);
	    if (ShouldLink || !ForAlias)
	      forceRenaming(NewGV, SGV->getName());
	  }

	  // Overloaded intrinsics have overloaded types names as part of their
	  // names. If we renamed overloaded types we should rename the intrinsic
	  // as well.
	  if (Function *F = dyn_cast<Function>(NewGV))
	    if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F))
	      NewGV = Remangled.getValue();

	  if (ShouldLink || ForAlias) {
	    if (const Comdat *SC = SGV->getComdat()) {
	      if (auto *GO = dyn_cast<GlobalObject>(NewGV)) {
	        Comdat *DC = DstM.getOrInsertComdat(SC->getName());
	        DC->setSelectionKind(SC->getSelectionKind());
	        GO->setComdat(DC);
	      }
	    }
	  }

	  if (!ShouldLink && ForAlias)
	    NewGV->setLinkage(GlobalValue::InternalLinkage);

	  Constant *C = NewGV;
	- if (DGV)
	+ // Only create a bitcast if necessary. In particular, with
	+ // DebugTypeODRUniquing we may reach metadata in the destination module
	+ // containing a GV from the source module, in which case SGV will be
	+ // the same as DGV and NewGV, and TypeMap.get() will assert since it
	+ // assumes it is being invoked on a type in the source module.
	+ if (DGV && NewGV != SGV)
	    C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType()));

	  if (DGV && NewGV != DGV) {
	    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType()));
	    DGV->eraseFromParent();
	  }

	  return C;
	}

	/// Schedule the mapping of \p Src's initializer onto \p Dst. The mapper does
	/// the actual work later, once all referenced globals are in the destination.
	void IRLinker::linkGlobalVariable(GlobalVariable &Dst, GlobalVariable &Src) {
	  Constant &SrcInit = *Src.getInitializer();
	  Mapper.scheduleMapGlobalInitializer(Dst, SrcInit);
	}

	/// Move the body of \p Src into \p Dst and schedule it for remapping.
	///
	/// \p Dst must be a declaration and \p Src must not be. Prefix data, prologue
	/// data, the personality function and metadata attachments are copied over
	/// unmapped; the arguments and basic blocks are moved rather than cloned.
	///
	/// \returns the materialization error if \p Src cannot be materialized,
	/// success otherwise.
	Error IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
	  assert(Dst.isDeclaration() && !Src.isDeclaration());

	  // Make sure the source body is actually loaded before touching it.
	  if (Error MaterializeErr = Src.materialize())
	    return MaterializeErr;

	  // Optional function operands are taken over as-is; remapping happens later.
	  if (Src.hasPrefixData()) {
	    Dst.setPrefixData(Src.getPrefixData());
	  }
	  if (Src.hasPrologueData()) {
	    Dst.setPrologueData(Src.getPrologueData());
	  }
	  if (Src.hasPersonalityFn()) {
	    Dst.setPersonalityFn(Src.getPersonalityFn());
	  }

	  // Metadata attachments are likewise copied without remapping.
	  Dst.copyMetadata(&Src, 0);

	  // Take the argument list, then append all of Src's blocks to Dst.
	  Dst.stealArgumentListFrom(Src);
	  auto &SrcBlocks = Src.getBasicBlockList();
	  Dst.getBasicBlockList().splice(Dst.end(), SrcBlocks);

	  // With everything in place, queue the function for remapping.
	  Mapper.scheduleRemapFunction(Dst);
	  return Error::success();
	}

	/// Schedule the mapping of \p Src's aliasee onto \p Dst, using the alias
	/// materializer context id.
	void IRLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) {
	  Constant &SrcAliasee = *Src.getAliasee();
	  Mapper.scheduleMapGlobalAliasee(Dst, SrcAliasee, AliasMCID);
	}

	/// Dispatch body linking on the dynamic kind of \p Src: function, global
	/// variable, or (by elimination) alias. Only the function path can fail.
	Error IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
	  if (auto *SrcFn = dyn_cast<Function>(&Src))
	    return linkFunctionBody(cast<Function>(Dst), *SrcFn);

	  if (auto *SrcVar = dyn_cast<GlobalVariable>(&Src))
	    linkGlobalVariable(cast<GlobalVariable>(Dst), *SrcVar);
	  else
	    linkAliasBody(cast<GlobalAlias>(Dst), cast<GlobalAlias>(Src));

	  return Error::success();
	}

	void IRLinker::prepareCompileUnitsForImport() {
	NamedMDNode *SrcCompileUnits = SrcM->getNamedMetadata("llvm.dbg.cu");
	if (!SrcCompileUnits)
	return;
	// When importing for ThinLTO, prevent importing of types listed on
	// the DICompileUnit that we don't need a copy of in the importing
	// module. They will be emitted by the originating module.
	for (unsigned I = 0, E = SrcCompileUnits->getNumOperands(); I != E; ++I) {
	auto *CU = cast<DICompileUnit>(SrcCompileUnits->getOperand(I));
	assert(CU && "Expected valid compile unit");
	// Enums, macros, and retained types don't need to be listed on the
	// imported DICompileUnit. This means they will only be imported
	// if reached from the mapped IR. Do this by setting their value map
	// entries to nullptr, which will automatically prevent their importing
	// when reached from the DICompileUnit during metadata mapping.
	ValueMap.MD()[CU->getRawEnumTypes()].reset(nullptr);
	ValueMap.MD()[CU->getRawMacros()].reset(nullptr);
	ValueMap.MD()[CU->getRawRetainedTypes()].reset(nullptr);
	// If we ever start importing global variable defs, we'll need to
	// add their DIGlobalVariable to the globals list on the imported
	// DICompileUnit. Confirm none are imported, and then we can
	// map the list of global variables to nullptr.
	assert(none_of(
	ValuesToLink,
	[](const GlobalValue *GV) { return isa<GlobalVariable>(GV); }) &&
	"Unexpected importing of a GlobalVariable definition");
	ValueMap.MD()[CU->getRawGlobalVariables()].reset(nullptr);

	// Imported entities only need to be mapped in if they have local
	// scope, as those might correspond to an imported entity inside a
	// function being imported (any locally scoped imported entities that
	// don't end up referenced by an imported function will not be emitted
	// into the object). Imported entities not in a local scope
	// (e.g. on the namespace) only need to be emitted by the originating
	// module. Create a list of the locally scoped imported entities, and
	// replace the source CUs imported entity list with the new list, so
	// only those are mapped in.
	// FIXME: Locally-scoped imported entities could be moved to the
	// functions they are local to instead of listing them on the CU, and
	// we would naturally only link in those needed by function importing.
	SmallVector<TrackingMDNodeRef, 4> AllImportedModules;
	bool ReplaceImportedEntities = false;
	for (auto *IE : CU->getImportedEntities()) {
	DIScope *Scope = IE->getScope();
	assert(Scope && "Invalid Scope encoding!");
	if (isa<DILocalScope>(Scope))
	AllImportedModules.emplace_back(IE);
	else
	ReplaceImportedEntities = true;
	}
	if (ReplaceImportedEntities) {
	if (!AllImportedModules.empty())
	CU->replaceImportedEntities(MDTuple::get(
	CU->getContext(),
	SmallVector<Metadata *, 16>(AllImportedModules.begin(),
	AllImportedModules.end())));
	else
	// If there were no local scope imported entities, we can map
	// the whole list to nullptr.
	ValueMap.MD()[CU->getRawImportedEntities()].reset(nullptr);
	}
	}
	}

	/// Copy every named metadata node of the source module, except the module
	/// flags node, into the destination module. Each operand is mapped through
	/// the value mapper before being appended.
	void IRLinker::linkNamedMDNodes() {
	  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
	  for (const NamedMDNode &SrcNMD : SrcM->named_metadata()) {
	    // Module flags have their own merge logic and are handled separately.
	    if (&SrcNMD != SrcModFlags) {
	      NamedMDNode *DstNMD = DstM.getOrInsertNamedMetadata(SrcNMD.getName());
	      for (const MDNode *SrcOp : SrcNMD.operands()) {
	        DstNMD->addOperand(Mapper.mapMDNode(*SrcOp));
	      }
	    }
	  }
	}

	/// Merge the module flags of the source module into the destination module.
	///
	/// If the destination has no flags, the source flags are copied verbatim.
	/// Otherwise each source flag is merged by its behavior (Require, Override,
	/// Error, Warning, Max, Append, AppendUnique), and finally every collected
	/// requirement is checked against the merged flag values.
	///
	/// \returns an error for conflicting override values, mismatched behaviors,
	/// conflicting Error-behavior values or an unmet requirement; success
	/// otherwise. Warning-behavior conflicts only emit a warning.
	Error IRLinker::linkModuleFlagsMetadata() {
	  // If the source module has no module flags, we are done.
	  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
	  if (!SrcModFlags)
	    return Error::success();

	  // If the destination module doesn't have module flags yet, then just copy
	  // over the source module's flags.
	  NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata();
	  if (DstModFlags->getNumOperands() == 0) {
	    for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I)
	      DstModFlags->addOperand(SrcModFlags->getOperand(I));

	    return Error::success();
	  }

	  // First build a map of the existing module flags and requirements.
	  // Flags maps a flag ID to its current node and that node's operand index
	  // in DstModFlags.
	  DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags;
	  SmallSetVector<MDNode *, 16> Requirements;
	  for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) {
	    MDNode *Op = DstModFlags->getOperand(I);
	    ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0));
	    MDString *ID = cast<MDString>(Op->getOperand(1));

	    if (Behavior->getZExtValue() == Module::Require) {
	      Requirements.insert(cast<MDNode>(Op->getOperand(2)));
	    } else {
	      Flags[ID] = std::make_pair(Op, I);
	    }
	  }

	  // Merge in the flags from the source module, and also collect its set of
	  // requirements.
	  for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) {
	    MDNode *SrcOp = SrcModFlags->getOperand(I);
	    ConstantInt *SrcBehavior =
	        mdconst::extract<ConstantInt>(SrcOp->getOperand(0));
	    MDString *ID = cast<MDString>(SrcOp->getOperand(1));
	    MDNode *DstOp;
	    unsigned DstIndex;
	    // lookup() yields a default-constructed pair (null node) for unknown IDs.
	    std::tie(DstOp, DstIndex) = Flags.lookup(ID);
	    unsigned SrcBehaviorValue = SrcBehavior->getZExtValue();

	    // If this is a requirement, add it and continue.
	    if (SrcBehaviorValue == Module::Require) {
	      // If the destination module does not already have this requirement, add
	      // it.
	      if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) {
	        DstModFlags->addOperand(SrcOp);
	      }
	      continue;
	    }

	    // If there is no existing flag with this ID, just add it.
	    if (!DstOp) {
	      Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands());
	      DstModFlags->addOperand(SrcOp);
	      continue;
	    }

	    // Otherwise, perform a merge.
	    ConstantInt *DstBehavior =
	        mdconst::extract<ConstantInt>(DstOp->getOperand(0));
	    unsigned DstBehaviorValue = DstBehavior->getZExtValue();

	    // Replace the destination flag node wholesale with the source node.
	    auto overrideDstValue = [&]() {
	      DstModFlags->setOperand(DstIndex, SrcOp);
	      Flags[ID].first = SrcOp;
	    };

	    // If either flag has override behavior, handle it first.
	    if (DstBehaviorValue == Module::Override) {
	      // Diagnose inconsistent flags which both have override behavior.
	      if (SrcBehaviorValue == Module::Override &&
	          SrcOp->getOperand(2) != DstOp->getOperand(2))
	        return stringErr("linking module flags '" + ID->getString() +
	                         "': IDs have conflicting override values");
	      continue;
	    } else if (SrcBehaviorValue == Module::Override) {
	      // Update the destination flag to that of the source.
	      overrideDstValue();
	      continue;
	    }

	    // Diagnose inconsistent merge behavior types.
	    if (SrcBehaviorValue != DstBehaviorValue)
	      return stringErr("linking module flags '" + ID->getString() +
	                       "': IDs have conflicting behaviors");

	    // Keep the destination behavior and ID but install a new value node.
	    auto replaceDstValue = [&](MDNode *New) {
	      Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
	      MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps);
	      DstModFlags->setOperand(DstIndex, Flag);
	      Flags[ID].first = Flag;
	    };

	    // Perform the merge for standard behavior types.
	    switch (SrcBehaviorValue) {
	    case Module::Require:
	    case Module::Override:
	      llvm_unreachable("not possible");
	    case Module::Error: {
	      // Emit an error if the values differ.
	      if (SrcOp->getOperand(2) != DstOp->getOperand(2))
	        return stringErr("linking module flags '" + ID->getString() +
	                         "': IDs have conflicting values");
	      continue;
	    }
	    case Module::Warning: {
	      // Emit a warning if the values differ.
	      if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
	        emitWarning("linking module flags '" + ID->getString() +
	                    "': IDs have conflicting values");
	      }
	      continue;
	    }
	    case Module::Max: {
	      ConstantInt *DstValue =
	          mdconst::extract<ConstantInt>(DstOp->getOperand(2));
	      ConstantInt *SrcValue =
	          mdconst::extract<ConstantInt>(SrcOp->getOperand(2));
	      if (SrcValue->getZExtValue() > DstValue->getZExtValue())
	        overrideDstValue();
	      break;
	    }
	    case Module::Append: {
	      MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
	      MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
	      SmallVector<Metadata *, 8> MDs;
	      MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands());
	      MDs.append(DstValue->op_begin(), DstValue->op_end());
	      MDs.append(SrcValue->op_begin(), SrcValue->op_end());

	      replaceDstValue(MDNode::get(DstM.getContext(), MDs));
	      break;
	    }
	    case Module::AppendUnique: {
	      SmallSetVector<Metadata *, 16> Elts;
	      MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
	      MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
	      Elts.insert(DstValue->op_begin(), DstValue->op_end());
	      Elts.insert(SrcValue->op_begin(), SrcValue->op_end());

	      replaceDstValue(MDNode::get(DstM.getContext(),
	                                  makeArrayRef(Elts.begin(), Elts.end())));
	      break;
	    }
	    }
	  }

	  // Check all of the requirements.
	  for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
	    MDNode *Requirement = Requirements[I];
	    MDString *Flag = cast<MDString>(Requirement->getOperand(0));
	    Metadata *ReqValue = Requirement->getOperand(1);

	    MDNode *Op = Flags[Flag].first;
	    if (!Op || Op->getOperand(2) != ReqValue)
	      return stringErr("linking module flags '" + Flag->getString() +
	                       "': does not have the required value");
	  }
	  return Error::success();
	}

	/// Return \p InlineAsm adjusted with target-specific directives if required.
	/// For ARM and Thumb, we have to add directives to select the appropriate ISA
	/// to support mixing module-level inline assembly from ARM and Thumb modules.
	///
	/// \param InlineAsm the module-level inline assembly text.
	/// \param Triple    the triple of the module that text came from.
	/// \returns the text prefixed with a section/alignment/ISA header for the
	/// thumb, thumbeb, arm and armeb architectures; unchanged otherwise.
	static std::string adjustInlineAsm(const std::string &InlineAsm,
	                                   const Triple &Triple) {
	  if (Triple.getArch() == Triple::thumb || Triple.getArch() == Triple::thumbeb)
	    return ".text\n.balign 2\n.thumb\n" + InlineAsm;
	  if (Triple.getArch() == Triple::arm || Triple.getArch() == Triple::armeb)
	    return ".text\n.balign 4\n.arm\n" + InlineAsm;
	  return InlineAsm;
	}

	/// Drive the whole link of the source module into the destination module.
	///
	/// Steps, in order: materialize source metadata, reconcile data layout and
	/// target triple (warning on mismatch), append module inline asm unless
	/// importing, compute type mappings, map every value on the worklist, then
	/// link named metadata and module flags.
	///
	/// \returns the first error hit while materializing, mapping values or
	/// merging module flags; success otherwise.
	Error IRLinker::run() {
	  // Ensure metadata materialized before value mapping.
	  if (SrcM->getMaterializer())
	    if (Error Err = SrcM->getMaterializer()->materializeMetadata())
	      return Err;

	  // Inherit the target data from the source module if the destination module
	  // doesn't have one already.
	  if (DstM.getDataLayout().isDefault())
	    DstM.setDataLayout(SrcM->getDataLayout());

	  if (SrcM->getDataLayout() != DstM.getDataLayout()) {
	    emitWarning("Linking two modules of different data layouts: '" +
	                SrcM->getModuleIdentifier() + "' is '" +
	                SrcM->getDataLayoutStr() + "' whereas '" +
	                DstM.getModuleIdentifier() + "' is '" +
	                DstM.getDataLayoutStr() + "'\n");
	  }

	  // Copy the target triple from the source to dest if the dest's is empty.
	  if (DstM.getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
	    DstM.setTargetTriple(SrcM->getTargetTriple());

	  Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM.getTargetTriple());

	  // The module identifier is quoted on both sides, matching the data-layout
	  // warning above.
	  if (!SrcM->getTargetTriple().empty() &&
	      !SrcTriple.isCompatibleWith(DstTriple))
	    emitWarning("Linking two modules of different target triples: '" +
	                SrcM->getModuleIdentifier() + "' is '" +
	                SrcM->getTargetTriple() + "' whereas '" +
	                DstM.getModuleIdentifier() + "' is '" + DstM.getTargetTriple() +
	                "'\n");

	  DstM.setTargetTriple(SrcTriple.merge(DstTriple));

	  // Append the module inline asm string.
	  if (!IsPerformingImport && !SrcM->getModuleInlineAsm().empty()) {
	    std::string SrcModuleInlineAsm = adjustInlineAsm(SrcM->getModuleInlineAsm(),
	                                                     SrcTriple);
	    if (DstM.getModuleInlineAsm().empty())
	      DstM.setModuleInlineAsm(SrcModuleInlineAsm);
	    else
	      DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" +
	                              SrcModuleInlineAsm);
	  }

	  // Loop over all of the linked values to compute type mappings.
	  computeTypeMapping();

	  // The worklist is consumed from the back, so reverse it first to visit the
	  // values in their original order.
	  std::reverse(Worklist.begin(), Worklist.end());
	  while (!Worklist.empty()) {
	    GlobalValue *GV = Worklist.back();
	    Worklist.pop_back();

	    // Already mapped.
	    if (ValueMap.find(GV) != ValueMap.end() ||
	        AliasValueMap.find(GV) != AliasValueMap.end())
	      continue;

	    assert(!GV->isDeclaration());
	    Mapper.mapValue(*GV);
	    if (FoundError)
	      return std::move(*FoundError);
	  }

	  // Note that we are done linking global value bodies. This prevents
	  // metadata linking from creating new references.
	  DoneLinkingBodies = true;
	  Mapper.addFlags(RF_NullMapMissingGlobalValues);

	  // Remap all of the named MDNodes in Src into the DstM module. We do this
	  // after linking GlobalValues so that MDNodes that reference GlobalValues
	  // are properly remapped.
	  linkNamedMDNodes();

	  // Merge the module flags into the DstM module.
	  return linkModuleFlagsMetadata();
	}

	// Build a key directly from an element-type list and a packed flag.
	IRMover::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P)
	: ETypes(E), IsPacked(P) {}

	// Build a key from an existing struct type's element list and packed flag.
	IRMover::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST)
	: ETypes(ST->elements()), IsPacked(ST->isPacked()) {}

	// Two keys are equal when both the packed flag and the element lists match.
	bool IRMover::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const {
	  if (IsPacked != That.IsPacked)
	    return false;
	  return ETypes == That.ETypes;
	}

	// Negation of operator==.
	bool IRMover::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const {
	  return !(*this == That);
	}

	// Empty-key sentinel: forwarded from the pointer-based DenseMapInfo.
	StructType *IRMover::StructTypeKeyInfo::getEmptyKey() {
	return DenseMapInfo<StructType *>::getEmptyKey();
	}

	// Tombstone-key sentinel: forwarded from the pointer-based DenseMapInfo.
	StructType *IRMover::StructTypeKeyInfo::getTombstoneKey() {
	return DenseMapInfo<StructType *>::getTombstoneKey();
	}

	// Hash a key from its element types combined with its packed flag.
	unsigned IRMover::StructTypeKeyInfo::getHashValue(const KeyTy &Key) {
	  auto ElementsHash =
	      hash_combine_range(Key.ETypes.begin(), Key.ETypes.end());
	  return hash_combine(ElementsHash, Key.IsPacked);
	}

	unsigned IRMover::StructTypeKeyInfo::getHashValue(const StructType *ST) {
	return getHashValue(KeyTy(ST));
	}

	// Compare a lookup key against a stored struct type. The sentinel keys never
	// match a real key and must not be dereferenced.
	bool IRMover::StructTypeKeyInfo::isEqual(const KeyTy &LHS,
	                                         const StructType *RHS) {
	  if (RHS == getEmptyKey() || RHS == getTombstoneKey())
	    return false;
	  return LHS == KeyTy(RHS);
	}

	// Compare two stored struct types structurally. When RHS is a sentinel key
	// it is compared by pointer identity only, since it cannot be dereferenced.
	bool IRMover::StructTypeKeyInfo::isEqual(const StructType *LHS,
	                                         const StructType *RHS) {
	  if (RHS == getEmptyKey() || RHS == getTombstoneKey())
	    return LHS == RHS;
	  return KeyTy(LHS) == KeyTy(RHS);
	}

	// Record a struct type that has a body. Ty must not be opaque.
	void IRMover::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) {
	assert(!Ty->isOpaque());
	NonOpaqueStructTypes.insert(Ty);
	}

	// Move a struct type that has just received a body from the opaque set to
	// the non-opaque set. Ty must already have been registered as opaque.
	void IRMover::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) {
	  assert(!Ty->isOpaque());
	  NonOpaqueStructTypes.insert(Ty);
	  const bool WasOpaque = OpaqueStructTypes.erase(Ty);
	  assert(WasOpaque);
	  // Silence the unused-variable warning in builds without assertions.
	  (void)WasOpaque;
	}

	// Record a struct type that has no body yet. Ty must be opaque.
	void IRMover::IdentifiedStructTypeSet::addOpaque(StructType *Ty) {
	assert(Ty->isOpaque());
	OpaqueStructTypes.insert(Ty);
	}

	// Look up a registered non-opaque struct type with the given element types
	// and packed flag; returns nullptr when none is registered.
	StructType *
	IRMover::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes,
	                                                bool IsPacked) {
	  auto Match = NonOpaqueStructTypes.find_as(
	      IRMover::StructTypeKeyInfo::KeyTy(ETypes, IsPacked));
	  if (Match == NonOpaqueStructTypes.end())
	    return nullptr;
	  return *Match;
	}

	// Report whether exactly this struct type object is registered. For
	// non-opaque types the set lookup is structural, so the hit is additionally
	// compared by pointer.
	bool IRMover::IdentifiedStructTypeSet::hasType(StructType *Ty) {
	  if (Ty->isOpaque())
	    return OpaqueStructTypes.count(Ty);

	  auto Found = NonOpaqueStructTypes.find(Ty);
	  return Found != NonOpaqueStructTypes.end() && *Found == Ty;
	}

	// Set up a mover that links into module M: register every struct type found
	// in M as opaque or non-opaque, and self-map the metadata visited on the way.
	IRMover::IRMover(Module &M) : Composite(M) {
	  TypeFinder StructTypes;
	  StructTypes.run(M, /* OnlyNamed */ false);

	  for (StructType *Ty : StructTypes) {
	    if (!Ty->isOpaque()) {
	      IdentifiedStructTypes.addNonOpaque(Ty);
	      continue;
	    }
	    IdentifiedStructTypes.addOpaque(Ty);
	  }

	  // Self-map metadatas in the destination module. This is needed when
	  // DebugTypeODRUniquing is enabled on the LLVMContext, since metadata in the
	  // destination module may be reached from the source module.
	  for (auto *MD : StructTypes.getVisitedMetadata())
	    SharedMDs[MD].reset(const_cast<MDNode *>(MD));
	}

	// Link the given values of Src into the composite module with a one-shot
	// IRLinker, then drop constant arrays that became trivially dead. The
	// cleanup runs whether or not the link succeeded.
	Error IRMover::move(
	    std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink,
	    std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor,
	    bool IsPerformingImport) {
	  IRLinker Linker(Composite, SharedMDs, IdentifiedStructTypes,
	                  std::move(Src), ValuesToLink, std::move(AddLazyFor),
	                  IsPerformingImport);
	  Error LinkResult = Linker.run();
	  Composite.dropTriviallyDeadConstantArrays();
	  return LinkResult;
	}
	Index: vendor/llvm/dist-release_60/lib/MC/MCCodeView.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/MC/MCCodeView.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/MC/MCCodeView.cpp (revision 328362)
	@@ -1,634 +1,703 @@
	//===- MCCodeView.cpp - Machine Code CodeView support ---------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Holds state from .cv_file and .cv_loc directives for later emission.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/MC/MCCodeView.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/DebugInfo/CodeView/CodeView.h"
	#include "llvm/DebugInfo/CodeView/Line.h"
	#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
	#include "llvm/MC/MCAsmLayout.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCObjectStreamer.h"
	#include "llvm/MC/MCValue.h"
	#include "llvm/Support/EndianStream.h"

	using namespace llvm;
	using namespace llvm::codeview;

	// Nothing to set up here; the constructor body is empty.
	CodeViewContext::CodeViewContext() {}

	CodeViewContext::~CodeViewContext() {
	  // Once the string table fragment has been inserted into a stream it is no
	  // longer ours to free.
	  if (InsertedStrTabFragment)
	    return;

	  // Strings were added but the table was never emitted: clean up the
	  // fragment here.
	  delete StrTabFragment;
	}

	/// Report whether \p FileNumber may be used with .cv_loc, i.e. whether a
	/// .cv_file has already assigned it.
	bool CodeViewContext::isValidFileNumber(unsigned FileNumber) const {
	  // File numbers are 1-based. A FileNumber of 0 wraps to the largest unsigned
	  // value here and therefore fails the bounds test.
	  const unsigned Slot = FileNumber - 1;
	  if (Slot >= Files.size())
	    return false;
	  return Files[Slot].Assigned;
	}

	// Register file number FileNumber (1-based) with the given name and checksum.
	// Returns false if that number was already assigned, true once the entry has
	// been filled in.
	bool CodeViewContext::addFile(MCStreamer &OS, unsigned FileNumber,
	StringRef Filename,
	ArrayRef<uint8_t> ChecksumBytes,
	uint8_t ChecksumKind) {
	assert(FileNumber > 0);
	// Intern the name as given; this happens before the duplicate check below,
	// so the string table is touched even when the call ends up returning false.
	auto FilenameOffset = addToStringTable(Filename);
	Filename = FilenameOffset.first;
	unsigned Idx = FileNumber - 1;
	// Grow the table so that Idx is addressable.
	if (Idx >= Files.size())
	Files.resize(Idx + 1);

	// An empty name is replaced by a placeholder.
	if (Filename.empty())
	Filename = "<stdin>";

	if (Files[Idx].Assigned)
	return false;

	// Intern again so that the placeholder, if used, gets a table offset too.
	FilenameOffset = addToStringTable(Filename);
	Filename = FilenameOffset.first;
	unsigned Offset = FilenameOffset.second;

	// The checksum table offset is a temporary symbol; it is given a value when
	// the checksum table is emitted.
	auto ChecksumOffsetSymbol =
	OS.getContext().createTempSymbol("checksum_offset", false);
	Files[Idx].StringTableOffset = Offset;
	Files[Idx].ChecksumTableOffset = ChecksumOffsetSymbol;
	Files[Idx].Assigned = true;
	Files[Idx].Checksum = ChecksumBytes;
	Files[Idx].ChecksumKind = ChecksumKind;

	return true;
	}

	+// Return the info recorded for FuncId, or nullptr when the id is out of
	+// range or its slot has not been allocated.
	+MCCVFunctionInfo *CodeViewContext::getCVFunctionInfo(unsigned FuncId) {
	+  const bool IsRecorded = FuncId < Functions.size() &&
	+                          !Functions[FuncId].isUnallocatedFunctionInfo();
	+  return IsRecorded ? &Functions[FuncId] : nullptr;
	+}
	+
	bool CodeViewContext::recordFunctionId(unsigned FuncId) {
	if (FuncId >= Functions.size())
	Functions.resize(FuncId + 1);

	// Return false if this function info was already allocated.
	if (!Functions[FuncId].isUnallocatedFunctionInfo())
	return false;

	// Mark this as an allocated normal function, and leave the rest alone.
	Functions[FuncId].ParentFuncIdPlusOne = MCCVFunctionInfo::FunctionSentinel;
	return true;
	}

	// Allocate FuncId as an inlined call site of function IAFunc, called from
	// file IAFile, line IALine, column IACol, and register it in the
	// InlinedAtMap of every transitive caller.
	//
	// Returns false, without recording anything, if FuncId is already allocated
	// or if IAFunc does not name a previously recorded function or inline site;
	// returns true on success.
	bool CodeViewContext::recordInlinedCallSiteId(unsigned FuncId, unsigned IAFunc,
	                                              unsigned IAFile, unsigned IALine,
	                                              unsigned IACol) {
	  if (FuncId >= Functions.size())
	    Functions.resize(FuncId + 1);

	  // Return false if this function info was already allocated.
	  if (!Functions[FuncId].isUnallocatedFunctionInfo())
	    return false;

	  // The walk below dereferences the parent's info, so reject a parent that
	  // was never recorded. FuncId is still unallocated at this point, which
	  // also rejects a site that names itself as its own parent.
	  if (!getCVFunctionInfo(IAFunc))
	    return false;

	  MCCVFunctionInfo::LineInfo InlinedAt;
	  InlinedAt.File = IAFile;
	  InlinedAt.Line = IALine;
	  InlinedAt.Col = IACol;

	  // Mark this as an inlined call site and record call site line info.
	  MCCVFunctionInfo *Info = &Functions[FuncId];
	  Info->ParentFuncIdPlusOne = IAFunc + 1;
	  Info->InlinedAt = InlinedAt;

	  // Walk up the call chain adding this function id to the InlinedAtMap of all
	  // transitive callers until we hit a real function.
	  while (Info->isInlinedCallSite()) {
	    InlinedAt = Info->InlinedAt;
	    Info = getCVFunctionInfo(Info->getParentFuncId());
	    assert(Info && "inline site chain refers to an unrecorded function id");
	    Info->InlinedAtMap[FuncId] = InlinedAt;
	  }

	  return true;
	}

	// Return the string table fragment, creating it on first use.
	MCDataFragment *CodeViewContext::getStringTableFragment() {
	  if (StrTabFragment)
	    return StrTabFragment;

	  StrTabFragment = new MCDataFragment();
	  // Start a new string table out with a null byte.
	  StrTabFragment->getContents().push_back('\0');
	  return StrTabFragment;
	}

	// Intern S in the string table. Returns the table-owned copy of the string
	// together with its offset; the bytes are appended to the fragment only the
	// first time a given string is seen.
	std::pair<StringRef, unsigned> CodeViewContext::addToStringTable(StringRef S) {
	  SmallVectorImpl<char> &Contents = getStringTableFragment()->getContents();
	  const unsigned NextOffset = unsigned(Contents.size());
	  auto Insertion = StringTable.insert(std::make_pair(S, NextOffset));

	  // Return the string from the table, since it is stable.
	  StringRef Stored = Insertion.first->first();
	  const unsigned StoredOffset = Insertion.first->second;
	  if (Insertion.second) {
	    // The string map key is always null terminated, so the terminator is
	    // copied along with the characters.
	    Contents.append(Stored.begin(), Stored.end() + 1);
	  }
	  return std::make_pair(Stored, StoredOffset);
	}

	// Return the string table offset of S, which must already be in the table
	// unless it is empty.
	unsigned CodeViewContext::getStringTableOffset(StringRef S) {
	  // A string table offset of zero is always the empty string.
	  if (!S.empty()) {
	    auto Entry = StringTable.find(S);
	    assert(Entry != StringTable.end());
	    return Entry->second;
	  }
	  return 0;
	}

	// Emit the string table subsection: kind, length, the string data fragment
	// (first call only), then padding to a 4-byte boundary.
	void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
	  MCContext &Ctx = OS.getContext();
	  MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false);
	  MCSymbol *StringEnd = Ctx.createTempSymbol("strtab_end", false);

	  // Subsection header: kind, then the length as the label difference.
	  OS.EmitIntValue(unsigned(DebugSubsectionKind::StringTable), 4);
	  OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
	  OS.EmitLabel(StringBegin);

	  // Put the string table data fragment here, if we haven't already put it
	  // somewhere else. If somebody wants two string tables in their .s file, one
	  // will just be empty.
	  if (!InsertedStrTabFragment) {
	    MCDataFragment *Fragment = getStringTableFragment();
	    OS.insert(Fragment);
	    InsertedStrTabFragment = true;
	  }

	  OS.EmitValueToAlignment(4, 0);

	  OS.EmitLabel(StringEnd);
	}

	// Emit the file checksums subsection, one entry per slot in Files, and
	// assign each file's checksum-table-offset symbol its offset within the
	// table. Sets ChecksumOffsetsAssigned once the table has been emitted.
	void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
	  // Do nothing if there are no file checksums. Microsoft's linker rejects empty
	  // CodeView substreams.
	  if (Files.empty())
	    return;

	  MCContext &Ctx = OS.getContext();
	  MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
	           *FileEnd = Ctx.createTempSymbol("filechecksums_end", false);

	  OS.EmitIntValue(unsigned(DebugSubsectionKind::FileChecksums), 4);
	  OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
	  OS.EmitLabel(FileBegin);

	  unsigned CurrentOffset = 0;

	  // Emit an array of FileChecksum entries. We index into this table using the
	  // user-provided file number. Each entry may be a variable number of bytes
	  // determined by the checksum kind and size.
	  // Iterate by reference: the entries are only read, so no per-file copy is
	  // needed.
	  for (const auto &File : Files) {
	    OS.EmitAssignment(File.ChecksumTableOffset,
	                      MCConstantExpr::create(CurrentOffset, Ctx));
	    CurrentOffset += 4; // String table offset.
	    if (!File.ChecksumKind) {
	      CurrentOffset +=
	          4; // One byte each for checksum size and kind, then align to 4 bytes.
	    } else {
	      CurrentOffset += 2; // One byte each for checksum size and kind.
	      CurrentOffset += File.Checksum.size();
	      CurrentOffset = alignTo(CurrentOffset, 4);
	    }

	    OS.EmitIntValue(File.StringTableOffset, 4);

	    if (!File.ChecksumKind) {
	      // There is no checksum. Therefore zero the next two fields and align
	      // back to 4 bytes.
	      OS.EmitIntValue(0, 4);
	      continue;
	    }
	    OS.EmitIntValue(static_cast<uint8_t>(File.Checksum.size()), 1);
	    OS.EmitIntValue(File.ChecksumKind, 1);
	    OS.EmitBytes(toStringRef(File.Checksum));
	    OS.EmitValueToAlignment(4);
	  }

	  OS.EmitLabel(FileEnd);

	  ChecksumOffsetsAssigned = true;
	}

	// Emit the 4-byte checksum table offset of file number FileNo. The file
	// table is grown if FileNo is beyond its end. If the checksum table has
	// already been emitted the symbol's value is written directly; otherwise a
	// reference to the symbol is emitted so the value can be fixed up later.
	void CodeViewContext::emitFileChecksumOffset(MCObjectStreamer &OS,
	                                             unsigned FileNo) {
	  const unsigned Slot = FileNo - 1;
	  if (Slot >= Files.size())
	    Files.resize(Slot + 1);

	  if (!ChecksumOffsetsAssigned) {
	    const MCSymbolRefExpr *OffsetRef = MCSymbolRefExpr::create(
	        Files[Slot].ChecksumTableOffset, OS.getContext());
	    OS.EmitValueImpl(OffsetRef, 4);
	    return;
	  }

	  OS.EmitSymbolValue(Files[Slot].ChecksumTableOffset, 4);
	+}
	+
	+// Append LineEntry to the line list and extend the [start, stop) extent of
	+// its function id to cover it. The start is fixed by the first entry seen
	+// for that id; only the stop moves afterwards.
	+void CodeViewContext::addLineEntry(const MCCVLineEntry &LineEntry) {
	+  const size_t Pos = MCCVLines.size();
	+  const size_t PastPos = Pos + 1;
	+  auto Inserted =
	+      MCCVLineStartStop.insert({LineEntry.getFunctionId(), {Pos, PastPos}});
	+  if (!Inserted.second)
	+    Inserted.first->second.second = PastPos;
	+  MCCVLines.push_back(LineEntry);
	+}
	+
	+// Build the line entries that belong in the line table of FuncId.
	+//
	+// Entries recorded directly for FuncId are copied through. Entries in
	+// FuncId's extent that belong to a function inlined into it are replaced by
	+// one synthesized entry at the inlined call site, skipping consecutive
	+// duplicates. Returns an empty vector if FuncId has no recorded lines.
	+std::vector<MCCVLineEntry>
	+CodeViewContext::getFunctionLineEntries(unsigned FuncId) {
	+  std::vector<MCCVLineEntry> FilteredLines;
	+  auto I = MCCVLineStartStop.find(FuncId);
	+  if (I == MCCVLineStartStop.end())
	+    return FilteredLines;
	+
	+  // May be null if lines were recorded for an id that was never allocated;
	+  // in that case no inlined locations can be attributed to it.
	+  MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(FuncId);
	+  for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
	+       ++Idx) {
	+    unsigned LocationFuncId = MCCVLines[Idx].getFunctionId();
	+    if (LocationFuncId == FuncId) {
	+      // This was a .cv_loc directly for FuncId, so record it.
	+      FilteredLines.push_back(MCCVLines[Idx]);
	+      continue;
	+    }
	+    if (!SiteInfo)
	+      continue;
	+
	+    // Check if the current location is inlined in this function. If it is,
	+    // synthesize a statement .cv_loc at the original inlined call site.
	+    auto InlinedIt = SiteInfo->InlinedAtMap.find(LocationFuncId);
	+    if (InlinedIt == SiteInfo->InlinedAtMap.end())
	+      continue;
	+
	+    MCCVFunctionInfo::LineInfo &IA = InlinedIt->second;
	+    // Only add the location if it differs from the previous location.
	+    // Large inlined calls will have many .cv_loc entries and we only need
	+    // one line table entry in the parent function.
	+    if (FilteredLines.empty() ||
	+        FilteredLines.back().getFileNum() != IA.File ||
	+        FilteredLines.back().getLine() != IA.Line ||
	+        FilteredLines.back().getColumn() != IA.Col) {
	+      FilteredLines.push_back(MCCVLineEntry(
	+          MCCVLines[Idx].getLabel(),
	+          MCCVLoc(FuncId, IA.File, IA.Line, IA.Col, false, false)));
	+    }
	+  }
	+  return FilteredLines;
	+}
	+
	// Return the (first, second) index pair stored in MCCVLineStartStop for
	// FuncId, or {~0ULL, 0} when FuncId has no entry.
	+std::pair<size_t, size_t> CodeViewContext::getLineExtent(unsigned FuncId) {
	+ auto I = MCCVLineStartStop.find(FuncId);
	+ // Return an empty extent if there are no cv_locs for this function id.
	+ if (I == MCCVLineStartStop.end())
	+ return {~0ULL, 0};
	+ return I->second;
	+}
	+
	// Return a view of R - L entries of MCCVLines starting at index L. Returns
	// None when R <= L or when L is not a valid index into MCCVLines.
	// NOTE(review): R itself is not checked against MCCVLines.size() - confirm
	// that callers never pass an R past the end.
	+ArrayRef<MCCVLineEntry> CodeViewContext::getLinesForExtent(size_t L, size_t R) {
	+ if (R <= L)
	+ return None;
	+ if (L >= MCCVLines.size())
	+ return None;
	+ return makeArrayRef(&MCCVLines[L], R - L);
	}

	// Emit a DebugSubsectionKind::Lines subsection for FuncId to OS. The
	// subsection is bracketed by two temporary labels so its length can be
	// emitted as their difference; the line entries come from
	// getFunctionLineEntries(FuncId) and are grouped into runs sharing a file
	// number.
	void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
	unsigned FuncId,
	const MCSymbol *FuncBegin,
	const MCSymbol *FuncEnd) {
	MCContext &Ctx = OS.getContext();
	MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
	*LineEnd = Ctx.createTempSymbol("linetable_end", false);

	// Subsection kind, then its length (LineEnd - LineBegin), then the section
	// relative offset and section index of FuncBegin.
	OS.EmitIntValue(unsigned(DebugSubsectionKind::Lines), 4);
	OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
	OS.EmitLabel(LineBegin);
	OS.EmitCOFFSecRel32(FuncBegin, /Offset=/0);
	OS.EmitCOFFSectionIndex(FuncBegin);

	// Actual line info.
	std::vector<MCCVLineEntry> Locs = getFunctionLineEntries(FuncId);
	// Column data is emitted only if at least one entry has a non-zero column.
	bool HaveColumns = any_of(Locs, [](const MCCVLineEntry &LineEntry) {
	return LineEntry.getColumn() != 0;
	});
	OS.EmitIntValue(HaveColumns ? int(LF_HaveColumns) : 0, 2);
	// Code size of the function: FuncEnd - FuncBegin.
	OS.emitAbsoluteSymbolDiff(FuncEnd, FuncBegin, 4);

	for (auto I = Locs.begin(), E = Locs.end(); I != E;) {
	// Emit a file segment for the run of locations that share a file id.
	unsigned CurFileNum = I->getFileNum();
	auto FileSegEnd =
	std::find_if(I, E, [CurFileNum](const MCCVLineEntry &Loc) {
	return Loc.getFileNum() != CurFileNum;
	});
	unsigned EntryCount = FileSegEnd - I;
	// NOTE(review): the subscript below selects a single element of
	// getContents(); confirm the comment prints the intended file name.
	OS.AddComment(
	"Segment for file '" +
	Twine(getStringTableFragment()
	->getContents()[Files[CurFileNum - 1].StringTableOffset]) +
	"' begins");
	OS.EmitCVFileChecksumOffsetDirective(CurFileNum);
	OS.EmitIntValue(EntryCount, 4);
	// Segment size: a fixed 12 bytes (presumably the segment header - confirm),
	// plus 8 bytes per entry for the two 4-byte values emitted below, plus
	// 4 bytes per entry for the column pair when columns are present.
	uint32_t SegmentSize = 12;
	SegmentSize += 8 * EntryCount;
	if (HaveColumns)
	SegmentSize += 4 * EntryCount;
	OS.EmitIntValue(SegmentSize, 4);

	// Per entry: code offset from FuncBegin, then the line number with the
	// statement flag OR'd in when the entry is a statement.
	for (auto J = I; J != FileSegEnd; ++J) {
	OS.emitAbsoluteSymbolDiff(J->getLabel(), FuncBegin, 4);
	unsigned LineData = J->getLine();
	if (J->isStmt())
	LineData \|= LineInfo::StatementFlag;
	OS.EmitIntValue(LineData, 4);
	}
	// Per entry: the column as a 2-byte value followed by a 2-byte zero.
	if (HaveColumns) {
	for (auto J = I; J != FileSegEnd; ++J) {
	OS.EmitIntValue(J->getColumn(), 2);
	OS.EmitIntValue(0, 2);
	}
	}
	I = FileSegEnd;
	}
	OS.EmitLabel(LineEnd);
	}

	// Append Data to Buffer using 1, 2 or 4 bytes depending on its magnitude.
	// Returns true on success; returns false and appends nothing when Data does
	// not fit in 29 bits.
	static bool compressAnnotation(uint32_t Data, SmallVectorImpl<char> &Buffer) {
	// Fits in 7 bits: a single byte.
	if (isUInt<7>(Data)) {
	Buffer.push_back(Data);
	return true;
	}

	// Fits in 14 bits: two bytes, high byte first and tagged with 0x80.
	if (isUInt<14>(Data)) {
	Buffer.push_back((Data >> 8) \| 0x80);
	Buffer.push_back(Data & 0xff);
	return true;
	}

	// Fits in 29 bits: four bytes, most significant first, first byte tagged
	// with 0xC0.
	if (isUInt<29>(Data)) {
	Buffer.push_back((Data >> 24) \| 0xC0);
	Buffer.push_back((Data >> 16) & 0xff);
	Buffer.push_back((Data >> 8) & 0xff);
	Buffer.push_back(Data & 0xff);
	return true;
	}

	return false;
	}

	// Overload for opcodes: casts Annotation to uint32_t and forwards to the
	// integer overload, returning its result.
	static bool compressAnnotation(BinaryAnnotationsOpCode Annotation,
	SmallVectorImpl<char> &Buffer) {
	return compressAnnotation(static_cast<uint32_t>(Annotation), Buffer);
	}

	// Encode Data, interpreted as a signed value via its top bit: when bit 31 is
	// set the result is the negated value shifted left by one with the low bit
	// set; otherwise it is Data shifted left by one.
	static uint32_t encodeSignedNumber(uint32_t Data) {
	if (Data >> 31)
	return ((-Data) << 1) \| 1;
	return Data << 1;
	}

	// Record an inline line table for PrimaryFunctionId by creating an
	// MCCVInlineLineTableFragment in OS's current section. No bytes are produced
	// here; the fragment's contents are encoded later.
	void CodeViewContext::emitInlineLineTableForFunction(MCObjectStreamer &OS,
	unsigned PrimaryFunctionId,
	unsigned SourceFileId,
	unsigned SourceLineNum,
	const MCSymbol *FnStartSym,
	const MCSymbol *FnEndSym) {
	// Create and insert a fragment into the current section that will be encoded
	// later.
	new MCCVInlineLineTableFragment(PrimaryFunctionId, SourceFileId,
	SourceLineNum, FnStartSym, FnEndSym,
	OS.getCurrentSectionOnly());
	}

	// Record a def-range by creating an MCCVDefRangeFragment, built from Ranges
	// and FixedSizePortion, in OS's current section. The fragment's contents are
	// encoded later.
	void CodeViewContext::emitDefRange(
	MCObjectStreamer &OS,
	ArrayRef<std::pair<const MCSymbol , const MCSymbol >> Ranges,
	StringRef FixedSizePortion) {
	// Create and insert a fragment into the current section that will be encoded
	// later.
	new MCCVDefRangeFragment(Ranges, FixedSizePortion,
	OS.getCurrentSectionOnly());
	}

	// Evaluate End - Begin as an absolute value using Layout and return it as an
	// unsigned. Asserts that the expression evaluates to a known absolute value,
	// that the result is non-negative, and that it is below UINT_MAX.
	static unsigned computeLabelDiff(MCAsmLayout &Layout, const MCSymbol *Begin,
	const MCSymbol *End) {
	MCContext &Ctx = Layout.getAssembler().getContext();
	MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
	const MCExpr *BeginRef = MCSymbolRefExpr::create(Begin, Variant, Ctx),
	*EndRef = MCSymbolRefExpr::create(End, Variant, Ctx);
	// Build the expression End - Begin.
	const MCExpr *AddrDelta =
	MCBinaryExpr::create(MCBinaryExpr::Sub, EndRef, BeginRef, Ctx);
	int64_t Result;
	bool Success = AddrDelta->evaluateKnownAbsolute(Result, Layout);
	assert(Success && "failed to evaluate label difference as absolute");
	// Success is only read by the assert above; silence unused warnings when
	// asserts are compiled out.
	(void)Success;
	assert(Result >= 0 && "negative label difference requested");
	// The bound checked is UINT_MAX, i.e. roughly 4GB, not 2GB.
	assert(Result < UINT_MAX && "label difference greater than 4GB");
	return unsigned(Result);
	}

	// Fill Frag's contents with the compressed binary annotations describing the
	// inline site Frag.SiteFuncId. The .cv_loc entries considered are those in
	// the extent of the site function widened by the extents of every function in
	// its InlinedAtMap. Returns without touching the buffer when that extent is
	// empty.
	void CodeViewContext::encodeInlineLineTable(MCAsmLayout &Layout,
	MCCVInlineLineTableFragment &Frag) {
	size_t LocBegin;
	size_t LocEnd;
	std::tie(LocBegin, LocEnd) = getLineExtent(Frag.SiteFuncId);

	// Include all child inline call sites in our .cv_loc extent.
	MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(Frag.SiteFuncId);
	for (auto &KV : SiteInfo->InlinedAtMap) {
	unsigned ChildId = KV.first;
	auto Extent = getLineExtent(ChildId);
	LocBegin = std::min(LocBegin, Extent.first);
	LocEnd = std::max(LocEnd, Extent.second);
	}

	if (LocBegin >= LocEnd)
	return;
	ArrayRef<MCCVLineEntry> Locs = getLinesForExtent(LocBegin, LocEnd);
	if (Locs.empty())
	return;

	// Make an artificial start location using the function start and the inlinee
	// lines start location information. All deltas start relative to this
	// location.
	// NOTE(review): StartLoc is not read again in this function; the deltas
	// below are tracked through LastLabel and LastSourceLoc.
	MCCVLineEntry StartLoc(Frag.getFnStartSym(), MCCVLoc(Locs.front()));
	StartLoc.setFileNum(Frag.StartFileId);
	StartLoc.setLine(Frag.StartLineNum);
	bool HaveOpenRange = false;

	// Running state: the label and source location of the last emitted update.
	const MCSymbol *LastLabel = Frag.getFnStartSym();
	MCCVFunctionInfo::LineInfo LastSourceLoc, CurSourceLoc;
	LastSourceLoc.File = Frag.StartFileId;
	LastSourceLoc.Line = Frag.StartLineNum;

	SmallVectorImpl<char> &Buffer = Frag.getContents();
	Buffer.clear(); // Clear old contents if we went through relaxation.
	for (const MCCVLineEntry &Loc : Locs) {
	// Exit early if our line table would produce an oversized InlineSiteSym
	// record. Account for the ChangeCodeLength annotation emitted after the
	// loop ends.
	constexpr uint32_t InlineSiteSize = 12;
	constexpr uint32_t AnnotationSize = 8;
	size_t MaxBufferSize = MaxRecordLength - InlineSiteSize - AnnotationSize;
	if (Buffer.size() >= MaxBufferSize)
	break;

	if (Loc.getFunctionId() == Frag.SiteFuncId) {
	// Location recorded directly for the site function: use it as is.
	CurSourceLoc.File = Loc.getFileNum();
	CurSourceLoc.Line = Loc.getLine();
	} else {
	auto I = SiteInfo->InlinedAtMap.find(Loc.getFunctionId());
	if (I != SiteInfo->InlinedAtMap.end()) {
	// This .cv_loc is from a child inline call site. Use the source
	// location of the inlined call site instead of the .cv_loc directive
	// source location.
	CurSourceLoc = I->second;
	} else {
	// We've hit a cv_loc not attributed to this inline call site. Use this
	// label to end the PC range.
	if (HaveOpenRange) {
	unsigned Length = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
	compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
	compressAnnotation(Length, Buffer);
	LastLabel = Loc.getLabel();
	}
	HaveOpenRange = false;
	continue;
	}
	}

	// Skip this .cv_loc if we have an open range and this isn't a meaningful
	// source location update. The current table format does not support column
	// info, so we can skip updates for those.
	if (HaveOpenRange && CurSourceLoc.File == LastSourceLoc.File &&
	CurSourceLoc.Line == LastSourceLoc.Line)
	continue;

	HaveOpenRange = true;

	// File changed: emit ChangeFile with the constant value of the new file's
	// checksum table offset symbol.
	if (CurSourceLoc.File != LastSourceLoc.File) {
	unsigned FileOffset = static_cast<const MCConstantExpr *>(
	Files[CurSourceLoc.File - 1]
	.ChecksumTableOffset->getVariableValue())
	->getValue();
	compressAnnotation(BinaryAnnotationsOpCode::ChangeFile, Buffer);
	compressAnnotation(FileOffset, Buffer);
	}

	// Emit the line and code deltas relative to the previous update, picking
	// the opcode form from the three cases below.
	int LineDelta = CurSourceLoc.Line - LastSourceLoc.Line;
	unsigned EncodedLineDelta = encodeSignedNumber(LineDelta);
	unsigned CodeDelta = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
	if (CodeDelta == 0 && LineDelta != 0) {
	// Same code offset, different line: only a line offset change is needed.
	compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
	compressAnnotation(EncodedLineDelta, Buffer);
	} else if (EncodedLineDelta < 0x8 && CodeDelta <= 0xf) {
	// The ChangeCodeOffsetAndLineOffset combination opcode is used when the
	// encoded line delta uses 3 or fewer set bits and the code offset fits
	// in one nibble.
	unsigned Operand = (EncodedLineDelta << 4) \| CodeDelta;
	compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffsetAndLineOffset,
	Buffer);
	compressAnnotation(Operand, Buffer);
	} else {
	// Otherwise use the separate line and code deltas.
	if (LineDelta != 0) {
	compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
	compressAnnotation(EncodedLineDelta, Buffer);
	}
	compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffset, Buffer);
	compressAnnotation(CodeDelta, Buffer);
	}

	LastLabel = Loc.getLabel();
	LastSourceLoc = CurSourceLoc;
	}

	assert(HaveOpenRange);

	// Close the final range. Its length is the smaller of the distance to the
	// function end symbol and, when the entry following the extent is in the same
	// section, the distance to that entry's label.
	unsigned EndSymLength =
	computeLabelDiff(Layout, LastLabel, Frag.getFnEndSym());
	unsigned LocAfterLength = ~0U;
	ArrayRef<MCCVLineEntry> LocAfter = getLinesForExtent(LocEnd, LocEnd + 1);
	if (!LocAfter.empty()) {
	// Only try to compute this difference if we're in the same section.
	const MCCVLineEntry &Loc = LocAfter[0];
	if (&Loc.getLabel()->getSection(false) == &LastLabel->getSection(false))
	LocAfterLength = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
	}

	compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
	compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer);
	}

	// Rebuild Frag's contents and fixups from its ranges. Consecutive ranges
	// whose combined size stays within MaxDefRange are merged into one record
	// followed by gap entries; a range larger than MaxDefRange is split into
	// several records of at most MaxDefRange each.
	void CodeViewContext::encodeDefRange(MCAsmLayout &Layout,
	MCCVDefRangeFragment &Frag) {
	MCContext &Ctx = Layout.getAssembler().getContext();
	// Start from empty contents and fixups; everything is regenerated below.
	SmallVectorImpl<char> &Contents = Frag.getContents();
	Contents.clear();
	SmallVectorImpl<MCFixup> &Fixups = Frag.getFixups();
	Fixups.clear();
	raw_svector_ostream OS(Contents);

	// Compute all the sizes up front.
	// Each element is (gap before the range, size of the range); the first
	// range has a gap of 0.
	SmallVector<std::pair<unsigned, unsigned>, 4> GapAndRangeSizes;
	const MCSymbol *LastLabel = nullptr;
	for (std::pair<const MCSymbol , const MCSymbol > Range : Frag.getRanges()) {
	unsigned GapSize =
	LastLabel ? computeLabelDiff(Layout, LastLabel, Range.first) : 0;
	unsigned RangeSize = computeLabelDiff(Layout, Range.first, Range.second);
	GapAndRangeSizes.push_back({GapSize, RangeSize});
	LastLabel = Range.second;
	}

	// Write down each range where the variable is defined.
	for (size_t I = 0, E = Frag.getRanges().size(); I != E;) {
	// If the range size of multiple consecutive ranges is under the max,
	// combine the ranges and emit some gaps.
	const MCSymbol *RangeBegin = Frag.getRanges()[I].first;
	unsigned RangeSize = GapAndRangeSizes[I].second;
	size_t J = I + 1;
	for (; J != E; ++J) {
	unsigned GapAndRangeSize = GapAndRangeSizes[J].first + GapAndRangeSizes[J].second;
	if (RangeSize + GapAndRangeSize > MaxDefRange)
	break;
	RangeSize += GapAndRangeSize;
	}
	// Ranges I..J-1 are merged, leaving one gap between each adjacent pair.
	unsigned NumGaps = J - I - 1;

	support::endian::Writer<support::little> LEWriter(OS);

	unsigned Bias = 0;
	// We must split the range into chunks of MaxDefRange, this is a fundamental
	// limitation of the file format.
	do {
	uint16_t Chunk = std::min((uint32_t)MaxDefRange, RangeSize);

	// Expression for the start of this chunk: RangeBegin + Bias.
	const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(RangeBegin, Ctx);
	const MCBinaryExpr *BE =
	MCBinaryExpr::createAdd(SRE, MCConstantExpr::create(Bias, Ctx), Ctx);
	// NOTE(review): Res is not read after this call in this function.
	MCValue Res;
	BE->evaluateAsRelocatable(Res, &Layout, /Fixup=/nullptr);

	// Each record begins with a 2-byte number indicating how large the record
	// is.
	StringRef FixedSizePortion = Frag.getFixedSizePortion();
	// Our record is a fixed sized prefix and a LocalVariableAddrRange that we
	// are artificially constructing.
	size_t RecordSize = FixedSizePortion.size() +
	sizeof(LocalVariableAddrRange) + 4 * NumGaps;
	// Write out the record size.
	LEWriter.write<uint16_t>(RecordSize);
	// Write out the fixed size prefix.
	OS << FixedSizePortion;
	// Make space for a fixup that will eventually have a section relative
	// relocation pointing at the offset where the variable becomes live.
	Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_4));
	LEWriter.write<uint32_t>(0); // Fixup for code start.
	// Make space for a fixup that will record the section index for the code.
	Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_2));
	LEWriter.write<uint16_t>(0); // Fixup for section index.
	// Write down the range's extent.
	LEWriter.write<uint16_t>(Chunk);

	// Move on to the next range.
	Bias += Chunk;
	RangeSize -= Chunk;
	} while (RangeSize > 0);

	// Emit the gaps afterwards.
	assert((NumGaps == 0 \|\| Bias <= MaxDefRange) &&
	"large ranges should not have gaps");
	// Each gap is written as (offset of the gap from the record start, gap
	// size); the offset then advances past the gap and the range after it.
	// This loop also advances I to J, the first range of the next record.
	unsigned GapStartOffset = GapAndRangeSizes[I].second;
	for (++I; I != J; ++I) {
	unsigned GapSize, RangeSize;
	assert(I < GapAndRangeSizes.size());
	std::tie(GapSize, RangeSize) = GapAndRangeSizes[I];
	LEWriter.write<uint16_t>(GapStartOffset);
	LEWriter.write<uint16_t>(GapSize);
	GapStartOffset += GapSize + RangeSize;
	}
	}
	}

	//
	// This is called when an instruction is assembled into the specified section.
	// If there is information from the last .cv_loc directive that has not yet had
	// a line entry made for it, one is made.
	//
	void MCCVLineEntry::Make(MCObjectStreamer *MCOS) {
	// Nothing to do unless a .cv_loc has been seen and not yet consumed.
	CodeViewContext &CVC = MCOS->getContext().getCVContext();
	if (!CVC.getCVLocSeen())
	return;

	// Create a symbol in the current section for use in the line entry.
	MCSymbol *LineSym = MCOS->getContext().createTempSymbol();
	// Set the value of the symbol to use for the MCCVLineEntry.
	MCOS->EmitLabel(LineSym);

	// Get the current .cv_loc info saved in the context.
	const MCCVLoc &CVLoc = CVC.getCurrentCVLoc();

	// Create a (local) line entry with the symbol and the current .cv_loc info.
	MCCVLineEntry LineEntry(LineSym, CVLoc);

	// Clear CVLocSeen to mark the current .cv_loc info as used.
	CVC.clearCVLocSeen();

	// Hand the line entry to the CodeView context, which records it.
	CVC.addLineEntry(LineEntry);
	}
	Index: vendor/llvm/dist-release_60/lib/Target/AArch64/AArch64InstructionSelector.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/AArch64/AArch64InstructionSelector.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/AArch64/AArch64InstructionSelector.cpp (revision 328362)
	@@ -1,1532 +1,1566 @@
	//===- AArch64InstructionSelector.cpp ----------------------------- C++ --==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	/// This file implements the targeting of the InstructionSelector class for
	/// AArch64.
	/// \todo This should be generated by TableGen.
	//===----------------------------------------------------------------------===//

	#include "AArch64InstrInfo.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64RegisterBankInfo.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64Subtarget.h"
	#include "AArch64TargetMachine.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
	#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
	#include "llvm/CodeGen/GlobalISel/Utils.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"

	#define DEBUG_TYPE "aarch64-isel"

	using namespace llvm;

	namespace {

	#define GET_GLOBALISEL_PREDICATE_BITSET
	#include "AArch64GenGlobalISel.inc"
	#undef GET_GLOBALISEL_PREDICATE_BITSET

	// InstructionSelector implementation for AArch64. Combines the
	// tblgen-generated selectImpl() with hand-written selection helpers and
	// keeps references to the target machine, subtarget, instruction info,
	// register info and register bank info it works with.
	class AArch64InstructionSelector : public InstructionSelector {
	public:
	AArch64InstructionSelector(const AArch64TargetMachine &TM,
	const AArch64Subtarget &STI,
	const AArch64RegisterBankInfo &RBI);

	bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
	static const char *getName() { return DEBUG_TYPE; }

	private:
	/// tblgen-erated 'select' implementation, used as the initial selector for
	/// the patterns that don't require complex C++.
	bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

	// Hand-written selection of va_start, one variant per calling convention.
	bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
	MachineRegisterInfo &MRI) const;
	bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
	MachineRegisterInfo &MRI) const;

	bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
	MachineRegisterInfo &MRI) const;

	ComplexRendererFns selectArithImmed(MachineOperand &Root) const;

	ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
	unsigned Size) const;

	// Fixed-width wrappers around selectAddrModeUnscaled: the Size passed is
	// the width in the name divided by 8 (1, 2, 4, 8 and 16).
	ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
	return selectAddrModeUnscaled(Root, 1);
	}
	ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
	return selectAddrModeUnscaled(Root, 2);
	}
	ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
	return selectAddrModeUnscaled(Root, 4);
	}
	ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
	return selectAddrModeUnscaled(Root, 8);
	}
	ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
	return selectAddrModeUnscaled(Root, 16);
	}

	ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
	unsigned Size) const;
	// Compile-time width variant: forwards with Size = Width / 8.
	template <int Width>
	ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
	return selectAddrModeIndexed(Root, Width / 8);
	}

	// References bound in the constructor.
	const AArch64TargetMachine &TM;
	const AArch64Subtarget &STI;
	const AArch64InstrInfo &TII;
	const AArch64RegisterInfo &TRI;
	const AArch64RegisterBankInfo &RBI;

	#define GET_GLOBALISEL_PREDICATES_DECL
	#include "AArch64GenGlobalISel.inc"
	#undef GET_GLOBALISEL_PREDICATES_DECL

	// We declare the temporaries used by selectImpl() in the class to minimize the
	// cost of constructing placeholder values.
	#define GET_GLOBALISEL_TEMPORARIES_DECL
	#include "AArch64GenGlobalISel.inc"
	#undef GET_GLOBALISEL_TEMPORARIES_DECL
	};

	} // end anonymous namespace

	#define GET_GLOBALISEL_IMPL
	#include "AArch64GenGlobalISel.inc"
	#undef GET_GLOBALISEL_IMPL

	// Bind the target machine, subtarget and register bank info references, and
	// take the instruction info and register info from STI. The remaining member
	// initializers are supplied by the PREDICATES_INIT and TEMPORARIES_INIT
	// sections of the generated AArch64GenGlobalISel.inc.
	AArch64InstructionSelector::AArch64InstructionSelector(
	const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
	const AArch64RegisterBankInfo &RBI)
	: InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
	TRI(*STI.getRegisterInfo()), RBI(RBI),
	#define GET_GLOBALISEL_PREDICATES_INIT
	#include "AArch64GenGlobalISel.inc"
	#undef GET_GLOBALISEL_PREDICATES_INIT
	#define GET_GLOBALISEL_TEMPORARIES_INIT
	#include "AArch64GenGlobalISel.inc"
	#undef GET_GLOBALISEL_TEMPORARIES_INIT
	{
	}

	// FIXME: This should be target-independent, inferred from the types declared
	// for each class in the bank.
	// Return the register class for a value of type Ty on bank RB, or nullptr
	// when the bank/size combination is not handled. The RBI parameter is not
	// used by the body.
	static const TargetRegisterClass *
	getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
	const RegisterBankInfo &RBI) {
	// GPR bank: anything up to 32 bits uses GPR32, exactly 64 bits uses GPR64.
	if (RB.getID() == AArch64::GPRRegBankID) {
	if (Ty.getSizeInBits() <= 32)
	return &AArch64::GPR32RegClass;
	if (Ty.getSizeInBits() == 64)
	return &AArch64::GPR64RegClass;
	return nullptr;
	}

	// FPR bank: only exact sizes of 32, 64 and 128 bits are handled.
	if (RB.getID() == AArch64::FPRRegBankID) {
	if (Ty.getSizeInBits() == 32)
	return &AArch64::FPR32RegClass;
	if (Ty.getSizeInBits() == 64)
	return &AArch64::FPR64RegClass;
	if (Ty.getSizeInBits() == 128)
	return &AArch64::FPR128RegClass;
	return nullptr;
	}

	return nullptr;
	}

	/// Check whether \p I is a currently unsupported binary operation:
	/// - it has an unsized type
	/// - an operand is not a vreg
	/// - all operands are not in the same bank
	/// These are checks that should someday live in the verifier, but right now,
	/// these are mostly limitations of the aarch64 selector.
	static bool unsupportedBinOp(const MachineInstr &I,
	const AArch64RegisterBankInfo &RBI,
	const MachineRegisterInfo &MRI,
	const AArch64RegisterInfo &TRI) {
	// The type of operand 0 must be valid.
	LLT Ty = MRI.getType(I.getOperand(0).getReg());
	if (!Ty.isValid()) {
	DEBUG(dbgs() << "Generic binop register should be typed\n");
	return true;
	}

	// Bank of the previously visited operand, used to check that every operand
	// is on the same bank.
	const RegisterBank *PrevOpBank = nullptr;
	for (auto &MO : I.operands()) {
	// FIXME: Support non-register operands.
	if (!MO.isReg()) {
	DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
	return true;
	}

	// FIXME: Can generic operations have physical registers operands? If
	// so, this will need to be taught about that, and we'll need to get the
	// bank out of the minimal class for the register.
	// Either way, this needs to be documented (and possibly verified).
	if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
	DEBUG(dbgs() << "Generic inst has physical register operand\n");
	return true;
	}

	const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
	if (!OpBank) {
	DEBUG(dbgs() << "Generic register has no bank or class\n");
	return true;
	}

	if (PrevOpBank && OpBank != PrevOpBank) {
	DEBUG(dbgs() << "Generic inst operands have different banks\n");
	return true;
	}
	PrevOpBank = OpBank;
	}
	// Every check passed: the operation is supported.
	return false;
	}

	/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
	/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
	/// and of size \p OpSize.
	/// \returns \p GenericOpc if the combination is unsupported.
	static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
	unsigned OpSize) {
	switch (RegBankID) {
	case AArch64::GPRRegBankID:
	// GPR bank: only the variable shifts (and, for 64 bits, G_GEP) are mapped
	// here. Sizes other than 32 and 64 fall through to the final return.
	if (OpSize == 32) {
	switch (GenericOpc) {
	case TargetOpcode::G_SHL:
	return AArch64::LSLVWr;
	case TargetOpcode::G_LSHR:
	return AArch64::LSRVWr;
	case TargetOpcode::G_ASHR:
	return AArch64::ASRVWr;
	default:
	return GenericOpc;
	}
	} else if (OpSize == 64) {
	switch (GenericOpc) {
	case TargetOpcode::G_GEP:
	return AArch64::ADDXrr;
	case TargetOpcode::G_SHL:
	return AArch64::LSLVXr;
	case TargetOpcode::G_LSHR:
	return AArch64::LSRVXr;
	case TargetOpcode::G_ASHR:
	return AArch64::ASRVXr;
	default:
	return GenericOpc;
	}
	}
	break;
	case AArch64::FPRRegBankID:
	// FPR bank: the four basic FP arithmetic ops at 32 and 64 bits, plus G_OR
	// at 64 bits.
	switch (OpSize) {
	case 32:
	switch (GenericOpc) {
	case TargetOpcode::G_FADD:
	return AArch64::FADDSrr;
	case TargetOpcode::G_FSUB:
	return AArch64::FSUBSrr;
	case TargetOpcode::G_FMUL:
	return AArch64::FMULSrr;
	case TargetOpcode::G_FDIV:
	return AArch64::FDIVSrr;
	default:
	return GenericOpc;
	}
	case 64:
	switch (GenericOpc) {
	case TargetOpcode::G_FADD:
	return AArch64::FADDDrr;
	case TargetOpcode::G_FSUB:
	return AArch64::FSUBDrr;
	case TargetOpcode::G_FMUL:
	return AArch64::FMULDrr;
	case TargetOpcode::G_FDIV:
	return AArch64::FDIVDrr;
	case TargetOpcode::G_OR:
	return AArch64::ORRv8i8;
	default:
	return GenericOpc;
	}
	}
	break;
	}
	// Unhandled bank or size: report the combination as unsupported.
	return GenericOpc;
	}

	/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
	/// appropriate for the (value) register bank \p RegBankID and of memory access
	/// size \p OpSize. This returns the variant with the base+unsigned-immediate
	/// addressing mode (e.g., LDRXui).
	/// \returns \p GenericOpc if the combination is unsupported.
	static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
	unsigned OpSize) {
	// Anything other than G_STORE is treated as a load.
	const bool isStore = GenericOpc == TargetOpcode::G_STORE;
	switch (RegBankID) {
	case AArch64::GPRRegBankID:
	// GPR bank: 8, 16, 32 and 64 bit accesses.
	switch (OpSize) {
	case 8:
	return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
	case 16:
	return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
	case 32:
	return isStore ? AArch64::STRWui : AArch64::LDRWui;
	case 64:
	return isStore ? AArch64::STRXui : AArch64::LDRXui;
	}
	break;
	case AArch64::FPRRegBankID:
	// FPR bank: 8, 16, 32 and 64 bit accesses.
	switch (OpSize) {
	case 8:
	return isStore ? AArch64::STRBui : AArch64::LDRBui;
	case 16:
	return isStore ? AArch64::STRHui : AArch64::LDRHui;
	case 32:
	return isStore ? AArch64::STRSui : AArch64::LDRSui;
	case 64:
	return isStore ? AArch64::STRDui : AArch64::LDRDui;
	}
	break;
	}
	// Unhandled bank or size: report the combination as unsupported.
	return GenericOpc;
	}

	// Turn I into a target COPY. The destination register (operand 0) is
	// constrained to a register class chosen from its bank and size. Returns
	// true at once, leaving I untouched, when the destination is a physical
	// register. Returns false when an FPR destination is wider than 128 bits or
	// when constraining the destination fails.
	static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
	MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
	const RegisterBankInfo &RBI) {

	unsigned DstReg = I.getOperand(0).getReg();
	if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
	assert(I.isCopy() && "Generic operators do not allow physical registers");
	return true;
	}

	const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
	const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
	unsigned SrcReg = I.getOperand(1).getReg();
	const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
	// SrcSize is only read by the asserts below.
	(void)SrcSize;
	assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) \|\| I.isCopy()) &&
	"No phys reg on generic operators");
	assert(
	(DstSize == SrcSize \|\|
	// Copies are a mean to setup initial types, the number of
	// bits may not exactly match.
	(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
	DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) \|\|
	// Copies are a mean to copy bits around, as long as we are
	// on the same register class, that's fine. Otherwise, that
	// means we need some SUBREG_TO_REG or AND & co.
	(((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
	"Copy with different width?!");
	assert((DstSize <= 64 \|\| RegBank.getID() == AArch64::FPRRegBankID) &&
	"GPRs cannot get more than 64-bit width values");
	const TargetRegisterClass *RC = nullptr;

	// Pick the smallest class of the destination's bank that holds DstSize bits.
	if (RegBank.getID() == AArch64::FPRRegBankID) {
	if (DstSize <= 16)
	RC = &AArch64::FPR16RegClass;
	else if (DstSize <= 32)
	RC = &AArch64::FPR32RegClass;
	else if (DstSize <= 64)
	RC = &AArch64::FPR64RegClass;
	else if (DstSize <= 128)
	RC = &AArch64::FPR128RegClass;
	else {
	DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
	return false;
	}
	} else {
	assert(RegBank.getID() == AArch64::GPRRegBankID &&
	"Bitcast for the flags?");
	RC =
	DstSize <= 32 ? &AArch64::GPR32allRegClass : &AArch64::GPR64allRegClass;
	}

	// No need to constrain SrcReg. It will get constrained when
	// we hit another of its use or its defs.
	// Copies do not have constraints.
	if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
	DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
	<< " operand\n");
	return false;
	}
	// Rewrite the instruction in place as a COPY.
	I.setDesc(TII.get(AArch64::COPY));
	return true;
	}

	// Map an int<->FP conversion (G_SITOFP, G_UITOFP, G_FPTOSI, G_FPTOUI) to an
	// AArch64 opcode based on the destination and source sizes (32 or 64 bits
	// each). Returns GenericOpc when either type is not scalar or the
	// opcode/size combination is not handled.
	static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
	if (!DstTy.isScalar() \|\| !SrcTy.isScalar())
	return GenericOpc;

	const unsigned DstSize = DstTy.getSizeInBits();
	const unsigned SrcSize = SrcTy.getSizeInBits();

	// Outer switch: destination size; inner switch: source size.
	switch (DstSize) {
	case 32:
	switch (SrcSize) {
	case 32:
	switch (GenericOpc) {
	case TargetOpcode::G_SITOFP:
	return AArch64::SCVTFUWSri;
	case TargetOpcode::G_UITOFP:
	return AArch64::UCVTFUWSri;
	case TargetOpcode::G_FPTOSI:
	return AArch64::FCVTZSUWSr;
	case TargetOpcode::G_FPTOUI:
	return AArch64::FCVTZUUWSr;
	default:
	return GenericOpc;
	}
	case 64:
	switch (GenericOpc) {
	case TargetOpcode::G_SITOFP:
	return AArch64::SCVTFUXSri;
	case TargetOpcode::G_UITOFP:
	return AArch64::UCVTFUXSri;
	case TargetOpcode::G_FPTOSI:
	return AArch64::FCVTZSUWDr;
	case TargetOpcode::G_FPTOUI:
	return AArch64::FCVTZUUWDr;
	default:
	return GenericOpc;
	}
	default:
	return GenericOpc;
	}
	case 64:
	switch (SrcSize) {
	case 32:
	switch (GenericOpc) {
	case TargetOpcode::G_SITOFP:
	return AArch64::SCVTFUWDri;
	case TargetOpcode::G_UITOFP:
	return AArch64::UCVTFUWDri;
	case TargetOpcode::G_FPTOSI:
	return AArch64::FCVTZSUXSr;
	case TargetOpcode::G_FPTOUI:
	return AArch64::FCVTZUUXSr;
	default:
	return GenericOpc;
	}
	case 64:
	switch (GenericOpc) {
	case TargetOpcode::G_SITOFP:
	return AArch64::SCVTFUXDri;
	case TargetOpcode::G_UITOFP:
	return AArch64::UCVTFUXDri;
	case TargetOpcode::G_FPTOSI:
	return AArch64::FCVTZSUXDr;
	case TargetOpcode::G_FPTOUI:
	return AArch64::FCVTZUUXDr;
	default:
	return GenericOpc;
	}
	default:
	return GenericOpc;
	}
	default:
	return GenericOpc;
	};
	// Every path of the switch above returns, so this is a fallback only.
	return GenericOpc;
	}

	// Translate an integer compare predicate into the matching AArch64 condition
	// code. Any predicate not listed hits llvm_unreachable.
	static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
	switch (P) {
	default:
	llvm_unreachable("Unknown condition code!");
	case CmpInst::ICMP_NE:
	return AArch64CC::NE;
	case CmpInst::ICMP_EQ:
	return AArch64CC::EQ;
	// Signed comparisons.
	case CmpInst::ICMP_SGT:
	return AArch64CC::GT;
	case CmpInst::ICMP_SGE:
	return AArch64CC::GE;
	case CmpInst::ICMP_SLT:
	return AArch64CC::LT;
	case CmpInst::ICMP_SLE:
	return AArch64CC::LE;
	// Unsigned comparisons.
	case CmpInst::ICMP_UGT:
	return AArch64CC::HI;
	case CmpInst::ICMP_UGE:
	return AArch64CC::HS;
	case CmpInst::ICMP_ULT:
	return AArch64CC::LO;
	case CmpInst::ICMP_ULE:
	return AArch64CC::LS;
	}
	}

	// Translate a floating-point compare predicate into AArch64 condition codes.
	// CondCode always receives a condition. CondCode2 is AArch64CC::AL unless the
	// predicate sets a second condition (FCMP_ONE and FCMP_UEQ below). Any
	// predicate not listed hits llvm_unreachable.
	static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
	AArch64CC::CondCode &CondCode,
	AArch64CC::CondCode &CondCode2) {
	// Default: no second condition.
	CondCode2 = AArch64CC::AL;
	switch (P) {
	default:
	llvm_unreachable("Unknown FP condition!");
	case CmpInst::FCMP_OEQ:
	CondCode = AArch64CC::EQ;
	break;
	case CmpInst::FCMP_OGT:
	CondCode = AArch64CC::GT;
	break;
	case CmpInst::FCMP_OGE:
	CondCode = AArch64CC::GE;
	break;
	case CmpInst::FCMP_OLT:
	CondCode = AArch64CC::MI;
	break;
	case CmpInst::FCMP_OLE:
	CondCode = AArch64CC::LS;
	break;
	case CmpInst::FCMP_ONE:
	// Two conditions: MI and GT.
	CondCode = AArch64CC::MI;
	CondCode2 = AArch64CC::GT;
	break;
	case CmpInst::FCMP_ORD:
	CondCode = AArch64CC::VC;
	break;
	case CmpInst::FCMP_UNO:
	CondCode = AArch64CC::VS;
	break;
	case CmpInst::FCMP_UEQ:
	// Two conditions: EQ and VS.
	CondCode = AArch64CC::EQ;
	CondCode2 = AArch64CC::VS;
	break;
	case CmpInst::FCMP_UGT:
	CondCode = AArch64CC::HI;
	break;
	case CmpInst::FCMP_UGE:
	CondCode = AArch64CC::PL;
	break;
	case CmpInst::FCMP_ULT:
	CondCode = AArch64CC::LT;
	break;
	case CmpInst::FCMP_ULE:
	CondCode = AArch64CC::LE;
	break;
	case CmpInst::FCMP_UNE:
	CondCode = AArch64CC::NE;
	break;
	}
	}

	// Try to select the conditional branch I as a compare-and-branch (CBZ/CBNZ).
	// This succeeds only when the condition comes from a G_ICMP eq/ne, optionally
	// behind one G_TRUNC, that compares a GPR-bank value against the constant 0.
	// Returns false, without building or erasing any instruction, when the
	// pattern does not match. On success I is erased.
	bool AArch64InstructionSelector::selectCompareBranch(
	MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {

	const unsigned CondReg = I.getOperand(0).getReg();
	MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
	// Find the instruction defining the condition, looking through one G_TRUNC.
	MachineInstr *CCMI = MRI.getVRegDef(CondReg);
	if (CCMI->getOpcode() == TargetOpcode::G_TRUNC)
	CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg());
	if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
	return false;

	// Put the constant operand, if there is one, on the right-hand side.
	unsigned LHS = CCMI->getOperand(2).getReg();
	unsigned RHS = CCMI->getOperand(3).getReg();
	if (!getConstantVRegVal(RHS, MRI))
	std::swap(RHS, LHS);

	// Only comparisons against zero are handled.
	const auto RHSImm = getConstantVRegVal(RHS, MRI);
	if (!RHSImm \|\| *RHSImm != 0)
	return false;

	const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
	if (RB.getID() != AArch64::GPRRegBankID)
	return false;

	// Only equality and inequality are handled.
	const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
	if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
	return false;

	// Choose the W form for widths up to 32 bits and the X form for 64 bits;
	// EQ selects CBZ and NE selects CBNZ.
	const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
	unsigned CBOpc = 0;
	if (CmpWidth <= 32)
	CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
	else if (CmpWidth == 64)
	CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
	else
	return false;

	// Insert the new branch before I, then remove I.
	auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
	.addUse(LHS)
	.addMBB(DestMBB);

	constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
	I.eraseFromParent();
	return true;
	}

	/// Select G_VASTART for the AAPCS variadic convention.
	///
	/// Currently a stub: it does not inspect any of its arguments and always
	/// returns false, i.e. selection of \p I fails.
	bool AArch64InstructionSelector::selectVaStartAAPCS(
	MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
	return false;
	}

	/// Select G_VASTART for Darwin targets.
	///
	/// Emits two instructions before \p I and then erases \p I:
	///  1. an ADDXri that forms the address of the frame index returned by
	///     AArch64FunctionInfo::getVarArgsStackIndex() in a new GPR64 vreg;
	///  2. an STRXui that stores that address through the register in operand 0
	///     of \p I (offset 0), reusing the first memory operand of \p I.
	///
	/// \returns true unconditionally.
	bool AArch64InstructionSelector::selectVaStartDarwin(
	MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	unsigned ListReg = I.getOperand(0).getReg();

	unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);

	// ArgsAddrReg = <varargs stack frame index> + #0 (no shift).
	auto MIB =
	BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
	.addDef(ArgsAddrReg)
	.addFrameIndex(FuncInfo->getVarArgsStackIndex())
	.addImm(0)
	.addImm(0);

	constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

	// Store the computed address at [ListReg, #0].
	// NOTE(review): assumes I carries at least one memory operand; the
	// iterator is dereferenced without an emptiness check.
	MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
	.addUse(ArgsAddrReg)
	.addUse(ListReg)
	.addImm(0)
	.addMemOperand(*I.memoperands_begin());

	constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
	I.eraseFromParent();
	return true;
	}

	/// Select target instruction(s) for \p I.
	///
	/// Non-generic opcodes and G_PHI are handled first: LOAD_STACK_GUARD, PHI /
	/// G_PHI and copies get their registers constrained, and every other
	/// non-generic instruction is accepted unchanged. Generic instructions are
	/// then offered to selectImpl(); whatever it rejects falls through to the
	/// hand-written per-opcode switch below.
	///
	/// \param I            the instruction to select; must be inside a basic
	///                     block that is inside a function (asserted).
	/// \param CoverageInfo forwarded to selectImpl().
	/// \returns true if \p I was selected (possibly by replacing it with new
	/// instructions and erasing it), false if it could not be selected.
	bool AArch64InstructionSelector::select(MachineInstr &I,
	CodeGenCoverage &CoverageInfo) const {
	assert(I.getParent() && "Instruction should be in a basic block!");
	assert(I.getParent()->getParent() && "Instruction should be in a function!");

	MachineBasicBlock &MBB = *I.getParent();
	MachineFunction &MF = *MBB.getParent();
	MachineRegisterInfo &MRI = MF.getRegInfo();

	unsigned Opcode = I.getOpcode();
	// G_PHI requires same handling as PHI
	if (!isPreISelGenericOpcode(Opcode) \|\| Opcode == TargetOpcode::G_PHI) {
	// Certain non-generic instructions also need some special handling.

	if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);

	if (Opcode == TargetOpcode::PHI \|\| Opcode == TargetOpcode::G_PHI) {
	const unsigned DefReg = I.getOperand(0).getReg();
	const LLT DefTy = MRI.getType(DefReg);

	// Determine the register class for the PHI result: directly for a
	// physical register or an already-assigned class, otherwise derived from
	// the register's type and bank.
	const TargetRegisterClass *DefRC = nullptr;
	if (TargetRegisterInfo::isPhysicalRegister(DefReg)) {
	DefRC = TRI.getRegClass(DefReg);
	} else {
	const RegClassOrRegBank &RegClassOrBank =
	MRI.getRegClassOrRegBank(DefReg);

	DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
	if (!DefRC) {
	if (!DefTy.isValid()) {
	DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
	return false;
	}
	const RegisterBank &RB = RegClassOrBank.get<const RegisterBank >();
	DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
	if (!DefRC) {
	DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
	return false;
	}
	}
	}
	I.setDesc(TII.get(TargetOpcode::PHI));

	return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
	}

	if (I.isCopy())
	return selectCopy(I, TII, MRI, TRI, RBI);

	return true;
	}


	if (I.getNumOperands() != I.getNumExplicitOperands()) {
	DEBUG(dbgs() << "Generic instruction has unexpected implicit operands\n");
	return false;
	}

	if (selectImpl(I, CoverageInfo))
	return true;

	// Type of operand 0 when it is a register; a default-constructed LLT
	// otherwise. Most cases below use it as the result type.
	LLT Ty =
	I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};

	switch (Opcode) {
	case TargetOpcode::G_BRCOND: {
	if (Ty.getSizeInBits() > 32) {
	// We shouldn't need this on AArch64, but it would be implemented as an
	// EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
	// bit being tested is < 32.
	DEBUG(dbgs() << "G_BRCOND has type: " << Ty
	<< ", expected at most 32-bits");
	return false;
	}

	const unsigned CondReg = I.getOperand(0).getReg();
	MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();

	// Prefer a fused compare-and-branch; otherwise fall back to a TBNZW on
	// the condition register with immediate 0.
	if (selectCompareBranch(I, MF, MRI))
	return true;

	auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
	.addUse(CondReg)
	.addImm(/bit offset=/0)
	.addMBB(DestMBB);

	I.eraseFromParent();
	return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
	}

	case TargetOpcode::G_BRINDIRECT: {
	I.setDesc(TII.get(AArch64::BR));
	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}

	case TargetOpcode::G_FCONSTANT:
	case TargetOpcode::G_CONSTANT: {
	const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;

	const LLT s32 = LLT::scalar(32);
	const LLT s64 = LLT::scalar(64);
	const LLT p0 = LLT::pointer(0, 64);

	const unsigned DefReg = I.getOperand(0).getReg();
	const LLT DefTy = MRI.getType(DefReg);
	const unsigned DefSize = DefTy.getSizeInBits();
	const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);

	// FIXME: Redundant check, but even less readable when factored out.
	if (isFP) {
	if (Ty != s32 && Ty != s64) {
	DEBUG(dbgs() << "Unable to materialize FP " << Ty
	<< " constant, expected: " << s32 << " or " << s64
	<< '\n');
	return false;
	}

	if (RB.getID() != AArch64::FPRRegBankID) {
	DEBUG(dbgs() << "Unable to materialize FP " << Ty
	<< " constant on bank: " << RB << ", expected: FPR\n");
	return false;
	}

	// The case when we have 0.0 is covered by tablegen. Reject it here so we
	// can be sure tablegen works correctly and isn't rescued by this code.
	if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0))
	return false;
	} else {
	// s32 and s64 are covered by tablegen.
	if (Ty != p0) {
	DEBUG(dbgs() << "Unable to materialize integer " << Ty
	<< " constant, expected: " << s32 << ", " << s64 << ", or "
	<< p0 << '\n');
	return false;
	}

	if (RB.getID() != AArch64::GPRRegBankID) {
	DEBUG(dbgs() << "Unable to materialize integer " << Ty
	<< " constant on bank: " << RB << ", expected: GPR\n");
	return false;
	}
	}

	const unsigned MovOpc =
	DefSize == 32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;

	I.setDesc(TII.get(MovOpc));

	if (isFP) {
	// FP constants are materialized by the MOV into a fresh GPR vreg and then
	// copied into the original def register, which is constrained to FPR.
	const TargetRegisterClass &GPRRC =
	DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
	const TargetRegisterClass &FPRRC =
	DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;

	const unsigned DefGPRReg = MRI.createVirtualRegister(&GPRRC);
	MachineOperand &RegOp = I.getOperand(0);
	RegOp.setReg(DefGPRReg);

	BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
	TII.get(AArch64::COPY))
	.addDef(DefReg)
	.addUse(DefGPRReg);

	if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
	DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
	return false;
	}

	MachineOperand &ImmOp = I.getOperand(1);
	// FIXME: Is going through int64_t always correct?
	ImmOp.ChangeToImmediate(
	ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
	} else if (I.getOperand(1).isCImm()) {
	uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
	I.getOperand(1).ChangeToImmediate(Val);
	} else if (I.getOperand(1).isImm()) {
	uint64_t Val = I.getOperand(1).getImm();
	I.getOperand(1).ChangeToImmediate(Val);
	}

	constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	return true;
	}
	case TargetOpcode::G_EXTRACT: {
	LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
	// Larger extracts are vectors, same-size extracts should be something else
	// by now (either split up or simplified to a COPY).
	if (SrcTy.getSizeInBits() > 64 \|\| Ty.getSizeInBits() > 32)
	return false;

	// Turn I into a UBFMXri: the existing immediate (operand 2) is kept and
	// "offset + width - 1" is appended as the second immediate.
	I.setDesc(TII.get(AArch64::UBFMXri));
	MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
	Ty.getSizeInBits() - 1);

	// The UBFMXri defines a new 64-bit vreg; the original destination receives
	// its sub_32 part through a COPY placed right after I.
	unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
	BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
	TII.get(AArch64::COPY))
	.addDef(I.getOperand(0).getReg())
	.addUse(DstReg, 0, AArch64::sub_32);
	RBI.constrainGenericRegister(I.getOperand(0).getReg(),
	AArch64::GPR32RegClass, MRI);
	I.getOperand(0).setReg(DstReg);

	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}

	case TargetOpcode::G_INSERT: {
	LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
	// Larger inserts are vectors, same-size ones should be something else by
	// now (split up or turned into COPYs).
	if (Ty.getSizeInBits() > 64 \|\| SrcTy.getSizeInBits() > 32)
	return false;

	// Turn I into a BFMXri with immediates "(64 - LSB) % 64" and "Width - 1".
	I.setDesc(TII.get(AArch64::BFMXri));
	unsigned LSB = I.getOperand(3).getImm();
	unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
	I.getOperand(3).setImm((64 - LSB) % 64);
	MachineInstrBuilder(MF, I).addImm(Width - 1);

	// The inserted value is widened to a 64-bit vreg with SUBREG_TO_REG
	// (placed before I) and that vreg replaces operand 2.
	unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
	BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
	TII.get(AArch64::SUBREG_TO_REG))
	.addDef(SrcReg)
	.addImm(0)
	.addUse(I.getOperand(2).getReg())
	.addImm(AArch64::sub_32);
	RBI.constrainGenericRegister(I.getOperand(2).getReg(),
	AArch64::GPR32RegClass, MRI);
	I.getOperand(2).setReg(SrcReg);

	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}
	case TargetOpcode::G_FRAME_INDEX: {
	// allocas and G_FRAME_INDEX are only supported in addrspace(0).
	if (Ty != LLT::pointer(0, 64)) {
	DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
	<< ", expected: " << LLT::pointer(0, 64) << '\n');
	return false;
	}
	I.setDesc(TII.get(AArch64::ADDXri));

	// MOs for a #0 shifted immediate.
	I.addOperand(MachineOperand::CreateImm(0));
	I.addOperand(MachineOperand::CreateImm(0));

	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}

	case TargetOpcode::G_GLOBAL_VALUE: {
	auto GV = I.getOperand(1).getGlobal();
	if (GV->isThreadLocal()) {
	// FIXME: we don't support TLS yet.
	return false;
	}
	// Three strategies, chosen in order: a GOT load when the reference is
	// classified MO_GOT, a MOVZ + 3x MOVK sequence under the large code model,
	// and MOVaddr (page + page offset) otherwise.
	unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM);
	if (OpFlags & AArch64II::MO_GOT) {
	I.setDesc(TII.get(AArch64::LOADgot));
	I.getOperand(1).setTargetFlags(OpFlags);
	+ } else if (TM.getCodeModel() == CodeModel::Large) {
	+ // Materialize the global using movz/movk instructions.
	+ unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
	+ auto InsertPt = std::next(I.getIterator());
	+ auto MovZ =
	+ BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi))
	+ .addDef(MovZDstReg);
	+ MovZ->addOperand(MF, I.getOperand(1));
	+ MovZ->getOperand(1).setTargetFlags(OpFlags \| AArch64II::MO_G0 \|
	+ AArch64II::MO_NC);
	+ MovZ->addOperand(MF, MachineOperand::CreateImm(0));
	+ constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
	+
	+ auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags,
	+ unsigned Offset, unsigned ForceDstReg) {
	+ unsigned DstReg =
	+ ForceDstReg ? ForceDstReg
	+ : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
	+ auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(),
	+ TII.get(AArch64::MOVKXi))
	+ .addDef(DstReg)
	+ .addReg(SrcReg);
	+ MovI->addOperand(MF, MachineOperand::CreateGA(
	+ GV, MovZ->getOperand(1).getOffset(), Flags));
	+ MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
	+ constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
	+ return DstReg;
	+ };
	+ unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(),
	+ AArch64II::MO_G1 \| AArch64II::MO_NC, 16, 0);
	+ DstReg = BuildMovK(DstReg, AArch64II::MO_G2 \| AArch64II::MO_NC, 32, 0);
	+ BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
	+ I.eraseFromParent();
	+ return true;
	} else {
	I.setDesc(TII.get(AArch64::MOVaddr));
	I.getOperand(1).setTargetFlags(OpFlags \| AArch64II::MO_PAGE);
	MachineInstrBuilder MIB(MF, I);
	MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
	OpFlags \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);
	}
	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}

	case TargetOpcode::G_LOAD:
	case TargetOpcode::G_STORE: {
	LLT MemTy = Ty;
	LLT PtrTy = MRI.getType(I.getOperand(1).getReg());

	if (PtrTy != LLT::pointer(0, 64)) {
	DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
	<< ", expected: " << LLT::pointer(0, 64) << '\n');
	return false;
	}

	// NOTE(review): assumes I carries at least one memory operand; the
	// iterator is dereferenced without an emptiness check.
	auto &MemOp = **I.memoperands_begin();
	if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
	DEBUG(dbgs() << "Atomic load/store not supported yet\n");
	return false;
	}

	const unsigned PtrReg = I.getOperand(1).getReg();
	#ifndef NDEBUG
	const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
	// Sanity-check the pointer register.
	assert(PtrRB.getID() == AArch64::GPRRegBankID &&
	"Load/Store pointer operand isn't a GPR");
	assert(MRI.getType(PtrReg).isPointer() &&
	"Load/Store pointer operand isn't a pointer");
	#endif

	const unsigned ValReg = I.getOperand(0).getReg();
	const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);

	// selectLoadStoreUIOp hands back the generic opcode unchanged when it has
	// no mapping; treat that as a selection failure.
	const unsigned NewOpc =
	selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemTy.getSizeInBits());
	if (NewOpc == I.getOpcode())
	return false;

	I.setDesc(TII.get(NewOpc));

	uint64_t Offset = 0;
	auto *PtrMI = MRI.getVRegDef(PtrReg);

	// Try to fold a GEP into our unsigned immediate addressing mode.
	if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
	if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
	int64_t Imm = *COff;
	const unsigned Size = MemTy.getSizeInBits() / 8;
	const unsigned Scale = Log2_32(Size);
	// Foldable only if the offset is a non-negative multiple of the access
	// size below 0x1000 * Size; the encoded immediate is Imm / Size.
	if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
	unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
	I.getOperand(1).setReg(Ptr2Reg);
	PtrMI = MRI.getVRegDef(Ptr2Reg);
	Offset = Imm / Size;
	}
	}
	}

	// If we haven't folded anything into our addressing mode yet, try to fold
	// a frame index into the base+offset.
	if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
	I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());

	I.addOperand(MachineOperand::CreateImm(Offset));

	// If we're storing a 0, use WZR/XZR.
	if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
	if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
	if (I.getOpcode() == AArch64::STRWui)
	I.getOperand(0).setReg(AArch64::WZR);
	else if (I.getOpcode() == AArch64::STRXui)
	I.getOperand(0).setReg(AArch64::XZR);
	}
	}

	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}

	case TargetOpcode::G_SMULH:
	case TargetOpcode::G_UMULH: {
	// Reject the various things we don't support yet.
	if (unsupportedBinOp(I, RBI, MRI, TRI))
	return false;

	const unsigned DefReg = I.getOperand(0).getReg();
	const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);

	if (RB.getID() != AArch64::GPRRegBankID) {
	DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
	return false;
	}

	if (Ty != LLT::scalar(64)) {
	DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
	<< ", expected: " << LLT::scalar(64) << '\n');
	return false;
	}

	unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
	: AArch64::UMULHrr;
	I.setDesc(TII.get(NewOpc));

	// Now that we selected an opcode, we need to constrain the register
	// operands to use appropriate classes.
	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}
	case TargetOpcode::G_FADD:
	case TargetOpcode::G_FSUB:
	case TargetOpcode::G_FMUL:
	case TargetOpcode::G_FDIV:

	case TargetOpcode::G_OR:
	case TargetOpcode::G_SHL:
	case TargetOpcode::G_LSHR:
	case TargetOpcode::G_ASHR:
	case TargetOpcode::G_GEP: {
	// Reject the various things we don't support yet.
	if (unsupportedBinOp(I, RBI, MRI, TRI))
	return false;

	const unsigned OpSize = Ty.getSizeInBits();

	const unsigned DefReg = I.getOperand(0).getReg();
	const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);

	// selectBinaryOp hands back the generic opcode unchanged when it has no
	// mapping for this bank/size; treat that as a selection failure.
	const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
	if (NewOpc == I.getOpcode())
	return false;

	I.setDesc(TII.get(NewOpc));
	// FIXME: Should the type be always reset in setDesc?

	// Now that we selected an opcode, we need to constrain the register
	// operands to use appropriate classes.
	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}

	case TargetOpcode::G_PTR_MASK: {
	uint64_t Align = I.getOperand(2).getImm();
	if (Align >= 64 \|\| Align == 0)
	return false;

	// Clear the low "Align" bits with an AND against the encoded mask.
	uint64_t Mask = ~((1ULL << Align) - 1);
	I.setDesc(TII.get(AArch64::ANDXri));
	I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));

	return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	}
	case TargetOpcode::G_PTRTOINT:
	case TargetOpcode::G_TRUNC: {
	const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
	const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());

	const unsigned DstReg = I.getOperand(0).getReg();
	const unsigned SrcReg = I.getOperand(1).getReg();

	const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
	const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);

	if (DstRB.getID() != SrcRB.getID()) {
	DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
	return false;
	}

	// On GPR the operation becomes a COPY (reading sub_32 of the source when
	// going from a 64-bit to a 32-bit class); on FPR only the v4s32 -> v4s16
	// truncation is handled.
	if (DstRB.getID() == AArch64::GPRRegBankID) {
	const TargetRegisterClass *DstRC =
	getRegClassForTypeOnBank(DstTy, DstRB, RBI);
	if (!DstRC)
	return false;

	const TargetRegisterClass *SrcRC =
	getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
	if (!SrcRC)
	return false;

	if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) \|\|
	!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
	DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
	return false;
	}

	if (DstRC == SrcRC) {
	// Nothing to be done
	} else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
	SrcTy == LLT::scalar(64)) {
	llvm_unreachable("TableGen can import this case");
	return false;
	} else if (DstRC == &AArch64::GPR32RegClass &&
	SrcRC == &AArch64::GPR64RegClass) {
	I.getOperand(1).setSubReg(AArch64::sub_32);
	} else {
	DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
	return false;
	}

	I.setDesc(TII.get(TargetOpcode::COPY));
	return true;
	} else if (DstRB.getID() == AArch64::FPRRegBankID) {
	if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
	I.setDesc(TII.get(AArch64::XTNv4i16));
	constrainSelectedInstRegOperands(I, TII, TRI, RBI);
	return true;
	}
	}

	return false;
	}

	case TargetOpcode::G_ANYEXT: {
	const unsigned DstReg = I.getOperand(0).getReg();
	const unsigned SrcReg = I.getOperand(1).getReg();

	const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
	if (RBDst.getID() != AArch64::GPRRegBankID) {
	DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n");
	return false;
	}

	const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
	if (RBSrc.getID() != AArch64::GPRRegBankID) {
	DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n");
	return false;
	}

	const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();

	if (DstSize == 0) {
	DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
	return false;
	}

	// Accept any destination of at most 32 bits, or exactly 64 bits.
	if (DstSize != 64 && DstSize > 32) {
	DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
	<< ", expected: 32 or 64\n");
	return false;
	}
	// At this point G_ANYEXT is just like a plain COPY, but we need
	// to explicitly form the 64-bit value if any.
	if (DstSize > 32) {
	unsigned ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
	BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
	.addDef(ExtSrc)
	.addImm(0)
	.addUse(SrcReg)
	.addImm(AArch64::sub_32);
	I.getOperand(1).setReg(ExtSrc);
	}
	return selectCopy(I, TII, MRI, TRI, RBI);
	}

	case TargetOpcode::G_ZEXT:
	case TargetOpcode::G_SEXT: {
	unsigned Opcode = I.getOpcode();
	const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
	SrcTy = MRI.getType(I.getOperand(1).getReg());
	const bool isSigned = Opcode == TargetOpcode::G_SEXT;
	const unsigned DefReg = I.getOperand(0).getReg();
	const unsigned SrcReg = I.getOperand(1).getReg();
	const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);

	if (RB.getID() != AArch64::GPRRegBankID) {
	DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
	<< ", expected: GPR\n");
	return false;
	}

	// Both forms use a bitfield move ([SU]BFM) with immediates 0 and
	// "source width - 1". For a 64-bit result the source is first widened to a
	// 64-bit register with SUBREG_TO_REG.
	MachineInstr *ExtI;
	if (DstTy == LLT::scalar(64)) {
	// FIXME: Can we avoid manually doing this?
	if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
	DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
	<< " operand\n");
	return false;
	}

	const unsigned SrcXReg =
	MRI.createVirtualRegister(&AArch64::GPR64RegClass);
	BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
	.addDef(SrcXReg)
	.addImm(0)
	.addUse(SrcReg)
	.addImm(AArch64::sub_32);

	const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri;
	ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
	.addDef(DefReg)
	.addUse(SrcXReg)
	.addImm(0)
	.addImm(SrcTy.getSizeInBits() - 1);
	} else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) {
	const unsigned NewOpc = isSigned ? AArch64::SBFMWri : AArch64::UBFMWri;
	ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
	.addDef(DefReg)
	.addUse(SrcReg)
	.addImm(0)
	.addImm(SrcTy.getSizeInBits() - 1);
	} else {
	return false;
	}

	constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);

	I.eraseFromParent();
	return true;
	}

	case TargetOpcode::G_SITOFP:
	case TargetOpcode::G_UITOFP:
	case TargetOpcode::G_FPTOSI:
	case TargetOpcode::G_FPTOUI: {
	const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
	SrcTy = MRI.getType(I.getOperand(1).getReg());
	// selectFPConvOpc hands back the generic opcode unchanged when it has no
	// mapping for this type pair; treat that as a selection failure.
	const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
	if (NewOpc == Opcode)
	return false;

	I.setDesc(TII.get(NewOpc));
	constrainSelectedInstRegOperands(I, TII, TRI, RBI);

	return true;
	}


	case TargetOpcode::G_INTTOPTR:
	// The importer is currently unable to import pointer types since they
	// didn't exist in SelectionDAG.
	return selectCopy(I, TII, MRI, TRI, RBI);

	case TargetOpcode::G_BITCAST:
	// Imported SelectionDAG rules can handle every bitcast except those that
	// bitcast from a type to the same type. Ideally, these shouldn't occur
	// but we might not run an optimizer that deletes them.
	if (MRI.getType(I.getOperand(0).getReg()) ==
	MRI.getType(I.getOperand(1).getReg()))
	return selectCopy(I, TII, MRI, TRI, RBI);
	return false;

	case TargetOpcode::G_SELECT: {
	// NOTE(review): the check is on the condition (operand 1), but the debug
	// message prints Ty, which is the type of operand 0.
	if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
	DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
	<< ", expected: " << LLT::scalar(1) << '\n');
	return false;
	}

	const unsigned CondReg = I.getOperand(1).getReg();
	const unsigned TReg = I.getOperand(2).getReg();
	const unsigned FReg = I.getOperand(3).getReg();

	unsigned CSelOpc = 0;

	if (Ty == LLT::scalar(32)) {
	CSelOpc = AArch64::CSELWr;
	} else if (Ty == LLT::scalar(64) \|\| Ty == LLT::pointer(0, 64)) {
	CSelOpc = AArch64::CSELXr;
	} else {
	return false;
	}

	// ANDS the condition with the logical immediate 1 (result discarded into
	// WZR), then pick TReg when the NE condition holds and FReg otherwise.
	MachineInstr &TstMI =
	*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
	.addDef(AArch64::WZR)
	.addUse(CondReg)
	.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));

	MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc))
	.addDef(I.getOperand(0).getReg())
	.addUse(TReg)
	.addUse(FReg)
	.addImm(AArch64CC::NE);

	constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI);
	constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI);

	I.eraseFromParent();
	return true;
	}
	case TargetOpcode::G_ICMP: {
	if (Ty != LLT::scalar(32)) {
	DEBUG(dbgs() << "G_ICMP result has type: " << Ty
	<< ", expected: " << LLT::scalar(32) << '\n');
	return false;
	}

	unsigned CmpOpc = 0;
	unsigned ZReg = 0;

	// The compare is a SUBS whose result is discarded into the zero register
	// of the matching width.
	LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
	if (CmpTy == LLT::scalar(32)) {
	CmpOpc = AArch64::SUBSWrr;
	ZReg = AArch64::WZR;
	} else if (CmpTy == LLT::scalar(64) \|\| CmpTy.isPointer()) {
	CmpOpc = AArch64::SUBSXrr;
	ZReg = AArch64::XZR;
	} else {
	return false;
	}

	// CSINC increments the result by one when the condition code is false.
	// Therefore, we have to invert the predicate to get an increment by 1 when
	// the predicate is true.
	const AArch64CC::CondCode invCC =
	changeICMPPredToAArch64CC(CmpInst::getInversePredicate(
	(CmpInst::Predicate)I.getOperand(1).getPredicate()));

	MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
	.addDef(ZReg)
	.addUse(I.getOperand(2).getReg())
	.addUse(I.getOperand(3).getReg());

	MachineInstr &CSetMI =
	*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
	.addDef(I.getOperand(0).getReg())
	.addUse(AArch64::WZR)
	.addUse(AArch64::WZR)
	.addImm(invCC);

	constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
	constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);

	I.eraseFromParent();
	return true;
	}

	case TargetOpcode::G_FCMP: {
	if (Ty != LLT::scalar(32)) {
	DEBUG(dbgs() << "G_FCMP result has type: " << Ty
	<< ", expected: " << LLT::scalar(32) << '\n');
	return false;
	}

	unsigned CmpOpc = 0;
	LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
	if (CmpTy == LLT::scalar(32)) {
	CmpOpc = AArch64::FCMPSrr;
	} else if (CmpTy == LLT::scalar(64)) {
	CmpOpc = AArch64::FCMPDrr;
	} else {
	return false;
	}

	// FIXME: regbank

	// A predicate maps to one or two condition codes; CC2 != AL is used below
	// as the signal that a second code is needed.
	AArch64CC::CondCode CC1, CC2;
	changeFCMPPredToAArch64CC(
	(CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);

	MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
	.addUse(I.getOperand(2).getReg())
	.addUse(I.getOperand(3).getReg());

	// With a single condition code the CSINC writes the result directly;
	// with two, each CSINC writes a temporary and the two are ORed into DefReg.
	const unsigned DefReg = I.getOperand(0).getReg();
	unsigned Def1Reg = DefReg;
	if (CC2 != AArch64CC::AL)
	Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);

	MachineInstr &CSetMI =
	*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
	.addDef(Def1Reg)
	.addUse(AArch64::WZR)
	.addUse(AArch64::WZR)
	.addImm(getInvertedCondCode(CC1));

	if (CC2 != AArch64CC::AL) {
	unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
	MachineInstr &CSet2MI =
	*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
	.addDef(Def2Reg)
	.addUse(AArch64::WZR)
	.addUse(AArch64::WZR)
	.addImm(getInvertedCondCode(CC2));
	MachineInstr &OrMI =
	*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
	.addDef(DefReg)
	.addUse(Def1Reg)
	.addUse(Def2Reg);
	constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI);
	constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI);
	}

	constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
	constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);

	I.eraseFromParent();
	return true;
	}
	case TargetOpcode::G_VASTART:
	return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
	: selectVaStartAAPCS(I, MF, MRI);
	case TargetOpcode::G_IMPLICIT_DEF:
	I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
	return true;
	}

	// Any opcode without a case above is not handled here.
	return false;
	}

	/// SelectArithImmed - Select an immediate value that can be represented as
	/// a 12-bit value shifted left by either 0 or 12.
	///
	/// \p Root may be an immediate, a ConstantInt immediate, or a register
	/// defined by a G_CONSTANT whose ConstantInt is at most 64 bits wide.
	///
	/// \returns None if \p Root is not such a constant or the value does not
	/// fit the encoding. Otherwise returns two renderers: the first adds the
	/// 12-bit value (already shifted down when a shift of 12 is used), the
	/// second adds the LSL shifter operand for a shift amount of 0 or 12.
	InstructionSelector::ComplexRendererFns
	AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
	MachineInstr &MI = *Root.getParent();
	MachineBasicBlock &MBB = *MI.getParent();
	MachineFunction &MF = *MBB.getParent();
	MachineRegisterInfo &MRI = MF.getRegInfo();

	// This function is called from the addsub_shifted_imm ComplexPattern,
	// which lists [imm] as the list of opcode it's interested in, however
	// we still need to check whether the operand is actually an immediate
	// here because the ComplexPattern opcode list is only used in
	// root-level opcode matching.
	uint64_t Immed;
	if (Root.isImm())
	Immed = Root.getImm();
	else if (Root.isCImm())
	Immed = Root.getCImm()->getZExtValue();
	else if (Root.isReg()) {
	// NOTE(review): Def is dereferenced without a null check, unlike the
	// getVRegDef results in selectAddrModeUnscaled.
	MachineInstr *Def = MRI.getVRegDef(Root.getReg());
	if (Def->getOpcode() != TargetOpcode::G_CONSTANT)
	return None;
	MachineOperand &Op1 = Def->getOperand(1);
	if (!Op1.isCImm() \|\| Op1.getCImm()->getBitWidth() > 64)
	return None;
	Immed = Op1.getCImm()->getZExtValue();
	} else
	return None;

	unsigned ShiftAmt;

	// Fits in the low 12 bits: no shift. Low 12 bits clear and nothing above
	// bit 23: encode as (Immed >> 12) with LSL #12. Anything else is rejected.
	if (Immed >> 12 == 0) {
	ShiftAmt = 0;
	} else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
	ShiftAmt = 12;
	Immed = Immed >> 12;
	} else
	return None;

	unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
	return {{
	[=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
	[=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
	}};
	}

	/// Select a "register plus unscaled signed 9-bit immediate" address. This
	/// should only match when there is an offset that is not valid for a scaled
	/// immediate addressing mode. The "Size" argument is the size in bytes of the
	/// memory reference, which is needed here to know what is valid for a scaled
	/// immediate.
	///
	/// \returns None unless \p Root is a register for which
	/// isBaseWithConstantOffset() holds, its offset operand is defined by a
	/// G_CONSTANT holding a ConstantInt of at most 64 bits, and the
	/// sign-extended offset lies in [-256, 256) while not being encodable as a
	/// scaled immediate. On a match, returns two renderers: one adding the base
	/// operand (operand 1 of Root's definition) and one adding the offset.
	InstructionSelector::ComplexRendererFns
	AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
	unsigned Size) const {
	MachineRegisterInfo &MRI =
	Root.getParent()->getParent()->getParent()->getRegInfo();

	if (!Root.isReg())
	return None;

	if (!isBaseWithConstantOffset(Root, MRI))
	return None;

	MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
	if (!RootDef)
	return None;

	// Operand 2 of the defining instruction is the offset; it must itself be
	// a register defined by a G_CONSTANT.
	MachineOperand &OffImm = RootDef->getOperand(2);
	if (!OffImm.isReg())
	return None;
	MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
	if (!RHS \|\| RHS->getOpcode() != TargetOpcode::G_CONSTANT)
	return None;
	int64_t RHSC;
	MachineOperand &RHSOp1 = RHS->getOperand(1);
	if (!RHSOp1.isCImm() \|\| RHSOp1.getCImm()->getBitWidth() > 64)
	return None;
	RHSC = RHSOp1.getCImm()->getSExtValue();

	// If the offset is valid as a scaled immediate, don't match here.
	if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
	return None;
	if (RHSC >= -256 && RHSC < 256) {
	MachineOperand &Base = RootDef->getOperand(1);
	return {{
	[=](MachineInstrBuilder &MIB) { MIB.add(Base); },
	[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
	}};
	}
	return None;
	}

	/// Select a "register plus scaled unsigned 12-bit immediate" address. The
	/// "Size" argument is the size in bytes of the memory reference, which
	/// determines the scale.
	///
	/// Cases are tried in this order:
	///  - Root is defined by a G_FRAME_INDEX: render that frame index with
	///    offset 0.
	///  - isBaseWithConstantOffset() holds and the offset is a non-negative
	///    multiple of Size below 0x1000 * Size: render the base (or its frame
	///    index when the base is a G_FRAME_INDEX) with the offset divided by
	///    Size.
	///  - selectAddrModeUnscaled() would match: return None so that the
	///    unscaled form is used instead.
	///  - Otherwise: render Root itself with offset 0.
	///
	/// \returns None when Root is not a register, has no definition, or the
	/// unscaled form is preferred; two renderers (base, immediate) otherwise.
	InstructionSelector::ComplexRendererFns
	AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
	unsigned Size) const {
	MachineRegisterInfo &MRI =
	Root.getParent()->getParent()->getParent()->getRegInfo();

	if (!Root.isReg())
	return None;

	MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
	if (!RootDef)
	return None;

	if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
	return {{
	[=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
	[=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
	}};
	}

	if (isBaseWithConstantOffset(Root, MRI)) {
	MachineOperand &LHS = RootDef->getOperand(1);
	MachineOperand &RHS = RootDef->getOperand(2);
	MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
	MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
	if (LHSDef && RHSDef) {
	// NOTE(review): operand 1 of RHSDef is read as a CImm without checking
	// its opcode or operand kind here; presumably isBaseWithConstantOffset
	// guarantees a G_CONSTANT definition — confirm.
	int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
	unsigned Scale = Log2_32(Size);
	if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
	if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
	return {{
	[=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
	[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
	}};

	return {{
	[=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
	[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
	}};
	}
	}
	}

	// Before falling back to our general case, check if the unscaled
	// instructions can handle this. If so, that's preferable.
	if (selectAddrModeUnscaled(Root, Size).hasValue())
	return None;

	return {{
	[=](MachineInstrBuilder &MIB) { MIB.add(Root); },
	[=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
	}};
	}

	namespace llvm {
	/// Factory for the AArch64 GlobalISel instruction selector.
	///
	/// Forwards \p TM, \p Subtarget and \p RBI to the
	/// AArch64InstructionSelector constructor and returns the heap-allocated
	/// object as a raw pointer; nothing in this function releases it.
	InstructionSelector *
	createAArch64InstructionSelector(const AArch64TargetMachine &TM,
	AArch64Subtarget &Subtarget,
	AArch64RegisterBankInfo &RBI) {
	return new AArch64InstructionSelector(TM, Subtarget, RBI);
	}
	}
	Index: vendor/llvm/dist-release_60/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp (revision 328362)
	@@ -1,2124 +1,2104 @@
	//===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "Hexagon.h"
	#include "HexagonISelDAGToDAG.h"
	#include "HexagonISelLowering.h"
	#include "HexagonTargetMachine.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/SelectionDAGISel.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"

	#include <deque>
	#include <map>
	#include <set>
	#include <utility>
	#include <vector>

	#define DEBUG_TYPE "hexagon-isel"

	using namespace llvm;

	namespace {

	// --------------------------------------------------------------------
	// Implementation of permutation networks.

	// Implementation of the node routing through butterfly networks:
	// - Forward delta.
	// - Reverse delta.
	// - Benes.
	//
	//
	// Forward delta network consists of log(N) steps, where N is the number
	// of inputs. In each step, an input can stay in place, or it can get
	// routed to another position[1]. The step after that consists of two
	// networks, each half in size in terms of the number of nodes. In those
	// terms, in the given step, an input can go to either the upper or the
	// lower network in the next step.
	//
	// [1] Hexagon's vdelta/vrdelta allow an element to be routed to both
	// positions as long as there is no conflict.

	// Here's a delta network for 8 inputs, only the switching routes are
	// shown:
	//
	// Steps:
	// |- 1 ---------------|- 2 -----|- 3 -|
	//
	// Inp[0] * * * * Out[0]
	// \ / \ / \ /
	// \ / \ / X
	// \ / \ / / \
	// Inp[1] * \ / * X * * Out[1]
	// \ \ / / \ / \ /
	// \ \ / / X X
	// \ \ / / / \ / \
	// Inp[2] * \ \ / / * X * * Out[2]
	// \ \ X / / / \ \ /
	// \ \ / \ / / / \ X
	// \ X X / / \ / \
	// Inp[3] * \ / \ / \ / * * * Out[3]
	// \ X X X /
	// \ / \ / \ / \ /
	// X X X X
	// / \ / \ / \ / \
	// / X X X \
	// Inp[4] * / \ / \ / \ * * * Out[4]
	// / X X \ \ / \ /
	// / / \ / \ \ \ / X
	// / / X \ \ \ / / \
	// Inp[5] * / / \ \ * X * * Out[5]
	// / / \ \ \ / \ /
	// / / \ \ X X
	// / / \ \ / \ / \
	// Inp[6] * / \ * X * * Out[6]
	// / \ / \ \ /
	// / \ / \ X
	// / \ / \ / \
	// Inp[7] * * * * Out[7]
	//
	//
	// Reverse delta network is same as delta network, with the steps in
	// the opposite order.
	//
	//
	// Benes network is a forward delta network immediately followed by
	// a reverse delta network.


	// Two-color graph partitioning helper. The two color classes correspond to
	// the nodes routed to the upper and to the lower network.
	struct Coloring {
	  // Color values stored in the color map.
	  enum : uint8_t {
	    None = 0,
	    Red,
	    Black
	  };

	  using Node = int;
	  using MapType = std::map<Node,uint8_t>;
	  // Marker for entries of the order that do not take part in the coloring.
	  static constexpr Node Ignore = Node(-1);

	  // Builds the edge table for Ord and attempts to color it. If coloring
	  // fails, the color map is left empty.
	  Coloring(ArrayRef<Node> Ord) : Order(Ord) {
	    build();
	    const bool Colored = color();
	    if (!Colored)
	      Colors.clear();
	  }

	  // Node-to-color map; empty when the coloring attempt failed.
	  const MapType &colors() const { return Colors; }

	  // Returns the color opposite to Color: Red maps to Black, every other
	  // value (None included) maps to Red.
	  uint8_t other(uint8_t Color) {
	    switch (Color) {
	    case Red:
	      return Black;
	    default:
	      return Red;
	    }
	  }

	  void dump() const;

	private:
	  ArrayRef<Node> Order;
	  MapType Colors;
	  std::set<Node> Needed;

	  using NodeSet = std::set<Node>;
	  std::map<Node,NodeSet> Edges;

	  // Position paired with Pos in the other half of the order.
	  Node conj(Node Pos) {
	    const Node Half = static_cast<Node>(Order.size()) / 2;
	    if (Pos < Half)
	      return Pos + Half;
	    return Pos - Half;
	  }

	  // Color recorded for N, or None when N has no entry in the map yet.
	  uint8_t getColor(Node N) {
	    auto Found = Colors.find(N);
	    if (Found == Colors.end())
	      return None;
	    return Found->second;
	  }

	  std::pair<bool,uint8_t> getUniqueColor(const NodeSet &Nodes);

	  void build();
	  bool color();
	};
	} // namespace

	// Inspects the colors already assigned to Nodes.
	// Returns { true, C } when all colored members share the single color C
	// (C is None if no member is colored yet), and { false, None } as soon as
	// two different colors are encountered.
	std::pair<bool,uint8_t> Coloring::getUniqueColor(const NodeSet &Nodes) {
	  uint8_t Seen = None;
	  for (Node N : Nodes) {
	    const uint8_t C = getColor(N);
	    // Uncolored members and members agreeing with the color seen so far
	    // do not affect the result.
	    if (C == None || C == Seen)
	      continue;
	    if (Seen != None)
	      return { false, None };
	    Seen = C;
	  }
	  return { true, Seen };
	}

	void Coloring::build() {
	// Add Order[P] and Order[conj(P)] to Edges.
	for (unsigned P = 0; P != Order.size(); ++P) {
	Node I = Order[P];
	if (I != Ignore) {
	Needed.insert(I);
	Node PC = Order[conj(P)];
	if (PC != Ignore && PC != I)
	Edges[I].insert(PC);
	}
	}
	// Add I and conj(I) to Edges.
	for (unsigned I = 0; I != Order.size(); ++I) {
	if (!Needed.count(I))
	continue;
	Node C = conj(I);
	// This will create an entry in the edge table, even if I is not
	// connected to any other node. This is necessary, because it still
	// needs to be colored.
	NodeSet &Is = Edges[I];
	if (Needed.count(C))
	Is.insert(C);
	}
	}

	// Assigns a color to the nodes of the graph described by Edges.
	// A node receives the color opposite to the single color shared by its
	// already-colored neighbors. Returns false when, in the first two passes,
	// the neighbors of a node already carry two different colors; returns true
	// otherwise, with every position of Order present in Colors.
	bool Coloring::color() {
	// FirstQ holds the needed nodes together with everything reachable from
	// them through Edges, in discovery order.
	SetVector<Node> FirstQ;
	auto Enqueue = [this,&FirstQ] (Node N) {
	SetVector<Node> Q;
	Q.insert(N);
	// Q grows while it is being scanned; the index-based loop picks up the
	// newly inserted neighbors. Note that Edges[...] creates missing entries.
	for (unsigned I = 0; I != Q.size(); ++I) {
	NodeSet &Ns = Edges[Q[I]];
	Q.insert(Ns.begin(), Ns.end());
	}
	FirstQ.insert(Q.begin(), Q.end());
	};
	for (Node N : Needed)
	Enqueue(N);

	// Color the queued nodes in order; give up if the neighbors disagree.
	for (Node N : FirstQ) {
	if (Colors.count(N))
	continue;
	NodeSet &Ns = Edges[N];
	auto P = getUniqueColor(Ns);
	if (!P.first)
	return false;
	Colors[N] = other(P.second);
	}

	// First, color nodes that don't have any dups.
	for (auto E : Edges) {
	Node N = E.first;
	if (!Needed.count(conj(N)) \|\| Colors.count(N))
	continue;
	auto P = getUniqueColor(E.second);
	if (!P.first)
	return false;
	Colors[N] = other(P.second);
	}

	// Now, nodes that are still uncolored. Since the graph can be modified
	// in this step, create a work queue.
	std::vector<Node> WorkQ;
	for (auto E : Edges) {
	Node N = E.first;
	if (!Colors.count(N))
	WorkQ.push_back(N);
	}

	for (unsigned I = 0; I < WorkQ.size(); ++I) {
	Node N = WorkQ[I];
	NodeSet &Ns = Edges[N];
	auto P = getUniqueColor(Ns);
	if (P.first) {
	Colors[N] = other(P.second);
	continue;
	}

	// Coloring failed. Split this node.
	// N gets other(None) and its conjugate C gets the opposite color; the
	// neighbors of N that have C's color are re-attached to C.
	Node C = conj(N);
	uint8_t ColorN = other(None);
	uint8_t ColorC = other(ColorN);
	NodeSet &Cs = Edges[C];
	// Iterate over a copy, since Ns is modified inside the loop.
	NodeSet CopyNs = Ns;
	for (Node M : CopyNs) {
	uint8_t ColorM = getColor(M);
	if (ColorM == ColorC) {
	// Connect M with C, disconnect M from N.
	Cs.insert(M);
	Edges[M].insert(C);
	Ns.erase(M);
	Edges[M].erase(N);
	}
	}
	Colors[N] = ColorN;
	Colors[C] = ColorC;
	}

	// Explicitly assign "None" to all uncolored nodes.
	for (unsigned I = 0; I != Order.size(); ++I)
	if (Colors.count(I) == 0)
	Colors[I] = None;

	return true;
	}

	// Debug helper: writes the order, the needed set, the edge table and the
	// color map to the debug stream.
	LLVM_DUMP_METHOD
	void Coloring::dump() const {
	  auto &OS = dbgs();

	  OS << "{ Order: {";
	  for (Node P : Order) {
	    if (P == Ignore)
	      OS << " -";
	    else
	      OS << ' ' << P;
	  }
	  OS << " }\n";

	  OS << " Needed: {";
	  for (Node N : Needed)
	    OS << ' ' << N;
	  OS << " }\n";

	  OS << " Edges: {\n";
	  for (const auto &Entry : Edges) {
	    OS << " " << Entry.first << " -> {";
	    for (Node N : Entry.second)
	      OS << ' ' << N;
	    OS << " }\n";
	  }
	  OS << " }\n";

	  // Printable names, indexed by the color value.
	  static const char *const Names[] = { "None", "Red", "Black" };
	  OS << " Colors: {\n";
	  for (const auto &Entry : Colors) {
	    OS << " " << Entry.first << " -> ";
	    OS << Names[Entry.second] << "\n";
	  }
	  OS << " }\n}\n";
	}

	namespace {
	// Base class for reordering networks. They don't strictly need to be
	// permutations, as outputs with repeated occurrences of an input element
	// are allowed.
	struct PermNetwork {
	  using Controls = std::vector<uint8_t>;
	  using ElemType = int;
	  // Marker for output positions whose source does not matter.
	  static constexpr ElemType Ignore = ElemType(-1);

	  // State of a single switch in the control table.
	  enum : uint8_t {
	    None,
	    Pass,
	    Switch
	  };
	  // Bit order used by getControls when packing the per-step controls.
	  enum : uint8_t {
	    Forward,
	    Reverse
	  };

	  // Copies Ord and allocates the control table: one row per element, with
	  // Mult*Log columns (Log = floor(log2(size))), all initialized to None.
	  PermNetwork(ArrayRef<ElemType> Ord, unsigned Mult = 1) {
	    Order.assign(Ord.data(), Ord.data()+Ord.size());
	    Log = 0;

	    unsigned S = Order.size();
	    while (S >>= 1)
	      ++Log;

	    Table.resize(Order.size());
	    for (RowType &Row : Table)
	      Row.resize(Mult*Log, None);
	  }

	  // Packs the controls of Log consecutive steps, starting at column
	  // StartAt, into one byte per element: a bit is set where the table
	  // holds Switch. With Dir == Forward the first step lands in the most
	  // significant of the Log bits, otherwise in the least significant one.
	  void getControls(Controls &V, unsigned StartAt, uint8_t Dir) const {
	    unsigned Size = Order.size();
	    V.resize(Size);
	    for (unsigned I = 0; I != Size; ++I) {
	      unsigned W = 0;
	      for (unsigned L = 0; L != Log; ++L) {
	        unsigned C = ctl(I, StartAt+L) == Switch;
	        if (Dir == Forward)
	          W |= C << (Log-1-L);
	        else
	          W |= C << L;
	      }
	      assert(isUInt<8>(W));
	      V[I] = uint8_t(W);
	    }
	  }

	  // Control stored for element position Pos at step Step.
	  uint8_t ctl(ElemType Pos, unsigned Step) const {
	    return Table[Pos][Step];
	  }
	  // Number of elements in the network.
	  unsigned size() const {
	    return Order.size();
	  }
	  // Number of steps of a single (non-doubled) network.
	  unsigned steps() const {
	    return Log;
	  }

	protected:
	  unsigned Log;
	  std::vector<ElemType> Order;
	  using RowType = std::vector<uint8_t>;
	  std::vector<RowType> Table;
	};

	// Forward delta network over the order Ord.
	struct ForwardDeltaNetwork : public PermNetwork {
	  ForwardDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord) {}

	  // Routes the whole order. On success fills V with the per-element
	  // control bytes (forward bit order) and returns true; returns false if
	  // the order cannot be routed.
	  bool run(Controls &V) {
	    if (!route(Order.data(), Table.data(), size(), 0))
	      return false;
	    getControls(V, 0, Forward);
	    return true;
	  }

	private:
	  // P and T point at the first element and the first table row of the
	  // sub-network of Size elements handled at step Step.
	  bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
	};

	// Reverse delta network over the order Ord.
	struct ReverseDeltaNetwork : public PermNetwork {
	  ReverseDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord) {}

	  // Routes the whole order. On success fills V with the per-element
	  // control bytes (reverse bit order) and returns true; returns false if
	  // the order cannot be routed.
	  bool run(Controls &V) {
	    if (!route(Order.data(), Table.data(), size(), 0))
	      return false;
	    getControls(V, 0, Reverse);
	    return true;
	  }

	private:
	  // P and T point at the first element and the first table row of the
	  // sub-network of Size elements handled at step Step.
	  bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
	};

	// Benes network over the order Ord. The control table has 2*Log columns:
	// the first Log are read out as the forward part, the last Log as the
	// reverse part.
	struct BenesNetwork : public PermNetwork {
	  BenesNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord, 2) {}

	  // Routes the whole order. On success fills F with the controls of the
	  // forward half and R with the controls of the reverse half and returns
	  // true; returns false if the order cannot be routed.
	  bool run(Controls &F, Controls &R) {
	    if (!route(Order.data(), Table.data(), size(), 0))
	      return false;

	    getControls(F, 0, Forward);
	    getControls(R, Log, Reverse);
	    return true;
	  }

	private:
	  // P and T point at the first element and the first table row of the
	  // sub-network of Size elements handled at step Step.
	  bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
	};
	} // namespace

	bool ForwardDeltaNetwork::route(ElemType P, RowType T, unsigned Size,
	unsigned Step) {
	bool UseUp = false, UseDown = false;
	ElemType Num = Size;

	// Cannot use coloring here, because coloring is used to determine
	// the "big" switch, i.e. the one that changes halves, and in a forward
	// network, a color can be simultaneously routed to both halves in the
	// step we're working on.
	for (ElemType J = 0; J != Num; ++J) {
	ElemType I = P[J];
	// I is the position in the input,
	// J is the position in the output.
	if (I == Ignore)
	continue;
	uint8_t S;
	if (I < Num/2)
	S = (J < Num/2) ? Pass : Switch;
	else
	S = (J < Num/2) ? Switch : Pass;

	// U is the element in the table that needs to be updated.
	ElemType U = (S == Pass) ? I : (I < Num/2 ? I+Num/2 : I-Num/2);
	if (U < Num/2)
	UseUp = true;
	else
	UseDown = true;
	if (T[U][Step] != S && T[U][Step] != None)
	return false;
	T[U][Step] = S;
	}

	for (ElemType J = 0; J != Num; ++J)
	if (P[J] != Ignore && P[J] >= Num/2)
	P[J] -= Num/2;

	if (Step+1 < Log) {
	if (UseUp && !route(P, T, Size/2, Step+1))
	return false;
	if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
	return false;
	}
	return true;
	}

	bool ReverseDeltaNetwork::route(ElemType P, RowType T, unsigned Size,
	unsigned Step) {
	unsigned Pets = Log-1 - Step;
	bool UseUp = false, UseDown = false;
	ElemType Num = Size;

	// In this step half-switching occurs, so coloring can be used.
	Coloring G({P,Size});
	const Coloring::MapType &M = G.colors();
	if (M.empty())
	return false;

	uint8_t ColorUp = Coloring::None;
	for (ElemType J = 0; J != Num; ++J) {
	ElemType I = P[J];
	// I is the position in the input,
	// J is the position in the output.
	if (I == Ignore)
	continue;
	uint8_t C = M.at(I);
	if (C == Coloring::None)
	continue;
	// During "Step", inputs cannot switch halves, so if the "up" color
	// is still unknown, make sure that it is selected in such a way that
	// "I" will stay in the same half.
	bool InpUp = I < Num/2;
	if (ColorUp == Coloring::None)
	ColorUp = InpUp ? C : G.other(C);
	if ((C == ColorUp) != InpUp) {
	// If I should go to a different half than where is it now, give up.
	return false;
	}

	uint8_t S;
	if (InpUp) {
	S = (J < Num/2) ? Pass : Switch;
	UseUp = true;
	} else {
	S = (J < Num/2) ? Switch : Pass;
	UseDown = true;
	}
	T[J][Pets] = S;
	}

	// Reorder the working permutation according to the computed switch table
	// for the last step (i.e. Pets).
	for (ElemType J = 0, E = Size / 2; J != E; ++J) {
	ElemType PJ = P[J]; // Current values of P[J]
	ElemType PC = P[J+Size/2]; // and P[conj(J)]
	ElemType QJ = PJ; // New values of P[J]
	ElemType QC = PC; // and P[conj(J)]
	if (T[J][Pets] == Switch)
	QC = PJ;
	if (T[J+Size/2][Pets] == Switch)
	QJ = PC;
	P[J] = QJ;
	P[J+Size/2] = QC;
	}

	for (ElemType J = 0; J != Num; ++J)
	if (P[J] != Ignore && P[J] >= Num/2)
	P[J] -= Num/2;

	if (Step+1 < Log) {
	if (UseUp && !route(P, T, Size/2, Step+1))
	return false;
	if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
	return false;
	}
	return true;
	}

	bool BenesNetwork::route(ElemType P, RowType T, unsigned Size,
	unsigned Step) {
	Coloring G({P,Size});
	const Coloring::MapType &M = G.colors();
	if (M.empty())
	return false;
	ElemType Num = Size;

	unsigned Pets = 2*Log-1 - Step;
	bool UseUp = false, UseDown = false;

	// Both assignments, i.e. Red->Up and Red->Down are valid, but they will
	// result in different controls. Let's pick the one where the first
	// control will be "Pass".
	uint8_t ColorUp = Coloring::None;
	for (ElemType J = 0; J != Num; ++J) {
	ElemType I = P[J];
	if (I == Ignore)
	continue;
	uint8_t C = M.at(I);
	if (C == Coloring::None)
	continue;
	if (ColorUp == Coloring::None) {
	ColorUp = (I < Num/2) ? Coloring::Red : Coloring::Black;
	}
	unsigned CI = (I < Num/2) ? I+Num/2 : I-Num/2;
	if (C == ColorUp) {
	if (I < Num/2)
	T[I][Step] = Pass;
	else
	T[CI][Step] = Switch;
	T[J][Pets] = (J < Num/2) ? Pass : Switch;
	UseUp = true;
	} else { // Down
	if (I < Num/2)
	T[CI][Step] = Switch;
	else
	T[I][Step] = Pass;
	T[J][Pets] = (J < Num/2) ? Switch : Pass;
	UseDown = true;
	}
	}

	// Reorder the working permutation according to the computed switch table
	// for the last step (i.e. Pets).
	for (ElemType J = 0; J != Num/2; ++J) {
	ElemType PJ = P[J]; // Current values of P[J]
	ElemType PC = P[J+Num/2]; // and P[conj(J)]
	ElemType QJ = PJ; // New values of P[J]
	ElemType QC = PC; // and P[conj(J)]
	if (T[J][Pets] == Switch)
	QC = PJ;
	if (T[J+Num/2][Pets] == Switch)
	QJ = PC;
	P[J] = QJ;
	P[J+Num/2] = QC;
	}

	for (ElemType J = 0; J != Num; ++J)
	if (P[J] != Ignore && P[J] >= Num/2)
	P[J] -= Num/2;

	if (Step+1 < Log) {
	if (UseUp && !route(P, T, Size/2, Step+1))
	return false;
	if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
	return false;
	}
	return true;
	}

	// --------------------------------------------------------------------
	// Support for building selection results (output instructions that are
	// parts of the final selection).

	namespace {
	// Operand of a node template. It is either a direct SDValue, or an
	// encoded reference (OpN) to a previously pushed result, an undef value,
	// or the "invalid" marker used to report failure.
	struct OpRef {
	  OpRef(SDValue V) : OpV(V) {}
	  // True if this operand wraps an actual SDValue.
	  bool isValue() const { return OpV.getNode() != nullptr; }
	  // True unless this is the failure marker.
	  bool isValid() const { return isValue() || !(OpN & Invalid); }
	  // Reference to the whole result with index N. The index is kept in
	  // IndexBits bits; consumers sign-extend it, so N may be negative.
	  static OpRef res(int N) { return OpRef(Whole | (N & Index)); }
	  // The failure marker.
	  static OpRef fail() { return OpRef(Invalid); }

	  // Reference to the low half of the result (or undef) denoted by R.
	  static OpRef lo(const OpRef &R) {
	    assert(!R.isValue());
	    return OpRef(R.OpN & (Undef | Index | LoHalf));
	  }
	  // Reference to the high half of the result (or undef) denoted by R.
	  static OpRef hi(const OpRef &R) {
	    assert(!R.isValue());
	    return OpRef(R.OpN & (Undef | Index | HiHalf));
	  }
	  // Undef value of type Ty; the simple type is stored in the index bits.
	  static OpRef undef(MVT Ty) { return OpRef(Undef | Ty.SimpleTy); }

	  // Direct value.
	  SDValue OpV = SDValue();

	  // Encoded reference, used when OpV is empty:
	  //   bit 31     - the operand is undef (bits 27..0 hold the value type),
	  //   bit 30     - the high half of the referenced result is used,
	  //   bit 29     - the low half of the referenced result is used
	  //                (both set: the whole result),
	  //   bit 28     - the reference is invalid,
	  //   bits 27..0 - index of the referenced result.
	  unsigned OpN = 0;

	  enum : unsigned {
	    Invalid = 0x10000000,
	    LoHalf = 0x20000000,
	    HiHalf = 0x40000000,
	    Whole = LoHalf | HiHalf,
	    Undef = 0x80000000,
	    Index = 0x0FFFFFFF, // Mask of the index value.
	    IndexBits = 28,
	  };

	  void print(raw_ostream &OS, const SelectionDAG &G) const;

	private:
	  OpRef(unsigned N) : OpN(N) {}
	};

	// Description of a single node to be created later: opcode, result type
	// and the list of operand references.
	struct NodeTemplate {
	NodeTemplate() = default;
	// Opcode of the node; 0 until set.
	unsigned Opc = 0;
	// Result type; MVT::Other means "not set yet".
	MVT Ty = MVT::Other;
	// Operands, in order.
	std::vector<OpRef> Ops;

	// Prints the type, the opcode name and the operands to OS.
	void print(raw_ostream &OS, const SelectionDAG &G) const;
	};

	// Ordered list of node templates generated for the input node Inp.
	// Templates are appended with push(); the index of a template is what
	// OpRef::res() refers to.
	struct ResultStack {
	  ResultStack(SDNode *Inp)
	    : InpNode(Inp), InpTy(Inp->getValueType(0).getSimpleVT()) {}
	  SDNode *InpNode;
	  MVT InpTy;

	  // Appends a copy of Res and returns its index.
	  unsigned push(const NodeTemplate &Res) {
	    List.push_back(Res);
	    return List.size()-1;
	  }
	  // Builds a template from its parts, appends it and returns its index.
	  unsigned push(unsigned Opc, MVT Ty, std::vector<OpRef> &&Ops) {
	    NodeTemplate Res;
	    Res.Opc = Opc;
	    Res.Ty = Ty;
	    // Ops is an rvalue reference parameter, but as a named variable it is
	    // an lvalue: move explicitly instead of copying the vector.
	    Res.Ops = std::move(Ops);
	    return push(Res);
	  }
	  bool empty() const { return List.empty(); }
	  unsigned size() const { return List.size(); }
	  // Index of the most recently pushed template. Only meaningful when the
	  // stack is not empty.
	  unsigned top() const { return size()-1; }
	  const NodeTemplate &operator[](unsigned I) const { return List[I]; }
	  // Resizes the list so that NewTop becomes the index of the last entry.
	  unsigned reset(unsigned NewTop) {
	    List.resize(NewTop+1);
	    return NewTop;
	  }

	  using BaseType = std::vector<NodeTemplate>;
	  BaseType::iterator begin() { return List.begin(); }
	  BaseType::iterator end() { return List.end(); }
	  BaseType::const_iterator begin() const { return List.begin(); }
	  BaseType::const_iterator end() const { return List.end(); }

	  BaseType List;

	  void print(raw_ostream &OS, const SelectionDAG &G) const;
	};
	} // namespace

	// Prints a short description of the operand to OS: the wrapped node for a
	// direct value, "invalid", "undef", or "#<index>" optionally preceded by
	// "lo "/"hi " when only one half of the referenced result is used.
	void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const {
	  if (isValue()) {
	    OpV.getNode()->print(OS, &G);
	    return;
	  }
	  if (OpN & Invalid) {
	    OS << "invalid";
	    return;
	  }
	  if (OpN & Undef) {
	    OS << "undef";
	    return;
	  }
	  if ((OpN & Whole) != Whole) {
	    assert((OpN & Whole) == LoHalf || (OpN & Whole) == HiHalf);
	    if (OpN & LoHalf)
	      OS << "lo ";
	    else
	      OS << "hi ";
	  }
	  // The index is stored in IndexBits bits and may be negative.
	  OS << '#' << SignExtend32(OpN & Index, IndexBits);
	}

	// Prints the template as "<type> <opcode-name> <op>, <op>, ...".
	void NodeTemplate::print(raw_ostream &OS, const SelectionDAG &G) const {
	  const TargetInstrInfo &TII = *G.getSubtarget().getInstrInfo();
	  OS << format("%8s", EVT(Ty).getEVTString().c_str());
	  OS << " " << TII.getName(Opc);

	  // Every operand is preceded by a space; all but the first one are
	  // additionally preceded by a comma.
	  for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
	    if (I != 0)
	      OS << ',';
	    OS << ' ';
	    Ops[I].print(OS, G);
	  }
	}

	// Prints the input node (only in builds without NDEBUG) followed by the
	// numbered list of result templates.
	void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
	  OS << "Input node:\n";
	#ifndef NDEBUG
	  InpNode->dumpr(&G);
	#endif
	  OS << "Result templates:\n";
	  unsigned Pos = 0;
	  for (const NodeTemplate &Tmpl : List) {
	    OS << '[' << Pos << "] ";
	    Tmpl.print(OS, G);
	    OS << '\n';
	    ++Pos;
	  }
	}

	namespace {
	// Non-owning view of a shuffle mask, together with the smallest and the
	// largest source index that occurs in it (both -1 if the mask contains
	// nothing but -1 entries).
	struct ShuffleMask {
	  ShuffleMask(ArrayRef<int> M) : Mask(M) {
	    for (int Src : Mask) {
	      if (Src == -1)
	        continue;
	      if (MinSrc == -1 || Src < MinSrc)
	        MinSrc = Src;
	      if (MaxSrc == -1 || Src > MaxSrc)
	        MaxSrc = Src;
	    }
	  }

	  ArrayRef<int> Mask;
	  int MinSrc = -1, MaxSrc = -1;

	  // View of the first half of the mask.
	  ShuffleMask lo() const {
	    return ShuffleMask(Mask.take_front(Mask.size()/2));
	  }
	  // View of the last half of the mask.
	  ShuffleMask hi() const {
	    return ShuffleMask(Mask.take_back(Mask.size()/2));
	  }
	};
	} // namespace

	// --------------------------------------------------------------------
	// The HvxSelector class.

	// Returns the target lowering info of G, downcast to the Hexagon class.
	static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) {
	  const auto &Lowering =
	      static_cast<const HexagonTargetLowering&>(G.getTargetLoweringInfo());
	  return Lowering;
	}
	// Returns the subtarget of G, downcast to the Hexagon class.
	static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) {
	  const auto &Subtarget =
	      static_cast<const HexagonSubtarget&>(G.getSubtarget());
	  return Subtarget;
	}

	namespace llvm {
	// Selection helper for HVX vector nodes. It caches the lowering object,
	// the subtarget and the hardware vector length (HwLen, obtained from
	// HexagonSubtarget::getVectorLength()) for the DAG it works on.
	struct HvxSelector {
	const HexagonTargetLowering &Lower;
	HexagonDAGToDAGISel &ISel;
	SelectionDAG &DAG;
	const HexagonSubtarget &HST;
	const unsigned HwLen;

	HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G)
	: Lower(getHexagonLowering(G)), ISel(HS), DAG(G),
	HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {}

	// Vector type with element type ElemTy that spans HwLen bytes.
	MVT getSingleVT(MVT ElemTy) const {
	unsigned NumElems = HwLen / (ElemTy.getSizeInBits()/8);
	return MVT::getVectorVT(ElemTy, NumElems);
	}

	// Vector type with element type ElemTy that spans 2*HwLen bytes.
	MVT getPairVT(MVT ElemTy) const {
	unsigned NumElems = (2*HwLen) / (ElemTy.getSizeInBits()/8);
	return MVT::getVectorVT(ElemTy, NumElems);
	}

	// Entry points.
	void selectShuffle(SDNode *N);
	void selectRor(SDNode *N);

	private:
	// Creates the nodes described by Results and replaces the input node.
	void materialize(const ResultStack &Results);

	SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);

	// Option flags accepted by packs().
	enum : unsigned {
	None,
	PackMux,
	};
	// Helpers that append templates to Results and return a reference to the
	// produced value, or OpRef::fail().
	OpRef concat(OpRef Va, OpRef Vb, ResultStack &Results);
	OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
	MutableArrayRef<int> NewMask, unsigned Options = None);
	OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
	MutableArrayRef<int> NewMask);
	- OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results);
	OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
	ResultStack &Results);
	OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
	ResultStack &Results);

	// Shuffles of one (1) or two (2) inputs, producing a single vector (s)
	// or a vector pair (p).
	OpRef shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results);
	OpRef shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
	OpRef shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results);
	OpRef shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);

	OpRef butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results);
	OpRef contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
	OpRef expanding(ShuffleMask SM, OpRef Va, ResultStack &Results);
	OpRef perfect(ShuffleMask SM, OpRef Va, ResultStack &Results);

	bool selectVectorConstants(SDNode *N);
	bool scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl, MVT ResTy,
	SDValue Va, SDValue Vb, SDNode *N);

	};
	}

	// Splits a two-input mask into two one-input masks of the same length.
	// An index below Mask.size() goes to MaskL unchanged, any other
	// non-negative index goes to MaskR reduced by Mask.size(); the
	// corresponding entry of the other mask, and both entries for a negative
	// index, are set to -1.
	static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
	                      MutableArrayRef<int> MaskR) {
	  const unsigned VecLen = Mask.size();
	  assert(MaskL.size() == VecLen && MaskR.size() == VecLen);
	  for (unsigned I = 0; I != VecLen; ++I) {
	    const int M = Mask[I];
	    int Left = -1, Right = -1;
	    if (M >= 0) {
	      if (unsigned(M) < VecLen)
	        Left = M;
	      else
	        Right = M-VecLen;
	    }
	    MaskL[I] = Left;
	    MaskR[I] = Right;
	  }
	}

	// Finds the longest prefix of A, at most MaxLen elements long, in which
	// consecutive elements differ by exactly Inc.
	// Returns { A[0], length of that prefix }. A must not be empty and must
	// hold at least MaxLen elements.
	static std::pair<int,unsigned> findStrip(ArrayRef<int> A, int Inc,
	                                         unsigned MaxLen) {
	  assert(A.size() > 0 && A.size() >= MaxLen);
	  int First = A[0];
	  int Prev = First;
	  // Use '<' rather than '!=': the loop starts at 1, so with MaxLen == 0
	  // a '!=' test would never become false and would read past the array.
	  for (unsigned I = 1; I < MaxLen; ++I) {
	    if (A[I] - Prev != Inc)
	      return { First, I };
	    Prev = A[I];
	  }
	  return { First, MaxLen };
	}

	// True if every element of Mask is -1 (this includes the empty mask).
	static bool isUndef(ArrayRef<int> Mask) {
	  return all_of(Mask, [](int Idx) { return Idx == -1; });
	}

	// True if every non-negative element of Mask equals its own position.
	// Negative elements are accepted at any position.
	static bool isIdentity(ArrayRef<int> Mask) {
	  const int Len = Mask.size();
	  for (int Pos = 0; Pos != Len; ++Pos) {
	    const int Src = Mask[Pos];
	    if (Src < 0 || Src == Pos)
	      continue;
	    return false;
	  }
	  return true;
	}

	// True if Mask is a permutation of 0..N-1, where N = Mask.size(): every
	// element is within range and no index occurs twice. Any negative entry
	// (including -1) makes the result false.
	//
	// The index sum alone is not a sufficient test (e.g. {0,0,3,3} has the
	// same sum as {0,1,2,3}), so each index is tracked individually.
	static bool isPermutation(ArrayRef<int> Mask) {
	  const int N = Mask.size();
	  BitVector Seen(N);
	  for (int Idx : Mask) {
	    if (Idx < 0 || Idx >= N || Seen[Idx])
	      return false;
	    Seen.set(Idx);
	  }
	  return true;
	}

	// Selects the constant-pool loads reachable from N through operand edges.
	// Returns true if at least one such load was found and selected.
	bool HvxSelector::selectVectorConstants(SDNode *N) {
	  // Constant vectors are generated as loads from constant pools.
	  // Since they are generated during the selection process, the main
	  // selection algorithm is not aware of them. Select them directly
	  // here.
	  SmallVector<SDNode*,4> Loads;
	  // A SetVector keeps the visiting order and drops duplicates: a node
	  // shared by several users is examined once, and no load is recorded
	  // (and later selected) more than once.
	  SetVector<SDNode*> WorkQ;

	  // The DAG can change (due to CSE) during selection, so cache all the
	  // unselected nodes first to avoid traversing a mutating DAG.

	  // An unselected ISD::LOAD whose base pointer is AT_PCREL or CP applied
	  // to a TargetConstantPool node.
	  auto IsLoadToSelect = [] (SDNode *N) {
	    if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
	      SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
	      unsigned AddrOpc = Addr.getOpcode();
	      if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
	        if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
	          return true;
	    }
	    return false;
	  };

	  WorkQ.insert(N);
	  // WorkQ grows while it is scanned; the operands of a matching load are
	  // not followed.
	  for (unsigned i = 0; i != WorkQ.size(); ++i) {
	    SDNode *W = WorkQ[i];
	    if (IsLoadToSelect(W)) {
	      Loads.push_back(W);
	      continue;
	    }
	    for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
	      WorkQ.insert(W->getOperand(j).getNode());
	  }

	  for (SDNode *L : Loads)
	    ISel.Select(L);

	  return !Loads.empty();
	}

	// Creates the nodes described by the templates in Results, in order, and
	// replaces the input node of Results with the node created for the last
	// template. Does nothing when Results is empty.
	void HvxSelector::materialize(const ResultStack &Results) {
	DEBUG_WITH_TYPE("isel", {
	dbgs() << "Materializing\n";
	Results.print(dbgs(), DAG);
	});
	if (Results.empty())
	return;
	const SDLoc &dl(Results.InpNode);
	// Output[I] is the value created for template I.
	std::vector<SDValue> Output;

	for (unsigned I = 0, E = Results.size(); I != E; ++I) {
	const NodeTemplate &Node = Results[I];
	std::vector<SDValue> Ops;
	for (const OpRef &R : Node.Ops) {
	assert(R.isValid());
	if (R.isValue()) {
	Ops.push_back(R.OpV);
	continue;
	}
	if (R.OpN & OpRef::Undef) {
	// For undef operands the index bits hold the value type.
	MVT::SimpleValueType SVT = MVT::SimpleValueType(R.OpN & OpRef::Index);
	Ops.push_back(ISel.selectUndef(dl, MVT(SVT)));
	continue;
	}
	// R is an index of a result.
	unsigned Part = R.OpN & OpRef::Whole;
	int Idx = SignExtend32(R.OpN & OpRef::Index, OpRef::IndexBits);
	// A negative index is relative to the template being processed.
	if (Idx < 0)
	Idx += I;
	assert(Idx >= 0 && unsigned(Idx) < Output.size());
	SDValue Op = Output[Idx];
	MVT OpTy = Op.getValueType().getSimpleVT();
	if (Part != OpRef::Whole) {
	assert(Part == OpRef::LoHalf \|\| Part == OpRef::HiHalf);
	if (Op.getOpcode() == HexagonISD::VCOMBINE) {
	// Take the requested half directly from the VCOMBINE operands:
	// operand 0 for the high half, operand 1 for the low half.
	Op = (Part == OpRef::HiHalf) ? Op.getOperand(0) : Op.getOperand(1);
	} else {
	// Otherwise extract the corresponding subregister.
	MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
	OpTy.getVectorNumElements()/2);
	unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
	: Hexagon::vsub_hi;
	Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
	}
	}
	Ops.push_back(Op);
	} // for (R : Node.Ops)

	assert(Node.Ty != MVT::Other);
	// A COPY template creates no node; the node of its first operand is
	// forwarded instead.
	SDNode *ResN = (Node.Opc == TargetOpcode::COPY)
	? Ops.front().getNode()
	: DAG.getMachineNode(Node.Opc, dl, Node.Ty, Ops);
	Output.push_back(SDValue(ResN, 0));
	}

	SDNode *OutN = Output.back().getNode();
	SDNode *InpN = Results.InpNode;
	DEBUG_WITH_TYPE("isel", {
	dbgs() << "Generated node:\n";
	OutN->dumpr(&DAG);
	});

	ISel.ReplaceNode(InpN, OutN);
	// Select the constant-pool loads reachable from the new node, then
	// remove the nodes that became dead.
	selectVectorConstants(OutN);
	DAG.RemoveDeadNodes();
	}

	// Appends a REG_SEQUENCE template that builds a vector pair (register
	// class HvxWRRegClassID) with Lo as the vsub_lo and Hi as the vsub_hi
	// subregister, and returns a reference to it.
	OpRef HvxSelector::concat(OpRef Lo, OpRef Hi, ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  const SDLoc &dl(Results.InpNode);
	  std::vector<OpRef> SeqOps = {
	    DAG.getTargetConstant(Hexagon::HvxWRRegClassID, dl, MVT::i32),
	    Lo, DAG.getTargetConstant(Hexagon::vsub_lo, dl, MVT::i32),
	    Hi, DAG.getTargetConstant(Hexagon::vsub_hi, dl, MVT::i32),
	  };
	  const MVT PairTy = getPairVT(MVT::i8);
	  Results.push(TargetOpcode::REG_SEQUENCE, PairTy, std::move(SeqOps));
	  return OpRef::res(Results.top());
	}

	// Va, Vb are single vectors, SM can be arbitrarily long.
	//
	// Tries to produce one single vector that contains every element the
	// mask refers to, and writes to NewMask the mask rewritten in terms of
	// that vector. Returns a reference to the vector, or OpRef::fail() (in
	// which case the contents of NewMask are unspecified).
	//  - If the referenced source indices span fewer than HwLen bytes, the
	//    result is Va, Vb, or a valign of the two by MinSrc bytes.
	//  - Otherwise, if PackMux is set in Options, a byte-wise mux of Va and
	//    Vb is attempted.
	OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
	                         ResultStack &Results, MutableArrayRef<int> NewMask,
	                         unsigned Options) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  if (!Va.isValid() || !Vb.isValid())
	    return OpRef::fail();

	  int VecLen = SM.Mask.size();
	  MVT Ty = getSingleVT(MVT::i8);

	  if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
	    // Everything comes from Va: the mask can be used as it is.
	    if (SM.MaxSrc < int(HwLen)) {
	      memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
	      return Va;
	    }
	    // Everything comes from Vb: rebase the indices.
	    if (SM.MinSrc >= int(HwLen)) {
	      for (int I = 0; I != VecLen; ++I) {
	        int M = SM.Mask[I];
	        if (M != -1)
	          M -= HwLen;
	        NewMask[I] = M;
	      }
	      return Vb;
	    }
	    // The used window straddles Va and Vb: align by MinSrc bytes, using
	    // the immediate form when MinSrc fits in 3 bits.
	    const SDLoc &dl(Results.InpNode);
	    SDValue S = DAG.getTargetConstant(SM.MinSrc, dl, MVT::i32);
	    if (isUInt<3>(SM.MinSrc)) {
	      Results.push(Hexagon::V6_valignbi, Ty, {Vb, Va, S});
	    } else {
	      Results.push(Hexagon::A2_tfrsi, MVT::i32, {S});
	      unsigned Top = Results.top();
	      Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(Top)});
	    }
	    for (int I = 0; I != VecLen; ++I) {
	      int M = SM.Mask[I];
	      if (M != -1)
	        M -= SM.MinSrc;
	      NewMask[I] = M;
	    }
	    return OpRef::res(Results.top());
	  }

	  if (Options & PackMux) {
	    // If elements picked from Va and Vb have all different (source) indexes
	    // (relative to the start of the argument), do a mux, and update the mask.
	    BitVector Picked(HwLen);
	    SmallVector<uint8_t,128> MuxBytes(HwLen);
	    bool CanMux = true;
	    for (int I = 0; I != VecLen; ++I) {
	      int M = SM.Mask[I];
	      if (M == -1) {
	        // Keep undef lanes undef, as the other paths do.
	        NewMask[I] = -1;
	        continue;
	      }
	      const bool FromVa = M < int(HwLen);
	      if (!FromVa)
	        M -= HwLen;
	      if (Picked[M]) {
	        CanMux = false;
	        break;
	      }
	      // Record the byte position as taken, so that a later element
	      // needing the same position is detected as a conflict.
	      Picked.set(M);
	      // A 0xFF control byte selects Va at this position; zero selects Vb.
	      if (FromVa)
	        MuxBytes[M] = 0xFF;
	      NewMask[I] = M;
	    }
	    if (CanMux)
	      return vmuxs(MuxBytes, Va, Vb, Results);
	  }

	  return OpRef::fail();
	}

	// Pair counterpart of packs: Va and Vb are split into four halves of
	// HwLen bytes each (lo(Va), hi(Va), lo(Vb), hi(Vb)). If the mask refers
	// to at most two of them, those halves are concatenated into one pair and
	// NewMask receives the mask rewritten in terms of that pair.
	// Returns an undef pair if the mask has no defined element, and
	// OpRef::fail() if more than two halves are used.
	OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
	ResultStack &Results, MutableArrayRef<int> NewMask) {
	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	// Bit K of HalfMask is set if some mask element refers to half K.
	unsigned HalfMask = 0;
	unsigned LogHw = Log2_32(HwLen);
	for (int M : SM.Mask) {
	if (M == -1)
	continue;
	HalfMask \|= (1u << (M >> LogHw));
	}

	if (HalfMask == 0)
	return OpRef::undef(getPairVT(MVT::i8));

	// If more than two halves are used, bail.
	// TODO: be more aggressive here?
	if (countPopulation(HalfMask) > 2)
	return OpRef::fail();

	MVT HalfTy = getSingleVT(MVT::i8);

	OpRef Inp[2] = { Va, Vb };
	OpRef Out[2] = { OpRef::undef(HalfTy), OpRef::undef(HalfTy) };

	// HalfIdx[K] is the position (0 or 1) of input half K in the output
	// pair, or 0xFF if that half is not used.
	uint8_t HalfIdx[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
	unsigned Idx = 0;
	for (unsigned I = 0; I != 4; ++I) {
	if ((HalfMask & (1u << I)) == 0)
	continue;
	assert(Idx < 2);
	OpRef Op = Inp[I/2];
	Out[Idx] = (I & 1) ? OpRef::hi(Op) : OpRef::lo(Op);
	HalfIdx[I] = Idx++;
	}

	// Rewrite each index: keep the offset within the half, and replace the
	// half number with the position of that half in the output pair.
	int VecLen = SM.Mask.size();
	for (int I = 0; I != VecLen; ++I) {
	int M = SM.Mask[I];
	if (M >= 0) {
	uint8_t Idx = HalfIdx[M >> LogHw];
	assert(Idx == 0 \|\| Idx == 1);
	M = (M & (HwLen-1)) + HwLen*Idx;
	}
	NewMask[I] = M;
	}

	return concat(Out[0], Out[1], Results);
	-}
	-
	-OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) {
	- DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	-
	- int VecLen = SM.Mask.size();
	- SmallVector<uint8_t,128> UsedBytes(VecLen);
	- bool HasUnused = false;
	- for (int I = 0; I != VecLen; ++I) {
	- if (SM.Mask[I] != -1)
	- UsedBytes[I] = 0xFF;
	- else
	- HasUnused = true;
	- }
	- if (!HasUnused)
	- return Va;
	- SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode));
	- Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)});
	- return OpRef::res(Results.top());
	}

	// Appends templates that mux two single vectors under the control of the
	// constant byte vector Bytes: a zero vector (V6_vd0), a byte compare of
	// the constant against it (V6_veqb), and a V6_vmux on the compare result
	// with operands Vb, Va. Returns a reference to the mux.
	OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
	                         ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  const SDLoc &dl(Results.InpNode);
	  const MVT ByteTy = getSingleVT(MVT::i8);
	  const MVT BoolTy = MVT::getVectorVT(MVT::i1, 8*HwLen); // XXX
	  const OpRef Ctl(getVectorConstant(Bytes, dl));

	  // OpRef::res(-1) denotes the template pushed immediately before the
	  // one that uses it.
	  Results.push(Hexagon::V6_vd0, ByteTy, {});
	  Results.push(Hexagon::V6_veqb, BoolTy, {Ctl, OpRef::res(-1)});
	  Results.push(Hexagon::V6_vmux, ByteTy, {OpRef::res(-1), Vb, Va});
	  return OpRef::res(Results.top());
	}

	// Muxes two vector pairs: the low halves are muxed under the first half
	// of Bytes, the high halves under the rest, and the two results are
	// concatenated into a pair.
	OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
	                         ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  const size_t Half = Bytes.size() / 2;
	  const ArrayRef<uint8_t> LoBytes = Bytes.take_front(Half);
	  const ArrayRef<uint8_t> HiBytes = Bytes.drop_front(Half);

	  OpRef LoMux = vmuxs(LoBytes, OpRef::lo(Va), OpRef::lo(Vb), Results);
	  OpRef HiMux = vmuxs(HiBytes, OpRef::hi(Va), OpRef::hi(Vb), Results);
	  return concat(LoMux, HiMux, Results);
	}

	// Shuffle of one single vector into a single vector. The mask must have
	// HwLen elements, each either -1 or below HwLen.
	// An identity mask yields Va itself and an all-undef mask an undef
	// vector; otherwise perfect() is tried first and butterfly() is the
	// fallback (its result, valid or not, is returned).
	OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  unsigned VecLen = SM.Mask.size();
	  assert(HwLen == VecLen);
	  (void)VecLen;
	  assert(all_of(SM.Mask, [this](int M) { return M == -1 || M < int(HwLen); }));

	  if (isIdentity(SM.Mask))
	    return Va;
	  if (isUndef(SM.Mask))
	    return OpRef::undef(getSingleVT(MVT::i8));

	  OpRef P = perfect(SM, Va, Results);
	  if (P.isValid())
	    return P;
	  return butterfly(SM, Va, Results);
	}

	// Shuffle of two single vectors into a single vector.
	// Strategies, in order: contracting(); packing both inputs into one
	// vector and shuffling that; shuffling each input on its own and muxing
	// the two results. Returns OpRef::fail() if one of the per-input shuffles
	// fails.
	OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb,
	                           ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  if (isUndef(SM.Mask))
	    return OpRef::undef(getSingleVT(MVT::i8));

	  OpRef C = contracting(SM, Va, Vb, Results);
	  if (C.isValid())
	    return C;

	  int VecLen = SM.Mask.size();
	  SmallVector<int,128> NewMask(VecLen);
	  OpRef P = packs(SM, Va, Vb, Results, NewMask);
	  if (P.isValid())
	    return shuffs1(ShuffleMask(NewMask), P, Results);

	  // Split the mask into the part that reads Va and the part that reads Vb.
	  SmallVector<int,128> MaskL(VecLen), MaskR(VecLen);
	  splitMask(SM.Mask, MaskL, MaskR);

	  OpRef L = shuffs1(ShuffleMask(MaskL), Va, Results);
	  OpRef R = shuffs1(ShuffleMask(MaskR), Vb, Results);
	  if (!L.isValid() || !R.isValid())
	    return OpRef::fail();

	  // Mux control: 0xFF in the lanes that come from Va, zero elsewhere.
	  SmallVector<uint8_t,128> Bytes(VecLen);
	  for (int I = 0; I != VecLen; ++I) {
	    if (MaskL[I] != -1)
	      Bytes[I] = 0xFF;
	  }
	  return vmuxs(Bytes, L, R, Results);
	}

	// Shuffle of one vector pair into a vector pair.
	// An identity mask yields Va itself and an all-undef mask an undef pair.
	// Otherwise the strategies are, in order: pack the used elements into a
	// single vector and either expand it or shuffle it twice; perfect();
	// shuffle the two halves of Va separately for each half of the result.
	OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  const int VecLen = SM.Mask.size();

	  if (isIdentity(SM.Mask))
	    return Va;
	  if (isUndef(SM.Mask))
	    return OpRef::undef(getPairVT(MVT::i8));

	  // Both halves of the input are needed several times below.
	  const OpRef VaLo = OpRef::lo(Va);
	  const OpRef VaHi = OpRef::hi(Va);

	  SmallVector<int,128> PackedMask(VecLen);
	  OpRef Packed = packs(SM, VaLo, VaHi, Results, PackedMask);
	  if (Packed.isValid()) {
	    ShuffleMask PM(PackedMask);
	    OpRef Expanded = expanding(PM, Packed, Results);
	    if (Expanded.isValid())
	      return Expanded;

	    OpRef LoRes = shuffs1(PM.lo(), Packed, Results);
	    OpRef HiRes = shuffs1(PM.hi(), Packed, Results);
	    if (LoRes.isValid() && HiRes.isValid())
	      return concat(LoRes, HiRes, Results);
	  }

	  OpRef Perfect = perfect(SM, Va, Results);
	  if (Perfect.isValid())
	    return Perfect;
	  // TODO commute the mask and try the opposite order of the halves.

	  OpRef LoRes = shuffs2(SM.lo(), VaLo, VaHi, Results);
	  OpRef HiRes = shuffs2(SM.hi(), VaLo, VaHi, Results);
	  if (!LoRes.isValid() || !HiRes.isValid())
	    return OpRef::fail();
	  return concat(LoRes, HiRes, Results);
	}

	// Shuffle of two vector pairs into a vector pair.
	// If the used halves of the inputs can be packed into one pair, that pair
	// is shuffled. Otherwise each input is shuffled on its own with its part
	// of the mask and the two results are muxed. Returns OpRef::fail() if one
	// of the per-input shuffles fails.
	OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
	                           ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  if (isUndef(SM.Mask))
	    return OpRef::undef(getPairVT(MVT::i8));

	  int VecLen = SM.Mask.size();
	  SmallVector<int,256> PackedMask(VecLen);
	  OpRef P = packp(SM, Va, Vb, Results, PackedMask);
	  if (P.isValid())
	    return shuffp1(ShuffleMask(PackedMask), P, Results);

	  // Split the mask into the part that reads Va and the part that reads
	  // Vb. Without this call both masks would stay zero-initialized: every
	  // lane would read element 0, and the mux below would always pick L.
	  SmallVector<int,256> MaskL(VecLen), MaskR(VecLen);
	  splitMask(SM.Mask, MaskL, MaskR);

	  OpRef L = shuffp1(ShuffleMask(MaskL), Va, Results);
	  OpRef R = shuffp1(ShuffleMask(MaskR), Vb, Results);
	  if (!L.isValid() || !R.isValid())
	    return OpRef::fail();

	  // Mux the results: 0xFF in the lanes that come from Va, zero elsewhere.
	  SmallVector<uint8_t,256> Bytes(VecLen);
	  for (int I = 0; I != VecLen; ++I) {
	    if (MaskL[I] != -1)
	      Bytes[I] = 0xFF;
	  }
	  return vmuxp(Bytes, L, R, Results);
	}

	// Fallback shuffle selection: build the result one byte at a time.
	//
	// For every entry of Mask the selected element is extracted from Va or Vb
	// (undef for negative entries), the extracted scalars are assembled into a
	// build-vector (two of them plus a CONCAT_VECTORS for vector pairs), the
	// result replaces N, and finally all nodes reachable from the new value are
	// selected by hand. Always returns true.
	//
	// Mask  - shuffle mask; indices < Mask.size() refer to Va, the rest to Vb.
	// dl    - debug location used for all created nodes.
	// ResTy - result type; its element type must be i8.
	// N     - the shuffle node being replaced; it must still have uses.
	bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
	MVT ResTy, SDValue Va, SDValue Vb,
	SDNode *N) {
	DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	MVT ElemTy = ResTy.getVectorElementType();
	assert(ElemTy == MVT::i8);
	unsigned VecLen = Mask.size();
	// A mask twice as long as a hardware vector means the operands are pairs.
	bool HavePairs = (2*HwLen == VecLen);
	MVT SingleTy = getSingleVT(MVT::i8);

	// Ops collects one scalar value per result element, in mask order.
	SmallVector<SDValue,128> Ops;
	for (int I : Mask) {
	if (I < 0) {
	Ops.push_back(ISel.selectUndef(dl, ElemTy));
	continue;
	}
	// Pick the source operand and rebase the index into it.
	SDValue Vec;
	unsigned M = I;
	if (M < VecLen) {
	Vec = Va;
	} else {
	Vec = Vb;
	M -= VecLen;
	}
	// For pairs, narrow the source down to the single vector (lo or hi
	// subregister) that holds the element, and rebase the index again.
	if (HavePairs) {
	if (M < HwLen) {
	Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, Vec);
	} else {
	Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, Vec);
	M -= HwLen;
	}
	}
	// Create the element extraction and lower it immediately.
	SDValue Idx = DAG.getConstant(M, dl, MVT::i32);
	SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemTy, {Vec, Idx});
	SDValue L = Lower.LowerOperation(Ex, DAG);
	assert(L.getNode());
	Ops.push_back(L);
	}

	// Assemble the scalars into the result vector LV.
	SDValue LV;
	if (2*HwLen == VecLen) {
	// Pair result: build and lower each half separately from the first and
	// the second HwLen scalars, then concatenate the halves.
	SDValue B0 = DAG.getBuildVector(SingleTy, dl, {Ops.data(), HwLen});
	SDValue L0 = Lower.LowerOperation(B0, DAG);
	SDValue B1 = DAG.getBuildVector(SingleTy, dl, {Ops.data()+HwLen, HwLen});
	SDValue L1 = Lower.LowerOperation(B1, DAG);
	// XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering)
	// functions may expect to be called only for illegal operations, so
	// make sure that they are not called for legal ones. Develop a better
	// mechanism for dealing with this.
	LV = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, {L0, L1});
	} else {
	SDValue BV = DAG.getBuildVector(ResTy, dl, Ops);
	LV = Lower.LowerOperation(BV, DAG);
	}

	assert(!N->use_empty());
	ISel.ReplaceNode(N, LV.getNode());
	DAG.RemoveDeadNodes();

	// Collect LV and, transitively, every operand node reachable from it.
	// The queue grows while it is being scanned, and a node reachable along
	// several paths is entered once per path.
	std::deque<SDNode*> SubNodes;
	SubNodes.push_back(LV.getNode());
	for (unsigned I = 0; I != SubNodes.size(); ++I) {
	for (SDValue Op : SubNodes[I]->ops())
	SubNodes.push_back(Op.getNode());
	}
	// Select the collected nodes. A node whose user is still waiting in the
	// queue is moved to the back so that the user is handled first.
	while (!SubNodes.empty()) {
	SDNode *S = SubNodes.front();
	SubNodes.pop_front();
	// Nodes without uses are skipped.
	if (S->use_empty())
	continue;
	// This isn't great, but users need to be selected before any nodes that
	// they use. (The reason is to match larger patterns, and avoid nodes that
	// cannot be matched on their own, e.g. ValueType, TokenFactor, etc.).
	bool PendingUser = llvm::any_of(S->uses(), [&SubNodes](const SDNode *U) {
	return llvm::any_of(SubNodes, [U](const SDNode *T) {
	return T == U;
	});
	});
	if (PendingUser)
	SubNodes.push_back(S);
	else
	ISel.Select(S);
	}

	DAG.RemoveDeadNodes();
	return true;
	}

	/// Try to implement the two-input shuffle \p SM with one "contracting"
	/// instruction, i.e. one that always discards some bytes of its operands.
	///
	/// The mask is analyzed in terms of "strips" (runs of consecutive indices,
	/// as reported by findStrip). On a match the instruction is pushed on
	/// \p Results with the operands in the order {Vb, Va} and a reference to it
	/// is returned. Returns OpRef::fail() if either input is invalid or if the
	/// mask matches none of the recognized patterns.
	OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
	                               ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  if (!Va.isValid() || !Vb.isValid())
	    return OpRef::fail();

	  // Contracting shuffles, i.e. instructions that always discard some bytes
	  // from the operand vectors.
	  //
	  // V6_vshuff{e,o}b
	  // V6_vdealb4w
	  // V6_vpack{e,o}{b,h}

	  int VecLen = SM.Mask.size();
	  std::pair<int,unsigned> Strip = findStrip(SM.Mask, 1, VecLen);
	  MVT ResTy = getSingleVT(MVT::i8);

	  // The following shuffles only work for bytes and halfwords. This requires
	  // the strip length to be 1 or 2.
	  if (Strip.second != 1 && Strip.second != 2)
	    return OpRef::fail();

	  // The patterns for the shuffles, in terms of the starting offsets of the
	  // consecutive strips (L = length of the strip, N = VecLen):
	  //
	  // vpacke:    0, 2L, 4L ... N+0, N+2L, N+4L ...      L = 1 or 2
	  // vpacko:    L, 3L, 5L ... N+L, N+3L, N+5L ...      L = 1 or 2
	  //
	  // vshuffe:   0, N+0, 2L, N+2L, 4L ...               L = 1 or 2
	  // vshuffo:   L, N+L, 3L, N+3L, 5L ...               L = 1 or 2
	  //
	  // vdealb4w:  0, 4, 8 ... 2, 6, 10 ... N+0, N+4, N+8 ... N+2, N+6, N+10 ...

	  // The value of the element in the mask following the strip will decide
	  // what kind of a shuffle this can be.
	  int NextInMask = SM.Mask[Strip.second];

	  // Check if NextInMask could be 2L, 3L or 4, i.e. if it could be a mask
	  // for vpack or vdealb4w. VecLen > 4, so NextInMask for vdealb4w would
	  // satisfy this.
	  if (NextInMask < VecLen) {
	    // vpack{e,o} or vdealb4w
	    if (Strip.first == 0 && Strip.second == 1 && NextInMask == 4) {
	      int N = VecLen;
	      // Check if this is vdealb4w (L=1). Each quarter of the mask must
	      // hold every fourth byte, starting at 0, 2, N and N+2 respectively.
	      for (int I = 0; I != N/4; ++I)
	        if (SM.Mask[I] != 4*I)
	          return OpRef::fail();
	      for (int I = 0; I != N/4; ++I)
	        if (SM.Mask[I+N/4] != 2 + 4*I)
	          return OpRef::fail();
	      for (int I = 0; I != N/4; ++I)
	        if (SM.Mask[I+N/2] != N + 4*I)
	          return OpRef::fail();
	      for (int I = 0; I != N/4; ++I)
	        if (SM.Mask[I+3*N/4] != N+2 + 4*I)
	          return OpRef::fail();
	      // Matched mask for vdealb4w.
	      Results.push(Hexagon::V6_vdealb4w, ResTy, {Vb, Va});
	      return OpRef::res(Results.top());
	    }

	    // Check if this is vpack{e,o}.
	    int N = VecLen;
	    int L = Strip.second;
	    // Check if the first strip starts at 0 or at L.
	    if (Strip.first != 0 && Strip.first != L)
	      return OpRef::fail();
	    // Examine the rest of the mask.
	    for (int I = L; I < N; I += L) {
	      auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
	      // Check whether the mask element at the beginning of each strip
	      // increases by 2L each time.
	      if (S.first - Strip.first != 2*I)
	        return OpRef::fail();
	      // Check whether each strip is of the same length.
	      if (S.second != unsigned(L))
	        return OpRef::fail();
	    }

	    // Strip.first == 0 => vpacke
	    // Strip.first == L => vpacko
	    assert(Strip.first == 0 || Strip.first == L);
	    using namespace Hexagon;
	    NodeTemplate Res;
	    Res.Opc = Strip.second == 1 // Number of bytes.
	                  ? (Strip.first == 0 ? V6_vpackeb : V6_vpackob)
	                  : (Strip.first == 0 ? V6_vpackeh : V6_vpackoh);
	    Res.Ty = ResTy;
	    Res.Ops = { Vb, Va };
	    Results.push(Res);
	    return OpRef::res(Results.top());
	  }

	  // Check if this is vshuff{e,o}.
	  int N = VecLen;
	  int L = Strip.second;
	  std::pair<int,unsigned> PrevS = Strip;
	  // Consecutive strips alternate between the two inputs: Flip is false when
	  // the next strip is expected N above the previous one, and true when it
	  // is expected to return to the first input, 2L past the strip before.
	  bool Flip = false;
	  for (int I = L; I < N; I += L) {
	    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
	    if (S.second != PrevS.second)
	      return OpRef::fail();
	    int Diff = Flip ? PrevS.first - S.first + 2*L
	                    : S.first - PrevS.first;
	    if (Diff != N)
	      return OpRef::fail();
	    Flip ^= true;
	    PrevS = S;
	  }
	  // Strip.first == 0 => vshuffe
	  // Strip.first == L => vshuffo
	  assert(Strip.first == 0 || Strip.first == L);
	  using namespace Hexagon;
	  NodeTemplate Res;
	  Res.Opc = Strip.second == 1 // Number of bytes.
	                ? (Strip.first == 0 ? V6_vshuffeb : V6_vshuffob)
	                : (Strip.first == 0 ? V6_vshufeh : V6_vshufoh);
	  Res.Ty = ResTy;
	  Res.Ops = { Vb, Va };
	  Results.push(Res);
	  return OpRef::res(Results.top());
	}

	/// Try to implement \p SM as a single "expanding" instruction that spreads
	/// the elements of the single vector \p Va over a vector pair.
	///
	/// \p SM must be a pair-sized mask (asserted). On a match V6_vunpackub or
	/// V6_vunpackuh is pushed on \p Results and a reference to it is returned;
	/// otherwise the result is OpRef::fail().
	OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  // Expanding shuffles (using all elements and inserting into larger vector):
	  //
	  // V6_vunpacku{b,h} [*]
	  //
	  // [*] Only if the upper elements (filled with 0s) are "don't care" in Mask.
	  //
	  // Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so
	  // they are not shuffles.
	  //
	  // The argument is a single vector.

	  int VecLen = SM.Mask.size();
	  assert(2*HwLen == unsigned(VecLen) && "Expecting vector-pair type");

	  std::pair<int,unsigned> Strip = findStrip(SM.Mask, 1, VecLen);

	  // The patterns for the unpacks, in terms of the starting offsets of the
	  // consecutive strips (L = length of the strip, N = VecLen):
	  //
	  // vunpacku: 0, -1, L, -1, 2L, -1 ...

	  if (Strip.first != 0)
	    return OpRef::fail();

	  // The vunpackus only handle byte and half-word.
	  if (Strip.second != 1 && Strip.second != 2)
	    return OpRef::fail();

	  int N = VecLen;
	  int L = Strip.second;

	  // The mask has exactly N elements, so both scans below stop at N:
	  // SM.Mask.drop_front(I) and the remaining length N-I are meaningful only
	  // for I < N.

	  // First, check the non-ignored strips: the strip at mask position I must
	  // have length L and start with the value I/2.
	  for (int I = 2*L; I < N; I += 2*L) {
	    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
	    if (S.second != unsigned(L))
	      return OpRef::fail();
	    if (2*S.first != I)
	      return OpRef::fail();
	  }
	  // Check the -1s: every other strip of length L must be all "don't care".
	  for (int I = L; I < N; I += 2*L) {
	    auto S = findStrip(SM.Mask.drop_front(I), 0, N-I);
	    if (S.first != -1 || S.second != unsigned(L))
	      return OpRef::fail();
	  }

	  unsigned Opc = Strip.second == 1 ? Hexagon::V6_vunpackub
	                                   : Hexagon::V6_vunpackuh;
	  Results.push(Opc, getPairVT(MVT::i8), {Va});
	  return OpRef::res(Results.top());
	}

	/// Try to implement the permutation \p SM of \p Va with "perfect shuffle"
	/// instructions: a single V6_vdeal{b,h} / V6_vshuff{b,h}, or a sequence of
	/// V6_vshuffvdd / V6_vdealvdd operations.
	///
	/// The mask length must be that of a single vector or of a vector pair
	/// (asserted). A single-vector input is first concatenated with an undef
	/// vector and the low half of the final pair is returned. Returns
	/// OpRef::fail() when the mask is not a permutation, or when it cannot be
	/// described as a permutation of the bit columns of the element indices.
	OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  // V6_vdeal{b,h}
	  // V6_vshuff{b,h}

	  // V6_vshufoe{b,h} those are equivalent to vshuffvdd(..,{1,2})
	  // V6_vshuffvdd (V6_vshuff)
	  // V6_vdealvdd (V6_vdeal)

	  int VecLen = SM.Mask.size();
	  assert(isPowerOf2_32(VecLen) && Log2_32(VecLen) <= 8);
	  unsigned LogLen = Log2_32(VecLen);
	  unsigned HwLog = Log2_32(HwLen);
	  // The result length must be the same as the length of a single vector,
	  // or a vector pair.
	  assert(LogLen == HwLog || LogLen == HwLog+1);
	  bool Extend = (LogLen == HwLog);

	  if (!isPermutation(SM.Mask))
	    return OpRef::fail();

	  SmallVector<unsigned,8> Perm(LogLen);

	  // Check if this could be a perfect shuffle, or a combination of perfect
	  // shuffles.
	  //
	  // Consider this permutation (using hex digits to make the ASCII diagrams
	  // easier to read):
	  //   { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }.
	  // This is a "deal" operation: divide the input into two halves, and
	  // create the output by picking elements by alternating between these two
	  // halves:
	  //   0 1 2 3 4 5 6 7 --> 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F [*]
	  //   8 9 A B C D E F
	  //
	  // Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides
	  // a somewhat different mechanism that could be used to perform shuffle/
	  // deal operations: a 2x2 transpose.
	  // Consider the halves of inputs again, they can be interpreted as a 2x8
	  // matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated
	  // together. Now, when considering 2 elements at a time, it will be a 2x4
	  // matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices:
	  //   01 23 45 67
	  //   89 AB CD EF
	  // With groups of 4, this will become a single 2x2 matrix, and so on.
	  //
	  // The 2x2 transpose instruction works by transposing each of the 2x2
	  // matrices (or "sub-matrices"), given a specific group size. For example,
	  // if the group size is 1 (i.e. each element is its own group), there
	  // will be four transposes of the four 2x2 matrices that form the 2x8.
	  // For example, with the inputs as above, the result will be:
	  //   0 8 2 A 4 C 6 E
	  //   1 9 3 B 5 D 7 F
	  // Now, this result can be transposed again, but with the group size of 2:
	  //   08 19 4C 5D
	  //   2A 3B 6E 7F
	  // If we then transpose that result, but with the group size of 4, we get:
	  //   0819 2A3B
	  //   4C5D 6E7F
	  // If we concatenate these two rows, it will be
	  //   0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F
	  // which is the same as the "deal" [*] above.
	  //
	  // In general, a "deal" of individual elements is a series of 2x2 transposes,
	  // with changing group size. HVX has two instructions:
	  //   Vdd = V6_vdealvdd Vu, Vv, Rt
	  //   Vdd = V6_vshuffvdd Vu, Vv, Rt
	  // that perform exactly that. The register Rt controls which transposes are
	  // going to happen: a bit at position n (counting from 0) indicates that a
	  // transpose with a group size of 2^n will take place. If multiple bits are
	  // set, multiple transposes will happen: vdealvdd will perform them starting
	  // with the largest group size, vshuffvdd will do them in the reverse order.
	  //
	  // The main observation is that each 2x2 transpose corresponds to swapping
	  // columns of bits in the binary representation of the values.
	  //
	  // The numbers {3,2,1,0} are the log2 of the number of contiguous 1 bits
	  // in a given column. The * denote the columns that will be swapped.
	  // The transpose with the group size 2^n corresponds to swapping columns
	  // 3 (the highest log) and n:
	  //
	  //     3 2 1 0         0 2 1 3         0 2 3 1
	  //     *     *             * *           * *
	  //  0  0 0 0 0      0  0 0 0 0      0  0 0 0 0      0  0 0 0 0
	  //  1  0 0 0 1      8  1 0 0 0      8  1 0 0 0      8  1 0 0 0
	  //  2  0 0 1 0      2  0 0 1 0      1  0 0 0 1      1  0 0 0 1
	  //  3  0 0 1 1      A  1 0 1 0      9  1 0 0 1      9  1 0 0 1
	  //  4  0 1 0 0      4  0 1 0 0      4  0 1 0 0      2  0 0 1 0
	  //  5  0 1 0 1      C  1 1 0 0      C  1 1 0 0      A  1 0 1 0
	  //  6  0 1 1 0      6  0 1 1 0      5  0 1 0 1      3  0 0 1 1
	  //  7  0 1 1 1      E  1 1 1 0      D  1 1 0 1      B  1 0 1 1
	  //  8  1 0 0 0      1  0 0 0 1      2  0 0 1 0      4  0 1 0 0
	  //  9  1 0 0 1      9  1 0 0 1      A  1 0 1 0      C  1 1 0 0
	  //  A  1 0 1 0      3  0 0 1 1      3  0 0 1 1      5  0 1 0 1
	  //  B  1 0 1 1      B  1 0 1 1      B  1 0 1 1      D  1 1 0 1
	  //  C  1 1 0 0      5  0 1 0 1      6  0 1 1 0      6  0 1 1 0
	  //  D  1 1 0 1      D  1 1 0 1      E  1 1 1 0      E  1 1 1 0
	  //  E  1 1 1 0      7  0 1 1 1      7  0 1 1 1      7  0 1 1 1
	  //  F  1 1 1 1      F  1 1 1 1      F  1 1 1 1      F  1 1 1 1

	  // For the first Num elements of Mask: if every element of the first half
	  // differs from its counterpart in the second half by the same XOR value,
	  // and none of the first-half elements has any of those bits set, return
	  // that XOR value. Otherwise return 0.
	  auto XorPow2 = [] (ArrayRef<int> Mask, unsigned Num) {
	    unsigned X = Mask[0] ^ Mask[Num/2];
	    // Check that the first half has the X's bits clear.
	    if ((Mask[0] & X) != 0)
	      return 0u;
	    for (unsigned I = 1; I != Num/2; ++I) {
	      if (unsigned(Mask[I] ^ Mask[I+Num/2]) != X)
	        return 0u;
	      if ((Mask[I] & X) != 0)
	        return 0u;
	    }
	    return X;
	  };

	  // Create a vector of log2's for each column: Perm[i] corresponds to
	  // the i-th bit (lsb is 0).
	  assert(VecLen > 2);
	  for (unsigned I = VecLen; I >= 2; I >>= 1) {
	    // Examine the initial segment of Mask of size I.
	    unsigned X = XorPow2(SM.Mask, I);
	    if (!isPowerOf2_32(X))
	      return OpRef::fail();
	    // Check the other segments of Mask.
	    for (int J = I; J < VecLen; J += I) {
	      if (XorPow2(SM.Mask.slice(J, I), I) != X)
	        return OpRef::fail();
	    }
	    Perm[Log2_32(X)] = Log2_32(I)-1;
	  }

	  // Once we have Perm, represent it as cycles. Denote the maximum log2
	  // (equal to log2(VecLen)-1) as M. The cycle containing M can then be
	  // written as (M a1 a2 a3 ... an). That cycle can be broken up into
	  // simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition
	  // order being from left to right. Any (contiguous) segment where the
	  // values ai, ai+1...aj are either all increasing or all decreasing,
	  // can be implemented via a single vshuffvdd/vdealvdd respectively.
	  //
	  // If there is a cycle (a1 a2 ... an) that does not involve M, it can
	  // be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can
	  // then be folded to get (M a1 a2 ... an)(M a1), and the above procedure
	  // can be used to generate a sequence of vshuffvdd/vdealvdd.
	  //
	  // Example:
	  // Assume M = 4 and consider a permutation (0 1)(2 3). It can be written
	  // as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply
	  //   (4 0 1)(4 0)(4 2 3)(4 2).
	  // It can then be expanded into swaps as
	  //   (4 0)(4 1)(4 0)(4 2)(4 3)(4 2),
	  // and broken up into "increasing" segments as
	  //   [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)].
	  // This is equivalent to
	  //   (4 0 1)(4 0 2 3)(4 2),
	  // which can be implemented as 3 vshuffvdd instructions.

	  using CycleType = SmallVector<unsigned,8>;
	  std::set<CycleType> Cycles;
	  std::set<unsigned> All;

	  for (unsigned I : Perm)
	    All.insert(I);

	  // If the cycle contains LogLen-1, move it to the front of the cycle.
	  // Otherwise, return the cycle unchanged.
	  auto canonicalize = [LogLen](const CycleType &C) -> CycleType {
	    unsigned LogPos, N = C.size();
	    for (LogPos = 0; LogPos != N; ++LogPos)
	      if (C[LogPos] == LogLen-1)
	        break;
	    if (LogPos == N)
	      return C;

	    CycleType NewC(C.begin()+LogPos, C.end());
	    NewC.append(C.begin(), C.begin()+LogPos);
	    return NewC;
	  };

	  // Return the opcode of the single V6_vdeal{b,h} / V6_vshuff{b,h} that
	  // implements the cycle set Cs for index width Len, or 0 if there is none.
	  auto pfs = [](const std::set<CycleType> &Cs, unsigned Len) {
	    // Ordering: shuff: 5 0 1 2 3 4, deal: 5 4 3 2 1 0 (for Log=6),
	    // for bytes zero is included, for halfwords is not.
	    if (Cs.size() != 1)
	      return 0u;
	    const CycleType &C = *Cs.begin();
	    if (C[0] != Len-1)
	      return 0u;
	    int D = Len - C.size();
	    if (D != 0 && D != 1)
	      return 0u;

	    bool IsDeal = true, IsShuff = true;
	    for (unsigned I = 1; I != Len-D; ++I) {
	      if (C[I] != Len-1-I)
	        IsDeal = false;
	      if (C[I] != I-(1-D)) // I-1, I
	        IsShuff = false;
	    }
	    // At most one, IsDeal or IsShuff, can be non-zero.
	    assert(!(IsDeal || IsShuff) || IsDeal != IsShuff);
	    static unsigned Deals[] = { Hexagon::V6_vdealb, Hexagon::V6_vdealh };
	    static unsigned Shufs[] = { Hexagon::V6_vshuffb, Hexagon::V6_vshuffh };
	    return IsDeal ? Deals[D] : (IsShuff ? Shufs[D] : 0);
	  };

	  // Decompose Perm into cycles; cycles of length 1 (fixed points) are
	  // dropped.
	  while (!All.empty()) {
	    unsigned A = *All.begin();
	    All.erase(A);
	    CycleType C;
	    C.push_back(A);
	    for (unsigned B = Perm[A]; B != A; B = Perm[B]) {
	      C.push_back(B);
	      All.erase(B);
	    }
	    if (C.size() <= 1)
	      continue;
	    Cycles.insert(canonicalize(C));
	  }

	  MVT SingleTy = getSingleVT(MVT::i8);
	  MVT PairTy = getPairVT(MVT::i8);

	  // Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}.
	  if (unsigned(VecLen) == HwLen) {
	    if (unsigned SingleOpc = pfs(Cycles, LogLen)) {
	      Results.push(SingleOpc, SingleTy, {Va});
	      return OpRef::res(Results.top());
	    }
	  }

	  // Flatten the cycles into the sequence of columns to swap with the top
	  // column. A cycle that does not start with LogLen-1 gets its first
	  // element repeated at the end (see the cycle discussion above).
	  SmallVector<unsigned,8> SwapElems;
	  if (HwLen == unsigned(VecLen))
	    SwapElems.push_back(LogLen-1);

	  for (const CycleType &C : Cycles) {
	    unsigned First = (C[0] == LogLen-1) ? 1 : 0;
	    SwapElems.append(C.begin()+First, C.end());
	    if (First == 0)
	      SwapElems.push_back(C[0]);
	  }

	  const SDLoc &dl(Results.InpNode);
	  OpRef Arg = !Extend ? Va
	                      : concat(Va, OpRef::undef(SingleTy), Results);

	  // Emit one vshuffvdd (increasing run) or vdealvdd (decreasing run) per
	  // maximal monotonic segment of SwapElems. S accumulates the bit mask of
	  // the columns covered by the current segment.
	  for (unsigned I = 0, E = SwapElems.size(); I != E; ) {
	    bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1];
	    unsigned S = (1u << SwapElems[I]);
	    if (I < E-1) {
	      while (++I < E-1 && IsInc == (SwapElems[I] < SwapElems[I+1]))
	        S |= 1u << SwapElems[I];
	      // The above loop will not add a bit for the final SwapElems[I+1],
	      // so add it here.
	      S |= 1u << SwapElems[I];
	    }
	    ++I;

	    NodeTemplate Res;
	    // Materialize the control word; the shuffle node pushed right after
	    // it refers to it as OpRef::res(-1).
	    Results.push(Hexagon::A2_tfrsi, MVT::i32,
	                 { DAG.getTargetConstant(S, dl, MVT::i32) });
	    Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
	    Res.Ty = PairTy;
	    Res.Ops = { OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1) };
	    Results.push(Res);
	    Arg = OpRef::res(Results.top());
	  }

	  return !Extend ? Arg : OpRef::lo(Arg);
	}

	/// Implement the single-input shuffle \p SM of \p Va with permutation
	/// network instructions (V6_vdelta and/or V6_vrdelta).
	///
	/// Every defined mask element must select from the first operand; if one
	/// does not, or if none of the three networks can route the mask, the
	/// result is OpRef::fail().
	OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) {
	  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
	  // Butterfly shuffles.
	  //
	  // V6_vdelta
	  // V6_vrdelta
	  // V6_vror

	  // All elements picked by the mask are expected to come from the first
	  // operand of the vector_shuffle. The caller enforces this; it is
	  // re-checked here and any violation rejects the mask.
	  const int NumElems = SM.Mask.size();
	  if (llvm::any_of(SM.Mask, [NumElems](int Elt) {
	        return Elt != -1 && Elt >= NumElems;
	      }))
	    return OpRef::fail();

	  MVT VecTy = getSingleVT(MVT::i8);
	  const SDLoc &Loc(Results.InpNode);
	  // Control vectors for the forward and the reverse network. The same two
	  // objects are handed to each attempt in turn.
	  PermNetwork::Controls FwdCtl, RevCtl;

	  // Try the deltas/benes for both single vectors and vector pairs.

	  // 1. Forward delta network alone.
	  ForwardDeltaNetwork Fwd(SM.Mask);
	  if (Fwd.run(FwdCtl)) {
	    SDValue CtlVec = getVectorConstant(FwdCtl, Loc);
	    Results.push(Hexagon::V6_vdelta, VecTy, {Va, OpRef(CtlVec)});
	    return OpRef::res(Results.top());
	  }

	  // 2. Reverse delta network alone.
	  ReverseDeltaNetwork Rev(SM.Mask);
	  if (Rev.run(RevCtl)) {
	    SDValue CtlVec = getVectorConstant(RevCtl, Loc);
	    Results.push(Hexagon::V6_vrdelta, VecTy, {Va, OpRef(CtlVec)});
	    return OpRef::res(Results.top());
	  }

	  // 3. Benes network: a vdelta whose result feeds a vrdelta.
	  BenesNetwork Benes(SM.Mask);
	  if (!Benes.run(FwdCtl, RevCtl))
	    return OpRef::fail();

	  SDValue FwdVec = getVectorConstant(FwdCtl, Loc);
	  SDValue RevVec = getVectorConstant(RevCtl, Loc);
	  Results.push(Hexagon::V6_vdelta, VecTy, {Va, OpRef(FwdVec)});
	  Results.push(Hexagon::V6_vrdelta, VecTy,
	               {OpRef::res(-1), OpRef(RevVec)});
	  return OpRef::res(Results.top());
	}

	/// Create a vector of i8 constants holding the bytes of \p Data (one element
	/// per byte), lower it via Lower.LowerOperation and return the lowered
	/// value. The temporary build-vector node is removed from the DAG afterwards.
	SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
	                                       const SDLoc &dl) {
	  const unsigned NumBytes = Data.size();

	  // One i8 constant node per input byte, in order.
	  SmallVector<SDValue, 128> ByteNodes;
	  for (unsigned Pos = 0; Pos != NumBytes; ++Pos)
	    ByteNodes.push_back(DAG.getConstant(Data[Pos], dl, MVT::i8));

	  SDValue BuildVec =
	      DAG.getBuildVector(MVT::getVectorVT(MVT::i8, NumBytes), dl, ByteNodes);
	  SDValue Lowered = Lower.LowerOperation(BuildVec, DAG);
	  DAG.RemoveDeadNode(BuildVec.getNode());
	  return Lowered;
	}

	/// Select the HVX byte-vector shuffle node \p N.
	///
	/// An all-undef mask is replaced with an undef value. Otherwise the shuffle
	/// is generated with shuffs2 (single vectors) or shuffp2 (vector pairs) and
	/// materialized from the result stack; if that fails, the shuffle is
	/// scalarized.
	void HvxSelector::selectShuffle(SDNode *N) {
	  DEBUG_WITH_TYPE("isel", {
	    dbgs() << "Starting " << __func__ << " on node:\n";
	    N->dump(&DAG);
	  });
	  MVT ResTy = N->getValueType(0).getSimpleVT();
	  // Assume that vector shuffles operate on vectors of bytes.
	  assert(ResTy.isVector() && ResTy.getVectorElementType() == MVT::i8);

	  auto *SN = cast<ShuffleVectorSDNode>(N);
	  ArrayRef<int> OrigMask = SN->getMask();
	  std::vector<int> Mask(OrigMask.begin(), OrigMask.end());
	  // Normalize every negative index to -1.
	  // This shouldn't really be necessary. Is it?
	  for (int &Elt : Mask)
	    if (Elt < 0)
	      Elt = -1;

	  unsigned VecLen = Mask.size();
	  bool HavePairs = (VecLen == 2*HwLen);
	  assert(ResTy.getSizeInBits() / 8 == VecLen);

	  // Vd = vector_shuffle Va, Vb, Mask
	  //

	  // Find out which of the two inputs the mask actually refers to.
	  bool UseLeft = false, UseRight = false;
	  for (int Elt : Mask) {
	    if (Elt == -1)
	      continue;
	    unsigned Idx = Elt;
	    assert(Idx < 2*VecLen);
	    (Idx < VecLen ? UseLeft : UseRight) = true;
	  }

	  DEBUG_WITH_TYPE("isel", {
	    dbgs() << "VecLen=" << VecLen << " HwLen=" << HwLen << " UseLeft="
	           << UseLeft << " UseRight=" << UseRight << " HavePairs="
	           << HavePairs << '\n';
	  });
	  // If the mask is all -1's, generate "undef".
	  if (!(UseLeft || UseRight)) {
	    ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
	    DAG.RemoveDeadNode(N);
	    return;
	  }

	  // Seed the result stack with copies of both inputs; Va and Vb refer to
	  // those two entries.
	  SDValue Vec0 = N->getOperand(0);
	  SDValue Vec1 = N->getOperand(1);
	  ResultStack Results(SN);
	  Results.push(TargetOpcode::COPY, ResTy, {Vec0});
	  Results.push(TargetOpcode::COPY, ResTy, {Vec1});
	  OpRef Va = OpRef::res(Results.top()-1);
	  OpRef Vb = OpRef::res(Results.top());

	  OpRef Res = HavePairs ? shuffp2(ShuffleMask(Mask), Va, Vb, Results)
	                        : shuffs2(ShuffleMask(Mask), Va, Vb, Results);

	  bool Done = true;
	  if (Res.isValid()) {
	    // Make sure that Res is on the stack before materializing.
	    Results.push(TargetOpcode::COPY, ResTy, {Res});
	    materialize(Results);
	  } else {
	    // No instruction sequence was found: go element by element.
	    Done = scalarizeShuffle(Mask, SDLoc(N), ResTy, Vec0, Vec1, N);
	  }

	  if (Done)
	    return;

	#ifndef NDEBUG
	  dbgs() << "Unhandled shuffle:\n";
	  SN->dumpr(&DAG);
	#endif
	  llvm_unreachable("Failed to select vector shuffle");
	}

	/// Select the vector rotate node \p N (operand 0: vector, operand 1: rotate
	/// amount).
	///
	/// For a constant amount that is a multiple of the vector length the input
	/// vector itself is used; for any other constant that fits in 3 bits
	/// V6_valignbi is used. Everything else becomes V6_vror.
	void HvxSelector::selectRor(SDNode *N) {
	  MVT VecTy = N->getValueType(0).getSimpleVT();
	  const SDLoc &Loc(N);
	  SDValue Src = N->getOperand(0);
	  SDValue Amount = N->getOperand(1);

	  SDNode *Replacement = nullptr;
	  auto *ConstAmt = dyn_cast<ConstantSDNode>(Amount.getNode());
	  if (ConstAmt) {
	    unsigned Rot = ConstAmt->getZExtValue();
	    if (Rot % HST.getVectorLength() == 0) {
	      // Rotating by a whole number of vector lengths changes nothing.
	      Replacement = Src.getNode();
	    } else if (isUInt<3>(Rot)) {
	      // If this is a rotation by less than 8, use V6_valignbi.
	      SDValue Imm = DAG.getTargetConstant(Rot, Loc, MVT::i32);
	      Replacement = DAG.getMachineNode(Hexagon::V6_valignbi, Loc, VecTy,
	                                       {Src, Src, Imm});
	    }
	  }

	  // General case: rotate amount taken from the second operand.
	  if (Replacement == nullptr)
	    Replacement = DAG.getMachineNode(Hexagon::V6_vror, Loc, VecTy,
	                                     {Src, Amount});

	  ISel.ReplaceNode(N, Replacement);
	  DAG.RemoveDeadNode(N);
	}

	/// Entry point for selecting an HVX vector shuffle: forwards \p N to a
	/// temporary HvxSelector.
	void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) {
	  // HvxSelector accesses its selector and DAG with '.' (ISel.ReplaceNode,
	  // DAG.getNode), so it is handed the objects themselves rather than the
	  // `this` and CurDAG pointers.
	  HvxSelector(*this, *CurDAG).selectShuffle(N);
	}

	/// Entry point for selecting an HVX vector rotate: forwards \p N to a
	/// temporary HvxSelector.
	void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
	  // HvxSelector accesses its selector and DAG with '.' (ISel.ReplaceNode,
	  // DAG.getNode), so it is handed the objects themselves rather than the
	  // `this` and CurDAG pointers.
	  HvxSelector(*this, *CurDAG).selectRor(N);
	}

	/// Select a predicated HVX gather intrinsic node \p N into the matching
	/// *_pseudo machine node.
	///
	/// Operand layout of N as used here: 0 = chain, 1 = intrinsic id,
	/// 2 = address, 3 = predicate, 4 = base, 5 = modifier, 6 = offset.
	/// The new node produces only a chain (MVT::Other) and inherits the memory
	/// operand of N.
	void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
	  // Map the intrinsic (64-byte and 128-byte variants alike) to its pseudo.
	  unsigned PseudoOpc;
	  switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	  case Intrinsic::hexagon_V6_vgathermhq:
	  case Intrinsic::hexagon_V6_vgathermhq_128B:
	    PseudoOpc = Hexagon::V6_vgathermhq_pseudo;
	    break;
	  case Intrinsic::hexagon_V6_vgathermwq:
	  case Intrinsic::hexagon_V6_vgathermwq_128B:
	    PseudoOpc = Hexagon::V6_vgathermwq_pseudo;
	    break;
	  case Intrinsic::hexagon_V6_vgathermhwq:
	  case Intrinsic::hexagon_V6_vgathermhwq_128B:
	    PseudoOpc = Hexagon::V6_vgathermhwq_pseudo;
	    break;
	  default:
	    llvm_unreachable("Unexpected HVX gather intrinsic.");
	  }

	  // The machine node takes the chain as its last operand.
	  SDValue NodeOps[] = {
	    N->getOperand(2),   // Address
	    N->getOperand(3),   // Predicate
	    N->getOperand(4),   // Base
	    N->getOperand(5),   // Modifier
	    N->getOperand(6),   // Offset
	    N->getOperand(0)    // Chain
	  };
	  const SDLoc &Loc(N);
	  SDNode *Gather = CurDAG->getMachineNode(
	      PseudoOpc, Loc, CurDAG->getVTList(MVT::Other), NodeOps);

	  // Carry over the memory operand of the intrinsic node.
	  MachineSDNode::mmo_iterator MemRef = MF->allocateMemRefsArray(1);
	  MemRef[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
	  cast<MachineSDNode>(Gather)->setMemRefs(MemRef, MemRef + 1);

	  ReplaceUses(N, Gather);
	  CurDAG->RemoveDeadNode(N);
	}

	/// Select an (unpredicated) HVX gather intrinsic node \p N into the matching
	/// *_pseudo machine node.
	///
	/// Operand layout of N as used here: 0 = chain, 1 = intrinsic id,
	/// 2 = address, 3 = base, 4 = modifier, 5 = offset.
	/// The new node produces only a chain (MVT::Other) and inherits the memory
	/// operand of N.
	void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
	  // Map the intrinsic (64-byte and 128-byte variants alike) to its pseudo.
	  unsigned PseudoOpc;
	  switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	  case Intrinsic::hexagon_V6_vgathermh:
	  case Intrinsic::hexagon_V6_vgathermh_128B:
	    PseudoOpc = Hexagon::V6_vgathermh_pseudo;
	    break;
	  case Intrinsic::hexagon_V6_vgathermw:
	  case Intrinsic::hexagon_V6_vgathermw_128B:
	    PseudoOpc = Hexagon::V6_vgathermw_pseudo;
	    break;
	  case Intrinsic::hexagon_V6_vgathermhw:
	  case Intrinsic::hexagon_V6_vgathermhw_128B:
	    PseudoOpc = Hexagon::V6_vgathermhw_pseudo;
	    break;
	  default:
	    llvm_unreachable("Unexpected HVX gather intrinsic.");
	  }

	  // The machine node takes the chain as its last operand.
	  SDValue NodeOps[] = {
	    N->getOperand(2),   // Address
	    N->getOperand(3),   // Base
	    N->getOperand(4),   // Modifier
	    N->getOperand(5),   // Offset
	    N->getOperand(0)    // Chain
	  };
	  const SDLoc &Loc(N);
	  SDNode *Gather = CurDAG->getMachineNode(
	      PseudoOpc, Loc, CurDAG->getVTList(MVT::Other), NodeOps);

	  // Carry over the memory operand of the intrinsic node.
	  MachineSDNode::mmo_iterator MemRef = MF->allocateMemRefsArray(1);
	  MemRef[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
	  cast<MachineSDNode>(Gather)->setMemRefs(MemRef, MemRef + 1);

	  ReplaceUses(N, Gather);
	  CurDAG->RemoveDeadNode(N);
	}

	/// Select an HVX intrinsic with two results (vaddcarry / vsubcarry and their
	/// 128-byte variants) into the corresponding machine node.
	///
	/// Operand 0 of \p N is the intrinsic id and operands 1..3 are forwarded
	/// unchanged. The 64-byte forms produce (v16i32, v512i1), the 128-byte
	/// forms (v32i32, v1024i1). Both byte-length variants map to the same
	/// opcode.
	void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
	  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();

	  // Pick the opcode and the two result types for this intrinsic.
	  unsigned Opc;
	  MVT DataTy, CarryTy;
	  switch (IID) {
	  case Intrinsic::hexagon_V6_vaddcarry:
	    Opc = Hexagon::V6_vaddcarry;
	    DataTy = MVT::v16i32;
	    CarryTy = MVT::v512i1;
	    break;
	  case Intrinsic::hexagon_V6_vaddcarry_128B:
	    Opc = Hexagon::V6_vaddcarry;
	    DataTy = MVT::v32i32;
	    CarryTy = MVT::v1024i1;
	    break;
	  case Intrinsic::hexagon_V6_vsubcarry:
	    Opc = Hexagon::V6_vsubcarry;
	    DataTy = MVT::v16i32;
	    CarryTy = MVT::v512i1;
	    break;
	  case Intrinsic::hexagon_V6_vsubcarry_128B:
	    Opc = Hexagon::V6_vsubcarry;
	    DataTy = MVT::v32i32;
	    CarryTy = MVT::v1024i1;
	    break;
	  default:
	    llvm_unreachable("Unexpected HVX dual output intrinsic.");
	  }

	  SmallVector<SDValue, 3> Ops = { N->getOperand(1), N->getOperand(2),
	                                  N->getOperand(3) };
	  SDVTList VTs = CurDAG->getVTList(DataTy, CarryTy);
	  SDNode *Result = CurDAG->getMachineNode(Opc, SDLoc(N), VTs, Ops);

	  // Redirect all users of N, then each of its two values, to the new node.
	  ReplaceUses(N, Result);
	  for (unsigned ResNo = 0; ResNo != 2; ++ResNo)
	    ReplaceUses(SDValue(N, ResNo), SDValue(Result, ResNo));
	  CurDAG->RemoveDeadNode(N);
	}


	Index: vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCISelLowering.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCISelLowering.cpp (revision 328362)
	@@ -1,13878 +1,13921 @@
	//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the PPCISelLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "PPCISelLowering.h"
	#include "MCTargetDesc/PPCPredicates.h"
	#include "PPC.h"
	#include "PPCCCState.h"
	#include "PPCCallingConv.h"
	#include "PPCFrameLowering.h"
	#include "PPCInstrInfo.h"
	#include "PPCMachineFunctionInfo.h"
	#include "PPCPerfectShuffle.h"
	#include "PPCRegisterInfo.h"
	#include "PPCSubtarget.h"
	#include "PPCTargetMachine.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/Format.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <list>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "ppc-lowering"

	// File-local boolean command-line options. Each one switches off a piece of
	// PPC lowering behavior, as described by its cl::desc string.

	// -disable-ppc-preinc: no preincrement load/store generation.
	static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
	cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

	// -disable-ppc-ilp-pref: do not set the node scheduling preference to ILP.
	static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
	cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

	// -disable-ppc-unaligned: no unaligned load/store generation.
	static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
	cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

	// -disable-ppc-sco: no sibling call optimization.
	static cl::opt<bool> DisableSCO("disable-ppc-sco",
	cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

	// Statistics counters, reported under the descriptions given here.
	STATISTIC(NumTailCalls, "Number of tail calls");
	STATISTIC(NumSiblingCalls, "Number of sibling calls");

	// Forward declaration of a file-local (static) helper; its definition is
	// not part of this excerpt.
	static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);

	// FIXME: Remove this once the bug has been fixed!
	// Option object defined in another translation unit (hence `extern`).
	extern cl::opt<bool> ANDIGlueBug;

	PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
	const PPCSubtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	// Use _setjmp/_longjmp instead of setjmp/longjmp.
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(true);

	// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
	// arguments are at least 4/8 bytes aligned.
	bool isPPC64 = Subtarget.isPPC64();
	setMinStackArgumentAlignment(isPPC64 ? 8:4);

	// Set up the register classes.
	addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
	if (!useSoftFloat()) {
	addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
	addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
	}

	// Match BITREVERSE to customized fast code sequence in the td file.
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

	+ // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
	+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
	+
	// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
	for (MVT VT : MVT::integer_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
	}

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// PowerPC has pre-inc loads and stores.
	setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
	setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
	setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
	setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
	setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
	setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
	setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);

	if (Subtarget.useCRBits()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

	if (isPPC64 \|\| Subtarget.hasFPCVT()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
	AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
	isPPC64 ? MVT::i64 : MVT::i32);
	setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
	AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
	isPPC64 ? MVT::i64 : MVT::i32);
	} else {
	setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
	}

	// PowerPC does not support direct load/store of condition registers.
	setOperationAction(ISD::LOAD, MVT::i1, Custom);
	setOperationAction(ISD::STORE, MVT::i1, Custom);

	// FIXME: Remove this once the ANDI glue bug is fixed:
	if (ANDIGlueBug)
	setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

	for (MVT VT : MVT::integer_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
	setTruncStoreAction(VT, MVT::i1, Expand);
	}

	addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
	}

	// This is used in the ppcf128->int sequence. Note it has different semantics
	// from FP_ROUND: that rounds to nearest, this rounds to zero.
	setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);

	// We do not currently implement these libm ops for PowerPC.
	setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
	setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
	setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
	setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
	setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

	// PowerPC has no SREM/UREM instructions unless we are on P9
	// On P9 we may use a hardware instruction to compute the remainder.
	// The instructions are not legalized directly because in the cases where the
	// result of both the remainder and the division is required it is more
	// efficient to compute the remainder from the result of the division rather
	// than use the remainder instruction.
	if (Subtarget.isISA3_0()) {
	setOperationAction(ISD::SREM, MVT::i32, Custom);
	setOperationAction(ISD::UREM, MVT::i32, Custom);
	setOperationAction(ISD::SREM, MVT::i64, Custom);
	setOperationAction(ISD::UREM, MVT::i64, Custom);
	} else {
	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::UREM, MVT::i32, Expand);
	setOperationAction(ISD::SREM, MVT::i64, Expand);
	setOperationAction(ISD::UREM, MVT::i64, Expand);
	}

	if (Subtarget.hasP9Vector()) {
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	}

	// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
	setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
	setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

	// We don't support sin/cos/sqrt/fmod/pow
	setOperationAction(ISD::FSIN , MVT::f64, Expand);
	setOperationAction(ISD::FCOS , MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FREM , MVT::f64, Expand);
	setOperationAction(ISD::FPOW , MVT::f64, Expand);
	setOperationAction(ISD::FMA , MVT::f64, Legal);
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	setOperationAction(ISD::FREM , MVT::f32, Expand);
	setOperationAction(ISD::FPOW , MVT::f32, Expand);
	setOperationAction(ISD::FMA , MVT::f32, Legal);

	setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

	// If we're enabling GP optimizations, use hardware square root
	if (!Subtarget.hasFSQRT() &&
	!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
	Subtarget.hasFRE()))
	setOperationAction(ISD::FSQRT, MVT::f64, Expand);

	if (!Subtarget.hasFSQRT() &&
	!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
	Subtarget.hasFRES()))
	setOperationAction(ISD::FSQRT, MVT::f32, Expand);

	if (Subtarget.hasFCPSGN()) {
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
	} else {
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
	}

	if (Subtarget.hasFPRND()) {
	setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
	setOperationAction(ISD::FCEIL, MVT::f64, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
	setOperationAction(ISD::FROUND, MVT::f64, Legal);

	setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
	setOperationAction(ISD::FCEIL, MVT::f32, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
	setOperationAction(ISD::FROUND, MVT::f32, Legal);
	}

	// PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
	// to speed up scalar BSWAP64.
	// CTPOP or CTTZ were introduced in P8/P9 respectively
	setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
	if (Subtarget.isISA3_0()) {
	setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
	setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
	} else {
	setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
	setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
	setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
	}

	if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
	setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
	setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
	} else {
	setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
	setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
	}

	// PowerPC does not have ROTR
	setOperationAction(ISD::ROTR, MVT::i32 , Expand);
	setOperationAction(ISD::ROTR, MVT::i64 , Expand);

	if (!Subtarget.useCRBits()) {
	// PowerPC does not have Select
	setOperationAction(ISD::SELECT, MVT::i32, Expand);
	setOperationAction(ISD::SELECT, MVT::i64, Expand);
	setOperationAction(ISD::SELECT, MVT::f32, Expand);
	setOperationAction(ISD::SELECT, MVT::f64, Expand);
	}

	// PowerPC wants to turn select_cc of FP into fsel when possible.
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

	// PowerPC wants to optimize integer setcc a bit
	if (!Subtarget.useCRBits())
	setOperationAction(ISD::SETCC, MVT::i32, Custom);

	// PowerPC does not have BRCOND which requires SetCC
	if (!Subtarget.useCRBits())
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);

	setOperationAction(ISD::BR_JT, MVT::Other, Expand);

	// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

	// PowerPC does not have [U\|S]INT_TO_FP
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);

	if (Subtarget.hasDirectMove() && isPPC64) {
	setOperationAction(ISD::BITCAST, MVT::f32, Legal);
	setOperationAction(ISD::BITCAST, MVT::i32, Legal);
	setOperationAction(ISD::BITCAST, MVT::i64, Legal);
	setOperationAction(ISD::BITCAST, MVT::f64, Legal);
	} else {
	setOperationAction(ISD::BITCAST, MVT::f32, Expand);
	setOperationAction(ISD::BITCAST, MVT::i32, Expand);
	setOperationAction(ISD::BITCAST, MVT::i64, Expand);
	setOperationAction(ISD::BITCAST, MVT::f64, Expand);
	}

	// We cannot sextinreg(i1). Expand to shifts.
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

	// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
	// SjLj exception handling but a light-weight setjmp/longjmp replacement to
	// support continuation, user-level threading, and etc.. As a result, no
	// other SjLj exception interfaces are implemented and please don't build
	// your own exception handling based on them.
	// LLVM/Clang supports zero-cost DWARF exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

	// We want to legalize GlobalAddress and ConstantPool nodes into the
	// appropriate instructions to materialize the address.
	setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
	setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
	setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
	setOperationAction(ISD::JumpTable, MVT::i32, Custom);
	setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
	setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
	setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
	setOperationAction(ISD::JumpTable, MVT::i64, Custom);

	// TRAP is legal.
	setOperationAction(ISD::TRAP, MVT::Other, Legal);

	// TRAMPOLINE is custom lowered.
	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);

	if (Subtarget.isSVR4ABI()) {
	if (isPPC64) {
	// VAARG always uses double-word chunks, so promote anything smaller.
	setOperationAction(ISD::VAARG, MVT::i1, Promote);
	AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
	setOperationAction(ISD::VAARG, MVT::i8, Promote);
	AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
	setOperationAction(ISD::VAARG, MVT::i16, Promote);
	AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
	setOperationAction(ISD::VAARG, MVT::i32, Promote);
	AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
	setOperationAction(ISD::VAARG, MVT::Other, Expand);
	} else {
	// VAARG is custom lowered with the 32-bit SVR4 ABI.
	setOperationAction(ISD::VAARG, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::i64, Custom);
	}
	} else
	setOperationAction(ISD::VAARG, MVT::Other, Expand);

	if (Subtarget.isSVR4ABI() && !isPPC64)
	// VACOPY is custom lowered with the 32-bit SVR4 ABI.
	setOperationAction(ISD::VACOPY , MVT::Other, Custom);
	else
	setOperationAction(ISD::VACOPY , MVT::Other, Expand);

	// Use the default implementation.
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
	setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
	setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
	setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	// To handle counter-based loop conditions.
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);

	setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

	// Comparisons that require checking two conditions.
	setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
	setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

	if (Subtarget.has64BitSupport()) {
	// They also have instructions for converting between i64 and fp.
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
	// This is just the low 32 bits of a (signed) fp->i64 conversion.
	// We cannot do this with Promote because i64 is not a legal type.
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

	if (Subtarget.hasLFIWAX() \|\| Subtarget.isPPC64())
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	} else {
	// PowerPC does not have FP_TO_UINT on 32-bit implementations.
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
	}

	// With the instructions enabled under FPCVT, we can do everything.
	if (Subtarget.hasFPCVT()) {
	if (Subtarget.has64BitSupport()) {
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	}

	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	}

	if (Subtarget.use64BitRegs()) {
	// 64-bit PowerPC implementations can support i64 types directly
	addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
	// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
	setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
	// 64-bit PowerPC wants to expand i128 shifts itself.
	setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
	} else {
	// 32-bit PowerPC wants to expand i64 shifts itself.
	setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
	}

	if (Subtarget.hasAltivec()) {
	// First set operation action for all vector types to expand. Then we
	// will selectively turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::vector_valuetypes()) {
	// add/sub are legal for all supported vector VT's.
	setOperationAction(ISD::ADD, VT, Legal);
	setOperationAction(ISD::SUB, VT, Legal);

	// Vector instructions introduced in P8
	if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
	setOperationAction(ISD::CTPOP, VT, Legal);
	setOperationAction(ISD::CTLZ, VT, Legal);
	}
	else {
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	}

	// Vector instructions introduced in P9
	if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
	setOperationAction(ISD::CTTZ, VT, Legal);
	else
	setOperationAction(ISD::CTTZ, VT, Expand);

	// We promote all shuffles to v16i8.
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
	AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

	// We promote all non-typed operations to v4i32.
	setOperationAction(ISD::AND , VT, Promote);
	AddPromotedToType (ISD::AND , VT, MVT::v4i32);
	setOperationAction(ISD::OR , VT, Promote);
	AddPromotedToType (ISD::OR , VT, MVT::v4i32);
	setOperationAction(ISD::XOR , VT, Promote);
	AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
	setOperationAction(ISD::LOAD , VT, Promote);
	AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
	setOperationAction(ISD::SELECT, VT, Promote);
	AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
	setOperationAction(ISD::SELECT_CC, VT, Promote);
	AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
	setOperationAction(ISD::STORE, VT, Promote);
	AddPromotedToType (ISD::STORE, VT, MVT::v4i32);

	// No other operations are legal.
	setOperationAction(ISD::MUL , VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::FDIV, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FNEG, VT, Expand);
	setOperationAction(ISD::FSQRT, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FABS, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::VSELECT, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);

	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
	}
	}

	// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
	// with merges, splats, etc.
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

	setOperationAction(ISD::AND , MVT::v4i32, Legal);
	setOperationAction(ISD::OR , MVT::v4i32, Legal);
	setOperationAction(ISD::XOR , MVT::v4i32, Legal);
	setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
	setOperationAction(ISD::SELECT, MVT::v4i32,
	Subtarget.useCRBits() ? Legal : Expand);
	setOperationAction(ISD::STORE , MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
	setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
	setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

	addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
	addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
	addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
	addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

	setOperationAction(ISD::MUL, MVT::v4f32, Legal);
	setOperationAction(ISD::FMA, MVT::v4f32, Legal);

	if (TM.Options.UnsafeFPMath \|\| Subtarget.hasVSX()) {
	setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
	setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
	}

	if (Subtarget.hasP8Altivec())
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);
	else
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);

	setOperationAction(ISD::MUL, MVT::v8i16, Custom);
	setOperationAction(ISD::MUL, MVT::v16i8, Custom);

	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

	setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

	// Altivec does not contain unordered floating-point compare instructions
	setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
	setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
	setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
	setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

	if (Subtarget.hasVSX()) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
	if (Subtarget.hasP8Vector()) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
	}
	if (Subtarget.hasDirectMove() && isPPC64) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
	}
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

	setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
	setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
	setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
	setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
	setOperationAction(ISD::FROUND, MVT::v2f64, Legal);

	setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

	setOperationAction(ISD::MUL, MVT::v2f64, Legal);
	setOperationAction(ISD::FMA, MVT::v2f64, Legal);

	setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
	setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);

	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
	setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
	setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
	setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);

	// Share the Altivec comparison restrictions.
	setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
	setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
	setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
	setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);

	setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
	setOperationAction(ISD::STORE, MVT::v2f64, Legal);

	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);

	if (Subtarget.hasP8Vector())
	addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);

	addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);

	addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
	addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
	addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);

	if (Subtarget.hasP8Altivec()) {
	setOperationAction(ISD::SHL, MVT::v2i64, Legal);
	setOperationAction(ISD::SRA, MVT::v2i64, Legal);
	setOperationAction(ISD::SRL, MVT::v2i64, Legal);

	// 128 bit shifts can be accomplished via 3 instructions for SHL and
	// SRL, but not for SRA because of the instructions available:
	// VS{RL} and VS{RL}O. However due to direct move costs, it's not worth
	// doing
	setOperationAction(ISD::SHL, MVT::v1i128, Expand);
	setOperationAction(ISD::SRL, MVT::v1i128, Expand);
	setOperationAction(ISD::SRA, MVT::v1i128, Expand);

	setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
	}
	else {
	setOperationAction(ISD::SHL, MVT::v2i64, Expand);
	setOperationAction(ISD::SRA, MVT::v2i64, Expand);
	setOperationAction(ISD::SRL, MVT::v2i64, Expand);

	setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

	// VSX v2i64 only supports non-arithmetic operations.
	setOperationAction(ISD::ADD, MVT::v2i64, Expand);
	setOperationAction(ISD::SUB, MVT::v2i64, Expand);
	}

	setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
	AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
	setOperationAction(ISD::STORE, MVT::v2i64, Promote);
	AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);

	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

	// Vector operation legalization checks the result type of
	// SIGN_EXTEND_INREG, overall legalization checks the inner type.
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);

	setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
	setOperationAction(ISD::FABS, MVT::v4f32, Legal);
	setOperationAction(ISD::FABS, MVT::v2f64, Legal);

	if (Subtarget.hasDirectMove())
	setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

	addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
	}

	if (Subtarget.hasP8Altivec()) {
	addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
	addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
	}

	if (Subtarget.hasP9Vector()) {
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	// 128 bit shifts can be accomplished via 3 instructions for SHL and
	// SRL, but not for SRA because of the instructions available:
	// VS{RL} and VS{RL}O.
	setOperationAction(ISD::SHL, MVT::v1i128, Legal);
	setOperationAction(ISD::SRL, MVT::v1i128, Legal);
	setOperationAction(ISD::SRA, MVT::v1i128, Expand);
	}

	if (Subtarget.hasP9Altivec()) {
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
	}
	}

	if (Subtarget.hasQPX()) {
	setOperationAction(ISD::FADD, MVT::v4f64, Legal);
	setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
	setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
	setOperationAction(ISD::FREM, MVT::v4f64, Expand);

	setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
	setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);

	setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
	setOperationAction(ISD::STORE , MVT::v4f64, Custom);

	setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);

	if (!Subtarget.useCRBits())
	setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
	setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);

	setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
	setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
	setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);

	setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
	setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);

	setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
	setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);

	setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
	setOperationAction(ISD::FABS , MVT::v4f64, Legal);
	setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
	setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
	setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
	setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
	setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
	setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
	setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
	setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);

	setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);

	setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);

	addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);

	setOperationAction(ISD::FADD, MVT::v4f32, Legal);
	setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
	setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
	setOperationAction(ISD::FREM, MVT::v4f32, Expand);

	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
	setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);

	setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
	setOperationAction(ISD::STORE , MVT::v4f32, Custom);

	if (!Subtarget.useCRBits())
	setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

	setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
	setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
	setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

	setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
	setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);

	setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
	setOperationAction(ISD::FABS , MVT::v4f32, Legal);
	setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
	setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
	setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
	setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
	setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
	setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
	setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
	setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);

	setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

	setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
	setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);

	addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);

	setOperationAction(ISD::AND , MVT::v4i1, Legal);
	setOperationAction(ISD::OR , MVT::v4i1, Legal);
	setOperationAction(ISD::XOR , MVT::v4i1, Legal);

	if (!Subtarget.useCRBits())
	setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
	setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);

	setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
	setOperationAction(ISD::STORE , MVT::v4i1, Custom);

	setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
	setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);

	addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);

	setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
	setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
	setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
	setOperationAction(ISD::FROUND, MVT::v4f64, Legal);

	setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
	setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
	setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
	setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

	setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);

	// These need to set FE_INEXACT, and so cannot be vectorized here.
	setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
	setOperationAction(ISD::FRINT, MVT::v4f32, Expand);

	if (TM.Options.UnsafeFPMath) {
	setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
	setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);

	setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
	setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
	} else {
	setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);

	setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
	setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
	}
	}

	if (Subtarget.has64BitSupport())
	setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);

	if (!isPPC64) {
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
	setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
	}

	setBooleanContents(ZeroOrOneBooleanContent);

	if (Subtarget.hasAltivec()) {
	// Altivec instructions set fields to all zeros or all ones.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
	}

	if (!isPPC64) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	}

	setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::BUILD_VECTOR);
	if (Subtarget.hasFPCVT())
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::BR_CC);
	if (Subtarget.useCRBits())
	setTargetDAGCombine(ISD::BRCOND);
	setTargetDAGCombine(ISD::BSWAP);
	setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
	setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
	setTargetDAGCombine(ISD::INTRINSIC_VOID);

	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);

	if (Subtarget.useCRBits()) {
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::SELECT_CC);
	}

	// Use reciprocal estimates.
	if (TM.Options.UnsafeFPMath) {
	setTargetDAGCombine(ISD::FDIV);
	setTargetDAGCombine(ISD::FSQRT);
	}

	// Darwin long double math library functions have $LDBL128 appended.
	if (Subtarget.isDarwin()) {
	setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
	setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
	setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
	setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
	setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
	setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
	setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
	setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
	setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
	setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
	}

	// With 32 condition bits, we don't need to sink (and duplicate) compares
	// aggressively in CodeGenPrep.
	if (Subtarget.useCRBits()) {
	setHasMultipleConditionRegisters();
	setJumpIsExpensive();
	}

	setMinFunctionAlignment(2);
	if (Subtarget.isDarwin())
	setPrefFunctionAlignment(4);

	switch (Subtarget.getDarwinDirective()) {
	default: break;
	case PPC::DIR_970:
	case PPC::DIR_A2:
	case PPC::DIR_E500mc:
	case PPC::DIR_E5500:
	case PPC::DIR_PWR4:
	case PPC::DIR_PWR5:
	case PPC::DIR_PWR5X:
	case PPC::DIR_PWR6:
	case PPC::DIR_PWR6X:
	case PPC::DIR_PWR7:
	case PPC::DIR_PWR8:
	case PPC::DIR_PWR9:
	setPrefFunctionAlignment(4);
	setPrefLoopAlignment(4);
	break;
	}

	if (Subtarget.enableMachineScheduler())
	setSchedulingPreference(Sched::Source);
	else
	setSchedulingPreference(Sched::Hybrid);

	computeRegisterProperties(STI.getRegisterInfo());

	// The Freescale cores do better with aggressive inlining of memcpy and
	// friends. GCC uses same threshold of 128 bytes (= 32 word stores).
	if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc \|\|
	Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
	MaxStoresPerMemset = 32;
	MaxStoresPerMemsetOptSize = 16;
	MaxStoresPerMemcpy = 32;
	MaxStoresPerMemcpyOptSize = 8;
	MaxStoresPerMemmove = 32;
	MaxStoresPerMemmoveOptSize = 8;
	} else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
	// The A2 also benefits from (very) aggressive inlining of memcpy and
	// friends. The overhead of the function call, even when warm, can be
	// over one hundred cycles.
	MaxStoresPerMemset = 128;
	MaxStoresPerMemcpy = 128;
	MaxStoresPerMemmove = 128;
	MaxLoadsPerMemcmp = 128;
	} else {
	MaxLoadsPerMemcmp = 8;
	MaxLoadsPerMemcmpOptSize = 4;
	}
	}

	/// getMaxByValAlign - Helper for getByValTypeAlignment. Inspects \p Ty
	/// (recursing through array element types and struct members) and raises
	/// \p MaxAlign when a sufficiently wide vector type is found. The walk stops
	/// as soon as \p MaxAlign equals \p MaxMaxAlign.
	static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
	                             unsigned MaxMaxAlign) {
	  // The cap has already been reached; nothing left to discover.
	  if (MaxAlign == MaxMaxAlign)
	    return;

	  // Vectors: 256 bits or more ask for 32 (only if the cap permits it),
	  // 128 bits or more ask for 16.
	  if (auto *Vec = dyn_cast<VectorType>(Ty)) {
	    const unsigned Bits = Vec->getBitWidth();
	    if (MaxMaxAlign >= 32 && Bits >= 256) {
	      MaxAlign = 32;
	      return;
	    }
	    if (Bits >= 128 && MaxAlign < 16)
	      MaxAlign = 16;
	    return;
	  }

	  // Arrays: the requirement is whatever the element type requires.
	  if (auto *Arr = dyn_cast<ArrayType>(Ty)) {
	    unsigned ElemAlign = 0;
	    getMaxByValAlign(Arr->getElementType(), ElemAlign, MaxMaxAlign);
	    if (MaxAlign < ElemAlign)
	      MaxAlign = ElemAlign;
	    return;
	  }

	  // Structs: take the largest requirement over all members.
	  auto *Struct = dyn_cast<StructType>(Ty);
	  if (!Struct)
	    return;

	  for (Type *Member : Struct->elements()) {
	    unsigned MemberAlign = 0;
	    getMaxByValAlign(Member, MemberAlign, MaxMaxAlign);
	    if (MaxAlign < MemberAlign)
	      MaxAlign = MemberAlign;
	    if (MaxAlign == MaxMaxAlign)
	      return;
	  }
	}

	/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
	/// function arguments in the caller parameter area.
	///
	/// \param Ty the type of the by-value argument.
	/// \param DL unused by this implementation.
	/// \return 4 on Darwin; otherwise 8 (PPC64) or 4 (PPC32), possibly raised by
	/// getMaxByValAlign when the subtarget has Altivec or QPX.
	unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  // Darwin passes everything on 4 byte boundary.
	  if (Subtarget.isDarwin())
	    return 4;

	  // 16byte and wider vectors are passed on 16byte boundary.
	  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
	  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
	  // With QPX the search is capped at 32, with Altivec alone at 16.
	  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
	    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
	  return Align;
	}

	/// Forward the soft-float query to the subtarget.
	bool PPCTargetLowering::useSoftFloat() const {
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  return SoftFloat;
	}

	/// Map a PPCISD opcode to its printable name.
	///
	/// \param Opcode the opcode value; it is cast to PPCISD::NodeType before
	/// being switched on.
	/// \return the literal "PPCISD::<NAME>" string for every opcode listed in
	/// the switch, or nullptr for PPCISD::FIRST_NUMBER and for any value that
	/// matches no case (the switch has no default label).
	const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((PPCISD::NodeType)Opcode) {
	case PPCISD::FIRST_NUMBER: break;
	case PPCISD::FSEL: return "PPCISD::FSEL";
	case PPCISD::FCFID: return "PPCISD::FCFID";
	case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
	case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
	case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
	case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
	case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
	case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
	case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
	case PPCISD::FRE: return "PPCISD::FRE";
	case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
	case PPCISD::STFIWX: return "PPCISD::STFIWX";
	case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
	case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
	case PPCISD::VPERM: return "PPCISD::VPERM";
	case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
	case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
	case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE";
	case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
	case PPCISD::VECSHL: return "PPCISD::VECSHL";
	case PPCISD::CMPB: return "PPCISD::CMPB";
	case PPCISD::Hi: return "PPCISD::Hi";
	case PPCISD::Lo: return "PPCISD::Lo";
	case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
	+ case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
	+ case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
	case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
	case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
	case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
	case PPCISD::SRL: return "PPCISD::SRL";
	case PPCISD::SRA: return "PPCISD::SRA";
	case PPCISD::SHL: return "PPCISD::SHL";
	case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
	case PPCISD::CALL: return "PPCISD::CALL";
	case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
	case PPCISD::MTCTR: return "PPCISD::MTCTR";
	case PPCISD::BCTRL: return "PPCISD::BCTRL";
	case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
	case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
	case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
	case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
	case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
	case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
	case PPCISD::MFVSR: return "PPCISD::MFVSR";
	case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
	case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
	case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
	case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
	case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
	case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
	case PPCISD::VCMP: return "PPCISD::VCMP";
	case PPCISD::VCMPo: return "PPCISD::VCMPo";
	case PPCISD::LBRX: return "PPCISD::LBRX";
	case PPCISD::STBRX: return "PPCISD::STBRX";
	case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
	case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
	case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
	case PPCISD::STXSIX: return "PPCISD::STXSIX";
	case PPCISD::VEXTS: return "PPCISD::VEXTS";
	case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
	case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
	case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
	case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
	case PPCISD::BDNZ: return "PPCISD::BDNZ";
	case PPCISD::BDZ: return "PPCISD::BDZ";
	case PPCISD::MFFS: return "PPCISD::MFFS";
	case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
	case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
	case PPCISD::CR6SET: return "PPCISD::CR6SET";
	case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
	case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
	case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
	case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
	case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
	case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
	case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
	case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
	case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
	case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
	case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
	case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
	case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
	case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
	case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
	case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
	case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
	case PPCISD::SC: return "PPCISD::SC";
	case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
	case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
	case PPCISD::RFEBB: return "PPCISD::RFEBB";
	case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
	case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
	case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
	case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
	case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
	case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
	case PPCISD::QBFLT: return "PPCISD::QBFLT";
	case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
	}
	// Reached for FIRST_NUMBER and for any opcode without a case above.
	return nullptr;
	}

	/// Choose the value type produced by a SETCC whose operands have type
	/// \p VT.
	///
	/// Scalars yield i1 when the subtarget uses CR bits and i32 otherwise.
	/// Vectors yield a vector of i1 with the same element count under QPX, and
	/// otherwise \p VT with its element type changed to integer.
	EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
	                                          EVT VT) const {
	  if (VT.isVector()) {
	    if (Subtarget.hasQPX()) {
	      const unsigned NumElts = VT.getVectorNumElements();
	      return EVT::getVectorVT(C, MVT::i1, NumElts);
	    }
	    return VT.changeVectorElementTypeToInteger();
	  }

	  // Scalar comparison result.
	  if (Subtarget.useCRBits())
	    return MVT::i1;
	  return MVT::i32;
	}

	/// Always answers true, i.e. aggressive FMA fusion is enabled for every
	/// type this hook is queried with. When assertions are compiled in, \p VT
	/// is checked to be a floating-point type.
	bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
	assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
	return true;
	}

	//===----------------------------------------------------------------------===//
	// Node matching predicates, for use by the tblgen matching code.
	//===----------------------------------------------------------------------===//

	/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
	///
	/// Two forms are recognised: a ConstantFPSDNode whose value is zero, and an
	/// extending or non-extending load whose operand 1 is a constant-pool node
	/// holding a zero ConstantFP. Anything else yields false.
	static bool isFloatingPointZero(SDValue Op) {
	  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
	    return CFP->getValueAPF().isZero();

	  if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
	    // Maybe this has already been legalized into the constant pool?
	    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
	      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
	        return CFP->getValueAPF().isZero();
	  }
	  return false;
	}

	/// isConstantOrUndef - \p Op is a shuffle-mask element: a negative value
	/// stands for undef, anything else is a concrete index. Return true if
	/// \p Op is undef (negative) or if it equals \p Val.
	static bool isConstantOrUndef(int Op, int Val) {
	  return Op < 0 || Op == Val;
	}

	/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
	/// VPKUHUM instruction.
	/// The ShuffleKind distinguishes between big-endian operations with
	/// two different inputs (0), either-endian operations with two identical
	/// inputs (1), and little-endian operations with two different inputs (2).
	/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
	/// Undef mask elements match any expected index.
	bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
	                               SelectionDAG &DAG) {
	  bool IsLE = DAG.getDataLayout().isLittleEndian();
	  if (ShuffleKind == 0) {
	    if (IsLE)
	      return false;
	    // Big-endian, two inputs: result byte i must be source byte 2*i+1.
	    for (unsigned i = 0; i != 16; ++i)
	      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
	        return false;
	  } else if (ShuffleKind == 2) {
	    if (!IsLE)
	      return false;
	    // Little-endian, two inputs: result byte i must be source byte 2*i.
	    for (unsigned i = 0; i != 16; ++i)
	      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
	        return false;
	  } else if (ShuffleKind == 1) {
	    // Single input: both result halves must select the same bytes.
	    unsigned j = IsLE ? 0 : 1;
	    for (unsigned i = 0; i != 8; ++i)
	      if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
	          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
	        return false;
	  }
	  // Note: a ShuffleKind other than 0, 1 or 2 performs no check and falls
	  // through to here.
	  return true;
	}

	/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
	/// VPKUWUM instruction.
	/// The ShuffleKind distinguishes between big-endian operations with
	/// two different inputs (0), either-endian operations with two identical
	/// inputs (1), and little-endian operations with two different inputs (2).
	/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
	/// Undef mask elements match any expected index.
	bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
	                               SelectionDAG &DAG) {
	  bool IsLE = DAG.getDataLayout().isLittleEndian();
	  if (ShuffleKind == 0) {
	    if (IsLE)
	      return false;
	    // Big-endian, two inputs: each result byte pair (i, i+1) must be source
	    // bytes (2*i+2, 2*i+3).
	    for (unsigned i = 0; i != 16; i += 2)
	      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
	          !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
	        return false;
	  } else if (ShuffleKind == 2) {
	    if (!IsLE)
	      return false;
	    // Little-endian, two inputs: each result byte pair must be source
	    // bytes (2*i, 2*i+1).
	    for (unsigned i = 0; i != 16; i += 2)
	      if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
	          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
	        return false;
	  } else if (ShuffleKind == 1) {
	    // Single input: both result halves must select the same byte pairs.
	    unsigned j = IsLE ? 0 : 2;
	    for (unsigned i = 0; i != 8; i += 2)
	      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
	          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
	          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
	          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
	        return false;
	  }
	  // Note: a ShuffleKind other than 0, 1 or 2 performs no check and falls
	  // through to here.
	  return true;
	}

	/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
	/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
	/// current subtarget.
	///
	/// The ShuffleKind distinguishes between big-endian operations with
	/// two different inputs (0), either-endian operations with two identical
	/// inputs (1), and little-endian operations with two different inputs (2).
	/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
	/// Undef mask elements match any expected index.
	bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
	                               SelectionDAG &DAG) {
	  // Subtargets without P8Vector never match.
	  const PPCSubtarget& Subtarget =
	      static_cast<const PPCSubtarget&>(DAG.getSubtarget());
	  if (!Subtarget.hasP8Vector())
	    return false;

	  bool IsLE = DAG.getDataLayout().isLittleEndian();
	  if (ShuffleKind == 0) {
	    if (IsLE)
	      return false;
	    // Big-endian, two inputs: each group of four result bytes must be
	    // source bytes 2*i+4 .. 2*i+7.
	    for (unsigned i = 0; i != 16; i += 4)
	      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
	          !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
	          !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
	          !isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
	        return false;
	  } else if (ShuffleKind == 2) {
	    if (!IsLE)
	      return false;
	    // Little-endian, two inputs: each group of four result bytes must be
	    // source bytes 2*i .. 2*i+3.
	    for (unsigned i = 0; i != 16; i += 4)
	      if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
	          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
	          !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
	          !isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
	        return false;
	  } else if (ShuffleKind == 1) {
	    // Single input: both result halves must select the same four bytes.
	    unsigned j = IsLE ? 0 : 4;
	    for (unsigned i = 0; i != 8; i += 4)
	      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
	          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
	          !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
	          !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
	          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
	          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
	          !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
	          !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
	        return false;
	  }
	  // Note: a ShuffleKind other than 0, 1 or 2 performs no check and falls
	  // through to here.
	  return true;
	}

	/// isVMerge - Common function, used to match vmrg* shuffles.
	///
	/// The v16i8 mask is read as alternating units of \p UnitSize bytes: the
	/// i-th unit pair must take its first unit from \p LHSStart + i*UnitSize
	/// and its second from \p RHSStart + i*UnitSize. Undef mask elements match
	/// anything. Returns false when the result type is not v16i8.
	static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
	                     unsigned LHSStart, unsigned RHSStart) {
	  if (N->getValueType(0) != MVT::v16i8)
	    return false;
	  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
	         "Unsupported merge size!");

	  for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
	    for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
	      // Each iteration of i covers 2*UnitSize result bytes: one unit from
	      // the LHS range followed by one unit from the RHS range.
	      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
	                             LHSStart+j+i*UnitSize) ||
	          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
	                             RHSStart+j+i*UnitSize))
	        return false;
	    }
	  return true;
	}

	/// isVMRGLShuffleMask - Tell whether \p N is a shuffle mask that a VMRGL*
	/// instruction of the given unit size (1, 2 or 4 bytes) can implement.
	/// ShuffleKind selects the flavour being asked about: 0 is a big-endian
	/// merge of two different inputs, 1 is a merge of two identical inputs on
	/// either endianness, and 2 is a little-endian merge of two different
	/// inputs (whose operands are swapped, see PPCInstrAltivec.td). A kind that
	/// does not apply to the current endianness never matches.
	bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
	                             unsigned ShuffleKind, SelectionDAG &DAG) {
	  const bool IsLE = DAG.getDataLayout().isLittleEndian();
	  // First source byte of the merge: 0 on little-endian, 8 on big-endian.
	  const unsigned Start = IsLE ? 0 : 8;
	  // The two-input kind accepted here: 2 (swapped) on LE, 0 (normal) on BE.
	  const unsigned TwoInputKind = IsLE ? 2 : 0;

	  if (ShuffleKind == 1) // unary
	    return isVMerge(N, UnitSize, Start, Start);
	  if (ShuffleKind == TwoInputKind)
	    return isVMerge(N, UnitSize, Start, Start + 16);
	  return false;
	}

	/// isVMRGHShuffleMask - Tell whether \p N is a shuffle mask that a VMRGH*
	/// instruction of the given unit size (1, 2 or 4 bytes) can implement.
	/// ShuffleKind selects the flavour being asked about: 0 is a big-endian
	/// merge of two different inputs, 1 is a merge of two identical inputs on
	/// either endianness, and 2 is a little-endian merge of two different
	/// inputs (whose operands are swapped, see PPCInstrAltivec.td). A kind that
	/// does not apply to the current endianness never matches.
	bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
	                             unsigned ShuffleKind, SelectionDAG &DAG) {
	  const bool IsLE = DAG.getDataLayout().isLittleEndian();
	  // First source byte of the merge: 8 on little-endian, 0 on big-endian.
	  const unsigned Start = IsLE ? 8 : 0;
	  // The two-input kind accepted here: 2 (swapped) on LE, 0 (normal) on BE.
	  const unsigned TwoInputKind = IsLE ? 2 : 0;

	  if (ShuffleKind == 1) // unary
	    return isVMerge(N, UnitSize, Start, Start);
	  if (ShuffleKind == TwoInputKind)
	    return isVMerge(N, UnitSize, Start, Start + 16);
	  return false;
	}

	/**
	* \brief Common function used to match vmrgew and vmrgow shuffles
	*
	* The indexOffset determines whether to look for even or odd words in
	* the shuffle mask. This is based on the endianness of the target
	* machine.
	* - Little Endian:
	* - Use offset of 0 to check for odd elements
	* - Use offset of 4 to check for even elements
	* - Big Endian:
	* - Use offset of 0 to check for even elements
	* - Use offset of 4 to check for odd elements
	* A detailed description of the vector element ordering for little endian and
	* big endian can be found at
	* http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
	* Targeting your applications - what little endian and big endian IBM XL C/C++
	* compiler differences mean to you
	*
	* The mask to the shuffle vector instruction specifies the indices of the
	* elements from the two input vectors to place in the result. The elements are
	* numbered in array-access order, starting with the first vector. These vectors
	* are always of type v16i8, thus each vector will contain 16 elements of size
	* 8. More info on the shuffle vector can be found in the
	* http://llvm.org/docs/LangRef.html#shufflevector-instruction
	* Language Reference.
	*
	* The RHSStartValue indicates whether the same input vectors are used (unary)
	* or two different input vectors are used, based on the following:
	* - If the instruction uses the same vector for both inputs, the range of the
	* indices will be 0 to 15. In this case, the RHSStart value passed should
	* be 0.
	* - If the instruction has two different vectors then the range of the
	* indices will be 0 to 31. In this case, the RHSStart value passed should
	* be 16 (indices 0-15 specify elements in the first vector while indices 16
	* to 31 specify elements in the second vector).
	*
	* \param[in] N The shuffle vector SD Node to analyze
	* \param[in] IndexOffset Specifies whether to look for even or odd elements
	* \param[in] RHSStartValue Specifies the starting index for the righthand input
	* vector to the shuffle_vector instruction
	* \return true iff this shuffle vector represents an even or odd word merge
	*/
	static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
	                     unsigned RHSStartValue) {
	  if (N->getValueType(0) != MVT::v16i8)
	    return false;

	  // i selects which input word is being checked (0: left input, 1: right
	  // input, offset by RHSStartValue); j walks the four bytes of that word.
	  // Both the low (bytes 0-7) and high (bytes 8-15) result halves are
	  // checked in the same step. Undef mask elements match anything.
	  for (unsigned i = 0; i < 2; ++i)
	    for (unsigned j = 0; j < 4; ++j)
	      if (!isConstantOrUndef(N->getMaskElt(i*4+j),
	                             i*RHSStartValue+j+IndexOffset) ||
	          !isConstantOrUndef(N->getMaskElt(i*4+j+8),
	                             i*RHSStartValue+j+IndexOffset+8))
	        return false;
	  return true;
	}

	/**
	* \brief Determine if the specified shuffle mask is suitable for the vmrgew or
	* vmrgow instructions.
	*
	* \param[in] N The shuffle vector SD Node to analyze
	* \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
	* \param[in] ShuffleKind Identify the type of merge:
	* - 0 = big-endian merge with two different inputs;
	* - 1 = either-endian merge with two identical inputs;
	* - 2 = little-endian merge with two different inputs (inputs are swapped for
	* little-endian merges).
	* \param[in] DAG The current SelectionDAG
	* \return true iff this shuffle mask matches the requested even/odd word
	* merge for the given ShuffleKind on the current endianness
	*/
	bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
	                              unsigned ShuffleKind, SelectionDAG &DAG) {
	  const bool IsLE = DAG.getDataLayout().isLittleEndian();

	  // Offset 4 is used for an even merge on little-endian and for an odd merge
	  // on big-endian; the other two combinations use offset 0.
	  const unsigned IndexOffset = (CheckEven == IsLE) ? 4 : 0;
	  // The two-input kind accepted here: 2 (swapped) on LE, 0 (normal) on BE.
	  const unsigned TwoInputKind = IsLE ? 2 : 0;

	  if (ShuffleKind == 1) // Unary
	    return isVMerge(N, IndexOffset, 0);
	  if (ShuffleKind == TwoInputKind)
	    return isVMerge(N, IndexOffset, 16);
	  return false;
	}

	/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
	/// amount, otherwise return -1.
	/// The ShuffleKind distinguishes between big-endian operations with two
	/// different inputs (0), either-endian operations with two identical inputs
	/// (1), and little-endian operations with two different inputs (2). For the
	/// latter, the input operands are swapped (see PPCInstrAltivec.td).
	/// On little-endian targets the returned amount is 16 minus the distance
	/// found in the mask.
	int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
	                             SelectionDAG &DAG) {
	  if (N->getValueType(0) != MVT::v16i8)
	    return -1;

	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

	  // Find the first non-undef value in the shuffle mask.
	  unsigned i;
	  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
	    /*search*/;

	  if (i == 16) return -1; // all undef.

	  // Otherwise, check to see if the rest of the elements are consecutively
	  // numbered from this value.
	  unsigned ShiftAmt = SVOp->getMaskElt(i);
	  if (ShiftAmt < i) return -1;

	  ShiftAmt -= i;
	  bool isLE = DAG.getDataLayout().isLittleEndian();

	  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
	    // Check the rest of the elements to see if they are consecutive.
	    for (++i; i != 16; ++i)
	      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
	        return -1;
	  } else if (ShuffleKind == 1) {
	    // Check the rest of the elements to see if they are consecutive.
	    // With a single input the indices wrap around modulo 16.
	    for (++i; i != 16; ++i)
	      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
	        return -1;
	  } else
	    return -1;

	  if (isLE)
	    ShiftAmt = 16 - ShiftAmt;

	  return ShiftAmt;
	}

	/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
	/// specifies a splat of a single element that is suitable for input to
	/// VSPLTB/VSPLTH/VSPLTW.
	///
	/// \param N the v16i8 shuffle node (asserted).
	/// \param EltSize the splatted element size in bytes: 1, 2 or 4 (asserted).
	bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
	  assert(N->getValueType(0) == MVT::v16i8 &&
	         (EltSize == 1 || EltSize == 2 || EltSize == 4));

	  // The consecutive indices need to specify an element, not part of two
	  // different elements. So abandon ship early if this isn't the case.
	  if (N->getMaskElt(0) % EltSize != 0)
	    return false;

	  // This is a splat operation if each element of the permute is the same, and
	  // if the value doesn't reference the second vector.
	  unsigned ElementBase = N->getMaskElt(0);

	  // FIXME: Handle UNDEF elements too!
	  if (ElementBase >= 16)
	    return false;

	  // Check that the indices are consecutive, in the case of a multi-byte element
	  // splatted with a v16i8 mask.
	  for (unsigned i = 1; i != EltSize; ++i)
	    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
	      return false;

	  // Every later element must repeat the first one byte for byte; an element
	  // whose leading byte is undef is skipped entirely.
	  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
	    if (N->getMaskElt(i) < 0) continue;
	    for (unsigned j = 0; j != EltSize; ++j)
	      if (N->getMaskElt(i+j) != N->getMaskElt(j))
	        return false;
	  }
	  return true;
	}

	/// Check that the mask is shuffling N byte elements. Within each N byte
	/// element of the mask, the indices could be either in increasing or
	/// decreasing order as long as they are consecutive.
	/// \param[in] N the shuffle vector SD Node to analyze
	/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
	/// Word/DoubleWord/QuadWord).
	/// \param[in] StepLen the delta indices number among the N byte element, if
	/// the mask is in increasing/decreasing order then it is 1/-1.
	/// \return true iff the mask is shuffling N byte elements.
	static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
	                                   int StepLen) {
	  assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
	         "Unexpected element width.");
	  assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");

	  unsigned NumOfElem = 16 / Width;
	  unsigned MaskVal[16]; // Width is never greater than 16
	  for (unsigned i = 0; i < NumOfElem; ++i) {
	    // The first index of an element must sit on an element boundary: the
	    // element's first byte when ascending, its last byte when descending.
	    MaskVal[0] = N->getMaskElt(i * Width);
	    if ((StepLen == 1) && (MaskVal[0] % Width)) {
	      return false;
	    } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
	      return false;
	    }

	    // The remaining indices must follow one step at a time.
	    for (unsigned int j = 1; j < Width; ++j) {
	      MaskVal[j] = N->getMaskElt(i * Width + j);
	      if (MaskVal[j] != MaskVal[j-1] + StepLen) {
	        return false;
	      }
	    }
	  }

	  return true;
	}

	/// Check whether the shuffle mask of \p N describes inserting a single word
	/// into an otherwise in-order vector, the pattern matched for XXINSERTW.
	///
	/// The mask must consist of four words with ascending byte indices. Three
	/// of the four words must be in place (all from the first input or all from
	/// the second); the remaining word comes from the other input, or, when the
	/// second operand is undef, from a fixed word of the same input.
	///
	/// \param[in] N the shuffle vector SD Node to analyze
	/// \param[out] ShiftElts word shift amount, looked up per endianness
	/// \param[out] InsertAtByte byte offset at which the word is inserted
	/// \param[out] Swap whether the inputs have to be swapped
	/// \param[in] IsLE true for little-endian
	/// \return true iff the mask matches. Note that when the second operand is
	/// undef, ShiftElts and Swap are written even if false is returned.
	bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
	                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
	  if (!isNByteElemShuffleMask(N, 4, 1))
	    return false;

	  // Now we look at mask elements 0,4,8,12
	  unsigned M0 = N->getMaskElt(0) / 4;
	  unsigned M1 = N->getMaskElt(4) / 4;
	  unsigned M2 = N->getMaskElt(8) / 4;
	  unsigned M3 = N->getMaskElt(12) / 4;
	  unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
	  unsigned BigEndianShifts[] = { 3, 0, 1, 2 };

	  // Below, let H and L be arbitrary elements of the shuffle mask
	  // where H is in the range [4,7] and L is in the range [0,3].
	  // H, 1, 2, 3 or L, 5, 6, 7
	  if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
	      (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
	    ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
	    InsertAtByte = IsLE ? 12 : 0;
	    Swap = M0 < 4;
	    return true;
	  }
	  // 0, H, 2, 3 or 4, L, 6, 7
	  if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
	      (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
	    ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
	    InsertAtByte = IsLE ? 8 : 4;
	    Swap = M1 < 4;
	    return true;
	  }
	  // 0, 1, H, 3 or 4, 5, L, 7
	  if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
	      (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
	    ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
	    InsertAtByte = IsLE ? 4 : 8;
	    Swap = M2 < 4;
	    return true;
	  }
	  // 0, 1, 2, H or 4, 5, 6, L
	  if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
	      (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
	    ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
	    InsertAtByte = IsLE ? 0 : 12;
	    Swap = M3 < 4;
	    return true;
	  }

	  // If both vector operands for the shuffle are the same vector, the mask will
	  // contain only elements from the first one and the second one will be undef.
	  if (N->getOperand(1).isUndef()) {
	    ShiftElts = 0;
	    Swap = true;
	    unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
	    if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
	      InsertAtByte = IsLE ? 12 : 0;
	      return true;
	    }
	    if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
	      InsertAtByte = IsLE ? 8 : 4;
	      return true;
	    }
	    if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
	      InsertAtByte = IsLE ? 4 : 8;
	      return true;
	    }
	    if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
	      InsertAtByte = IsLE ? 0 : 12;
	      return true;
	    }
	  }

	  return false;
	}

	/// Check whether the shuffle mask of \p N selects four consecutive words
	/// (with wrap-around) out of its inputs, the pattern matched for XXSLDWI.
	///
	/// \param[in] N the v16i8 shuffle vector SD Node to analyze (asserted)
	/// \param[out] ShiftElts number of words to shift by
	/// \param[out] Swap whether the two inputs have to be swapped
	/// \param[in] IsLE true for little-endian
	/// \return true iff the mask matches; the outputs are only meaningful then.
	bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
	                               bool &Swap, bool IsLE) {
	  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
	  // Ensure each byte index of the word is consecutive.
	  if (!isNByteElemShuffleMask(N, 4, 1))
	    return false;

	  // Now we look at mask elements 0,4,8,12, which are the beginning of words.
	  unsigned M0 = N->getMaskElt(0) / 4;
	  unsigned M1 = N->getMaskElt(4) / 4;
	  unsigned M2 = N->getMaskElt(8) / 4;
	  unsigned M3 = N->getMaskElt(12) / 4;

	  // If both vector operands for the shuffle are the same vector, the mask will
	  // contain only elements from the first one and the second one will be undef.
	  if (N->getOperand(1).isUndef()) {
	    assert(M0 < 4 && "Indexing into an undef vector?");
	    // Single input: word indices wrap around modulo 4.
	    if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
	      return false;

	    ShiftElts = IsLE ? (4 - M0) % 4 : M0;
	    Swap = false;
	    return true;
	  }

	  // Ensure each word index of the ShuffleVector Mask is consecutive.
	  if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
	    return false;

	  if (IsLE) {
	    if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
	      // Input vectors don't need to be swapped if the leading element
	      // of the result is one of the 3 left elements of the second vector
	      // (or if there is no shift to be done at all).
	      Swap = false;
	      ShiftElts = (8 - M0) % 8;
	    } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
	      // Input vectors need to be swapped if the leading element
	      // of the result is one of the 3 left elements of the first vector
	      // (or if we're shifting by 4 - thereby simply swapping the vectors).
	      Swap = true;
	      ShiftElts = (4 - M0) % 4;
	    }

	    return true;
	  } else { // BE
	    if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
	      // Input vectors don't need to be swapped if the leading element
	      // of the result is one of the 4 elements of the first vector.
	      Swap = false;
	      ShiftElts = M0;
	    } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
	      // Input vectors need to be swapped if the leading element
	      // of the result is one of the 4 elements of the right vector.
	      Swap = true;
	      ShiftElts = M0 - 4;
	    }

	    return true;
	  }
	}

	/// Common predicate behind the isXXBR[HWDQ]ShuffleMask queries.  Accepts the
	/// v16i8 shuffle \p N when isNByteElemShuffleMask(N, Width, -1) holds and the
	/// first mask entry of every \p Width-byte element names that element's
	/// last byte.
	static bool isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
	  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

	  const bool StepsAreMinusOne = isNByteElemShuffleMask(N, Width, -1);
	  if (!StepsAreMinusOne)
	    return false;

	  // Walk the first byte of each element and compare it against the index
	  // of the element's last byte.
	  int EltStart = 0;
	  while (EltStart < 16) {
	    const int LastByteOfElt = EltStart + Width - 1;
	    if (N->getMaskElt(EltStart) != LastByteOfElt)
	      return false;
	    EltStart += Width;
	  }

	  return true;
	}

	/// XXBRH query: run the shared XXBR mask check with 2-byte elements.
	bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
	  const int ElemBytes = 2;
	  return isXXBRShuffleMaskHelper(N, ElemBytes);
	}

	/// XXBRW query: run the shared XXBR mask check with 4-byte elements.
	bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
	  const int ElemBytes = 4;
	  return isXXBRShuffleMaskHelper(N, ElemBytes);
	}

	/// XXBRD query: run the shared XXBR mask check with 8-byte elements.
	bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
	  const int ElemBytes = 8;
	  return isXXBRShuffleMaskHelper(N, ElemBytes);
	}

	/// XXBRQ query: run the shared XXBR mask check with one 16-byte element.
	bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
	  const int ElemBytes = 16;
	  return isXXBRShuffleMaskHelper(N, ElemBytes);
	}

	/// Decide whether shuffle node \p N can be lowered to XXPERMDI.
	///
	/// \p DM and \p Swap are written only when the answer is yes.  \p DM gets
	/// the immediate (0-3).  \p Swap is set to true when the two inputs of the
	/// instruction must be exchanged, which is the case when element 0 of the
	/// result comes from the first input (LE) or from the second input (BE);
	/// otherwise it is set to false.
	/// \return true iff the mask of \p N is an XXPERMDI shuffle mask.
	bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
	                                bool &Swap, bool IsLE) {
	  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

	  // Both doublewords must be made of consecutive byte indices.
	  if (!isNByteElemShuffleMask(N, 8, 1))
	    return false;

	  // Doubleword index selected for each half of the result.
	  unsigned D0 = N->getMaskElt(0) / 8;
	  unsigned D1 = N->getMaskElt(8) / 8;
	  assert(((D0 | D1) < 4) && "A mask element out of bounds?");

	  // The two encodings of the immediate, by endianness.
	  auto EncodeLE = [](unsigned First, unsigned Second) -> unsigned {
	    return (((~Second) & 1) << 1) + ((~First) & 1);
	  };
	  auto EncodeBE = [](unsigned First, unsigned Second) -> unsigned {
	    return (First << 1) + (Second & 1);
	  };

	  // Single-input form: with an undef second operand only doublewords 0 and
	  // 1 (those of the first operand) are acceptable.
	  if (N->getOperand(1).isUndef()) {
	    if ((D0 | D1) >= 2)
	      return false;

	    DM = IsLE ? EncodeLE(D0, D1) : EncodeBE(D0, D1);
	    Swap = false;
	    return true;
	  }

	  // Two-input form: the halves must come from different operands.
	  const bool FirstFromOp1 = D0 > 1;
	  const bool SecondFromOp1 = D1 > 1;
	  if (FirstFromOp1 == SecondFromOp1)
	    return false;

	  // LE needs no swap when element 0 comes from the second operand; BE
	  // needs no swap when it comes from the first one.
	  const bool NeedSwap = (IsLE != FirstFromOp1);
	  if (NeedSwap) {
	    // Renumber the doublewords as seen with the operands exchanged.
	    D0 = (D0 + 2) % 4;
	    D1 = (D1 + 2) % 4;
	  }

	  Swap = NeedSwap;
	  DM = IsLE ? EncodeLE(D0, D1) : EncodeBE(D0, D1);
	  return true;
	}


	/// getVSPLTImmediate - Compute the VSPLT* immediate for \p N, which must be
	/// a VECTOR_SHUFFLE whose mask satisfies isSplatShuffleMask for elements of
	/// \p EltSize bytes.  The element index is taken from mask entry 0 and is
	/// mirrored within the 16-byte vector on little-endian data layouts.
	unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
	                                SelectionDAG &DAG) {
	  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);
	  assert(isSplatShuffleMask(Shuffle, EltSize));

	  const unsigned EltIdx = Shuffle->getMaskElt(0) / EltSize;
	  if (!DAG.getDataLayout().isLittleEndian())
	    return EltIdx;

	  const unsigned NumElts = 16 / EltSize;
	  return NumElts - 1 - EltIdx;
	}

	/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
	/// by using a vspltis[bhw] instruction of the specified element size, return
	/// the constant being splatted. The ByteSize field indicates the number of
	/// bytes of each element [124] -> [bhw].
	///
	/// \param N        node whose operands are inspected; the element size is
	///                 derived as 16 / N->getNumOperands().
	/// \param ByteSize width in bytes of the splat element being asked for.
	/// \param DAG      used to build the i32 target constant that is returned.
	/// \return the splat immediate as an i32 target constant, or an empty
	///         SDValue when no suitable immediate exists.
	SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
	SDValue OpVal(nullptr, 0);

	// If ByteSize of the splat is bigger than the element size of the
	// build_vector, then we have a case where we are checking for a splat where
	// multiple elements of the buildvector are folded together into a single
	// logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
	unsigned EltSize = 16/N->getNumOperands();
	if (EltSize < ByteSize) {
	unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
	SDValue UniquedVals[4];
	assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");

	// See if all of the elements in the buildvector agree across.
	// Operand i is filed under slot i & (Multiple-1), i.e. its position inside
	// the logical splat element (the mask form relies on Multiple being a
	// power of two).
	for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	if (N->getOperand(i).isUndef()) continue;
	// If the element isn't a constant, bail fully out.
	if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();

	if (!UniquedVals[i&(Multiple-1)].getNode())
	UniquedVals[i&(Multiple-1)] = N->getOperand(i);
	else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
	return SDValue(); // no match.
	}

	// Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
	// either constant or undef values that are identical for each chunk. See
	// if these chunks can form into a larger vspltis*.

	// Check to see if all of the leading entries are either 0 or -1. If
	// neither, then this won't fit into the immediate field.
	bool LeadingZero = true;
	bool LeadingOnes = true;
	for (unsigned i = 0; i != Multiple-1; ++i) {
	if (!UniquedVals[i].getNode()) continue; // Must have been undefs.

	LeadingZero &= isNullConstant(UniquedVals[i]);
	LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
	}
	// Finally, check the least significant entry.
	if (LeadingZero) {
	if (!UniquedVals[Multiple-1].getNode())
	return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
	// NOTE(review): the zero-extended 64-bit value is narrowed to int before
	// the "< 16" test, so a value that reads as a negative int would also
	// pass - confirm the operands can never be that wide.
	int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
	if (Val < 16) // 0,0,0,4 -> vspltisw(4)
	return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
	}
	if (LeadingOnes) {
	if (!UniquedVals[Multiple-1].getNode())
	return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
	// NOTE(review): only the lower bound is tested here; a non-negative Val
	// would be returned as well - confirm that is intended.
	int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
	if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
	return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
	}

	return SDValue();
	}

	// Check to see if this buildvec has a single non-undef value in its elements.
	for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	if (N->getOperand(i).isUndef()) continue;
	if (!OpVal.getNode())
	OpVal = N->getOperand(i);
	else if (OpVal != N->getOperand(i))
	return SDValue();
	}

	if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.

	// Fetch the raw bits of the splatted operand.  Value stays 0 when the
	// operand is neither an integer nor an FP constant, and is then rejected by
	// the zero test further down.
	unsigned ValSizeInBytes = EltSize;
	uint64_t Value = 0;
	if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
	Value = CN->getZExtValue();
	} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
	assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
	Value = FloatToBits(CN->getValueAPF().convertToFloat());
	}

	// If the splat value is larger than the element value, then we can never do
	// this splat. The only case that we could fit the replicated bits into our
	// immediate field for would be zero, and we prefer to use vxor for it.
	if (ValSizeInBytes < ByteSize) return SDValue();

	// If the element value is larger than the splat value, check if it consists
	// of a repeated bit pattern of size ByteSize.
	if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
	return SDValue();

	// Properly sign extend the value.
	int MaskVal = SignExtend32(Value, ByteSize * 8);

	// If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
	if (MaskVal == 0) return SDValue();

	// Finally, if this value fits in a 5 bit sext field, return it
	if (SignExtend32<5>(MaskVal) == MaskVal)
	return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
	return SDValue();
	}

	/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
	/// amount, otherwise return -1.
	///
	/// Only v4f64, v4f32 and v4i1 results are considered.  Negative mask entries
	/// are skipped when looking for the first element; the remaining entries are
	/// then checked with isConstantOrUndef against the consecutive numbering
	/// implied by that first element.
	int PPC::isQVALIGNIShuffleMask(SDNode *N) {
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
	    return -1;

	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

	  // Find the first non-undef value in the shuffle mask.  The loop has an
	  // intentionally empty body; all the work happens in its header.
	  unsigned i;
	  for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
	    /*search*/;

	  if (i == 4) return -1;  // all undef.

	  // Otherwise, check to see if the rest of the elements are consecutively
	  // numbered from this value.  An element smaller than its own position
	  // cannot be the result of a shift, so reject it.
	  unsigned ShiftAmt = SVOp->getMaskElt(i);
	  if (ShiftAmt < i) return -1;
	  ShiftAmt -= i;

	  // Check the rest of the elements to see if they are consecutive.
	  for (++i; i != 4; ++i)
	    if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
	      return -1;

	  return ShiftAmt;
	}

	//===----------------------------------------------------------------------===//
	// Addressing Mode Selection
	//===----------------------------------------------------------------------===//

	/// isIntS16Immediate - Check whether \p N is a constant node whose value
	/// survives a round trip through a signed 16-bit integer.  The comparison
	/// is done at 32 bits when the node's type is i32 and at 64 bits for every
	/// other type.  \p Imm receives the truncated value whenever \p N is a
	/// constant, even if the function then returns false.
	bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
	  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
	  if (!CN)
	    return false;

	  const uint64_t Raw = CN->getZExtValue();
	  Imm = (int16_t)Raw;

	  const bool Is32Bit = N->getValueType(0) == MVT::i32;
	  if (Is32Bit)
	    return Imm == (int32_t)Raw;
	  return Imm == (int64_t)Raw;
	}
	/// SDValue convenience overload: forwards the node held by \p Op to the
	/// SDNode version.
	bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
	  SDNode *Node = Op.getNode();
	  return isIntS16Immediate(Node, Imm);
	}

	/// SelectAddressRegReg - Try to express address \p N as an indexed [r+r]
	/// operation.  On success \p Base and \p Index receive the two operands of
	/// the ADD/OR and true is returned.  Returns false, leaving both outputs
	/// untouched, when \p N is not an ADD/OR or when the [r+imm] form is the
	/// better match.
	bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
	                                            SDValue &Index,
	                                            SelectionDAG &DAG) const {
	  const unsigned Opc = N.getOpcode();
	  if (Opc != ISD::ADD && Opc != ISD::OR)
	    return false;

	  SDValue LHS = N.getOperand(0);
	  SDValue RHS = N.getOperand(1);

	  // A signed 16-bit right-hand side is left for r+i to fold.
	  int16_t Ignored = 0;
	  if (isIntS16Immediate(RHS, Ignored))
	    return false;

	  if (Opc == ISD::ADD) {
	    // Adding a PPCISD::Lo is likewise an r+i pattern.
	    if (RHS.getOpcode() == PPCISD::Lo)
	      return false;
	  } else {
	    // An OR may be treated as an ADD (for better address arithmetic) only
	    // when both sides are provably disjoint: every bit has to be known
	    // zero on at least one side, so that the add cannot carry.
	    KnownBits LHSKnown;
	    DAG.computeKnownBits(LHS, LHSKnown);
	    if (!LHSKnown.Zero.getBoolValue())
	      return false;

	    KnownBits RHSKnown;
	    DAG.computeKnownBits(RHS, RHSKnown);
	    if (!(~(LHSKnown.Zero | RHSKnown.Zero) == 0))
	      return false;
	  }

	  Base = LHS;
	  Index = RHS;
	  return true;
	}

	// An i64 load or store into a stack slot aligned to fewer than 4 bytes may
	// force frame-index elimination to pick an indexed instruction, because the
	// offset need not be a multiple of 4.  The register holding that offset
	// comes from the register scavenger, which may in turn need an emergency
	// spill slot.  This helper flags the function (setHasNonRISpills) so that
	// such a slot is allocated.
	static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
	  // FIXME: This does not handle the LWA case.
	  const bool IsI64Access = (VT == MVT::i64);
	  if (!IsI64Access)
	    return;

	  // NOTE: Negative frame indices, which come from argument lowering, are
	  // deliberately excluded: no known test case triggers the problem through
	  // packed structures (or similar), and the exclusion can be dropped once
	  // one is found.  The whole fixup is test-case driven because it exists
	  // only to prevent register-scavenger crashes on not-really-valid inputs
	  // such as:
	  //   %a = alloca i1
	  //   %b = bitcast i1* %a to i64*
	  //   store i64* a, i64 b
	  // Here the store should be marked 'align 1' but is not.  Had it been,
	  // the indexed form would have been selected from the start and there
	  // would be nothing to fix up.
	  if (FrameIdx < 0)
	    return;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const unsigned SlotAlign = MF.getFrameInfo().getObjectAlignment(FrameIdx);
	  if (SlotAlign < 4)
	    MF.getInfo<PPCFunctionInfo>()->setHasNonRISpills();
	}

	/// Returns true if the address N can be represented by a base register plus
	/// a signed 16-bit displacement [r+imm], and if it is not better
	/// represented as reg+reg. If \p Alignment is non-zero, only accept
	/// displacements that are multiples of that value.
	///
	/// \param N         the address to decompose.
	/// \param Disp      receives the displacement (a target constant, or the
	///                  target symbol node in the PPCISD::Lo case).
	/// \param Base      receives the base (a register value, a target frame
	///                  index, or a LIS/LIS8 machine node).
	/// \param DAG       used to create the new nodes.
	/// \param Alignment required divisor of the displacement, 0 for "any".
	///
	/// When no specific pattern matches, the function falls back to [r+0] and
	/// still returns true; false is returned only when the reg+reg form wins.
	bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
	SDValue &Base,
	SelectionDAG &DAG,
	unsigned Alignment) const {
	// FIXME dl should come from parent load or store, not from address
	SDLoc dl(N);
	// If this can be more profitably realized as r+r, fail.
	// Note that this probe writes Disp and Base when it succeeds, so both may
	// have been modified even though false is returned here.
	if (SelectAddressRegReg(N, Disp, Base, DAG))
	return false;

	if (N.getOpcode() == ISD::ADD) {
	int16_t imm = 0;
	if (isIntS16Immediate(N.getOperand(1), imm) &&
	(!Alignment \|\| (imm % Alignment) == 0)) {
	Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
	if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
	Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
	// Under-aligned i64 stack accesses need the function to be flagged.
	fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
	} else {
	Base = N.getOperand(0);
	}
	return true; // [r+i]
	} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
	// Match LOAD (ADD (X, Lo(G))).
	assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
	&& "Cannot handle constant offsets yet!");
	Disp = N.getOperand(1).getOperand(0); // The global address.
	assert(Disp.getOpcode() == ISD::TargetGlobalAddress \|\|
	Disp.getOpcode() == ISD::TargetGlobalTLSAddress \|\|
	Disp.getOpcode() == ISD::TargetConstantPool \|\|
	Disp.getOpcode() == ISD::TargetJumpTable);
	Base = N.getOperand(0);
	return true; // [&g+r]
	}
	} else if (N.getOpcode() == ISD::OR) {
	int16_t imm = 0;
	if (isIntS16Immediate(N.getOperand(1), imm) &&
	(!Alignment \|\| (imm % Alignment) == 0)) {
	// If this is an or of disjoint bitfields, we can codegen this as an add
	// (for better address arithmetic) if the LHS and RHS of the OR are
	// provably disjoint.
	KnownBits LHSKnown;
	DAG.computeKnownBits(N.getOperand(0), LHSKnown);

	// Every bit set in imm must be known zero on the LHS.
	if ((LHSKnown.Zero.getZExtValue()\|~(uint64_t)imm) == ~0ULL) {
	// If all of the bits are known zero on the LHS or RHS, the add won't
	// carry.
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
	Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
	fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
	} else {
	Base = N.getOperand(0);
	}
	Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
	return true;
	}
	}
	} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
	// Loading from a constant address.

	// If this address fits entirely in a 16-bit sext immediate field, codegen
	// this as "d, 0"
	int16_t Imm;
	if (isIntS16Immediate(CN, Imm) && (!Alignment \|\| (Imm % Alignment) == 0)) {
	Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
	Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
	CN->getValueType(0));
	return true;
	}

	// Handle 32-bit sext immediates with LIS + addr mode.
	if ((CN->getValueType(0) == MVT::i32 \|\|
	(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
	(!Alignment \|\| (CN->getZExtValue() % Alignment) == 0)) {
	int Addr = (int)CN->getZExtValue();

	// Otherwise, break this down into an LIS + disp.
	// The low 16 bits become the (sign-extended) displacement ...
	Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);

	// ... and the high part is adjusted by that signed displacement before
	// being materialized with LIS/LIS8.
	Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
	MVT::i32);
	unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
	Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
	return true;
	}
	}

	// Fallback: use the whole address as the base with a zero displacement.
	Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
	if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
	Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
	fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
	} else
	Base = N;
	return true; // [r+0]
	}

	/// SelectAddressRegRegOnly - Force address \p N into an indexed [r+r]
	/// operation.  Always returns true; \p Base and \p Index are always set.
	bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
	                                                SDValue &Index,
	                                                SelectionDAG &DAG) const {
	  // First choice: the ordinary [r+r] matcher.  It declines addresses that
	  // it considers more profitable as reg+imm, e.g. where imm = 0.
	  if (SelectAddressRegReg(N, Base, Index, DAG))
	    return true;

	  // The address computation carries an implicit add, so an explicit ADD
	  // can be folded into it.  The exception is an add of a value and a
	  // signed 16-bit constant where both have a single use: keeping that add
	  // avoids materializing a constant just to serve as the index register,
	  // which reduces register pressure.
	  if (N.getOpcode() == ISD::ADD) {
	    SDValue LHS = N.getOperand(0);
	    SDValue RHS = N.getOperand(1);
	    int16_t Ignored = 0;
	    const bool KeepAdd = isIntS16Immediate(RHS, Ignored) &&
	                         RHS.hasOneUse() && LHS.hasOneUse();
	    if (!KeepAdd) {
	      Base = LHS;
	      Index = RHS;
	      return true;
	    }
	  }

	  // Otherwise, do it the hard way, using R0 as the base register.
	  const unsigned ZeroReg = Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO;
	  Base = DAG.getRegister(ZeroReg, N.getValueType());
	  Index = N;
	  return true;
	}

	/// getPreIndexedAddressParts - returns true by value, base pointer and
	/// offset pointer and addressing mode by reference if the node's address
	/// can be legally represented as pre-indexed load / store address.
	///
	/// \param N      the load or store node; anything else yields false.
	/// \param Base   receives the base pointer.
	/// \param Offset receives the offset (a register value or a constant).
	/// \param AM     set to ISD::PRE_INC on every successful path.
	/// \param DAG    passed on to the address selectors.
	bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	SDValue &Offset,
	ISD::MemIndexedMode &AM,
	SelectionDAG &DAG) const {
	// Pre-increment forms can be switched off entirely.
	if (DisablePPCPreinc) return false;

	// Pull pointer, memory type and alignment out of the load or store.
	bool isLoad = true;
	SDValue Ptr;
	EVT VT;
	unsigned Alignment;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	Ptr = LD->getBasePtr();
	VT = LD->getMemoryVT();
	Alignment = LD->getAlignment();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	Ptr = ST->getBasePtr();
	VT = ST->getMemoryVT();
	Alignment = ST->getAlignment();
	isLoad = false;
	} else
	return false;

	// PowerPC doesn't have preinc load/store instructions for vectors (except
	// for QPX, which does have preinc r+r forms).
	if (VT.isVector()) {
	if (!Subtarget.hasQPX() \|\| (VT != MVT::v4f64 && VT != MVT::v4f32)) {
	return false;
	// NOTE(review): Offset is passed in the Base slot and Base in the Index
	// slot of SelectAddressRegRegOnly here - presumably intentional, confirm.
	} else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
	AM = ISD::PRE_INC;
	return true;
	}
	}

	if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
	// Common code will reject creating a pre-inc form if the base pointer
	// is a frame index, or if N is a store and the base pointer is either
	// the same as or a predecessor of the value being stored. Check for
	// those situations here, and try with swapped Base/Offset instead.
	bool Swap = false;

	if (isa<FrameIndexSDNode>(Base) \|\| isa<RegisterSDNode>(Base))
	Swap = true;
	else if (!isLoad) {
	SDValue Val = cast<StoreSDNode>(N)->getValue();
	if (Val == Base \|\| Base.getNode()->isPredecessorOf(Val.getNode()))
	Swap = true;
	}

	if (Swap)
	std::swap(Base, Offset);

	AM = ISD::PRE_INC;
	return true;
	}

	// Fall back to the [r+imm] form.  Only i64 accesses carry the
	// multiple-of-4 restriction on the displacement; every other type accepts
	// any displacement (Alignment argument 0).
	if (VT != MVT::i64) {
	if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
	return false;
	} else {
	// LDU/STU need an address with at least 4-byte alignment.
	if (Alignment < 4)
	return false;

	// LDU/STU can only handle immediates that are a multiple of 4.
	if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
	return false;
	}

	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
	// sext i32 to i64 when addr mode is r+i.
	if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
	LD->getExtensionType() == ISD::SEXTLOAD &&
	isa<ConstantSDNode>(Offset))
	return false;
	}

	AM = ISD::PRE_INC;
	return true;
	}

	//===----------------------------------------------------------------------===//
	// LowerOperation implementation
	//===----------------------------------------------------------------------===//

	/// Compute the target MO flags for the high and low halves of a label
	/// reference and store them in \p HiOpFlags and \p LoOpFlags.  Both start
	/// from MO_HA / MO_LO.  MO_PIC_FLAG is added when \p IsPIC is set.  When
	/// \p GV is given and the subtarget reports a lazy resolver stub for it,
	/// MO_NLP_FLAG is added, plus MO_NLP_HIDDEN_FLAG for hidden visibility.
	/// Nothing is returned.
	static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
	                               unsigned &HiOpFlags, unsigned &LoOpFlags,
	                               const GlobalValue *GV = nullptr) {
	  // Modifier bits shared by both halves.
	  unsigned Shared = 0;

	  // The pic base is only used in the PIC relocation model.
	  if (IsPIC)
	    Shared |= PPCII::MO_PIC_FLAG;

	  // A global that requires a non-lazy-ptr must be marked so that
	  // instruction lowering adds it.
	  if (GV && Subtarget.hasLazyResolverStub(GV)) {
	    Shared |= PPCII::MO_NLP_FLAG;
	    if (GV->hasHiddenVisibility())
	      Shared |= PPCII::MO_NLP_HIDDEN_FLAG;
	  }

	  HiOpFlags = PPCII::MO_HA | Shared;
	  LoOpFlags = PPCII::MO_LO | Shared;
	}

	/// Build the address of a label as hi(&label) + lo(&label) from the two
	/// target nodes \p HiPart and \p LoPart.  With \p isPIC the high half is
	/// additionally offset by the global base register.
	static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
	                             SelectionDAG &DAG) {
	  const EVT PtrVT = HiPart.getValueType();
	  SDLoc DL(HiPart);

	  SDValue Zero = DAG.getConstant(0, DL, PtrVT);
	  SDValue HighHalf = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
	  SDValue LowHalf = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);

	  // With PIC, the first instruction is actually "GR+hi(&G)".
	  if (isPIC) {
	    SDValue GlobalBase = DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT);
	    HighHalf = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HighHalf);
	  }

	  // Non-PIC code accesses the label directly; its address is just
	  // (hi(&g)+lo(&g)).
	  return DAG.getNode(ISD::ADD, DL, PtrVT, HighHalf, LowHalf);
	}

	/// Record on the PPCFunctionInfo of \p MF that the TOC base pointer is used.
	static void setUsesTOCBasePtr(MachineFunction &MF) {
	  MF.getInfo<PPCFunctionInfo>()->setUsesTOCBasePtr();
	}

	/// Convenience overload: apply the flag to the machine function that
	/// \p DAG is building.
	static void setUsesTOCBasePtr(SelectionDAG &DAG) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  setUsesTOCBasePtr(MF);
	}

	/// Create a PPCISD::TOC_ENTRY memory-intrinsic node that loads the entry
	/// for \p GA.  The second operand is register X2 when \p Is64Bit is set and
	/// a GlobalBaseReg node otherwise; the result type is i64 or i32
	/// accordingly.  The access is described as a load from the GOT.
	static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
	                           SDValue GA) {
	  const EVT VT = Is64Bit ? MVT::i64 : MVT::i32;

	  SDValue BaseReg;
	  if (Is64Bit)
	    BaseReg = DAG.getRegister(PPC::X2, VT);
	  else
	    BaseReg = DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);

	  SDValue Ops[] = { GA, BaseReg };
	  return DAG.getMemIntrinsicNode(
	      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
	      MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
	      MachineMemOperand::MOLoad);
	}

	/// Lower a ConstantPool node to the address of its constant.  Three
	/// strategies are used: a TOC entry on 64-bit SVR4, a TOC-style entry on
	/// PIC 32-bit SVR4, and a hi/lo label reference everywhere else.
	SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
	  const Constant *C = CP->getConstVal();
	  const EVT PtrVT = Op.getValueType();

	  // 64-bit SVR4 ABI code is always position-independent.
	  // The actual address of the GlobalValue is stored in the TOC.
	  const bool Is64BitSVR4 = Subtarget.isSVR4ABI() && Subtarget.isPPC64();
	  if (Is64BitSVR4) {
	    setUsesTOCBasePtr(DAG);
	    SDValue Entry =
	        DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
	    return getTOCEntry(DAG, SDLoc(CP), true, Entry);
	  }

	  const bool IsPIC = isPositionIndependent();
	  unsigned HiFlag, LoFlag;
	  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag);

	  if (IsPIC && Subtarget.isSVR4ABI()) {
	    // NOTE(review): MO_PIC_FLAG sits in the fourth argument here, the same
	    // slot that receives 0 in the other calls of this function, while the
	    // hi/lo calls below pass their flags in a fifth argument - confirm
	    // this is intended.
	    SDValue Entry = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
	                                              PPCII::MO_PIC_FLAG);
	    return getTOCEntry(DAG, SDLoc(CP), false, Entry);
	  }

	  // Default: reference the constant through a hi/lo pair.
	  SDValue HighRef =
	      DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, HiFlag);
	  SDValue LowRef =
	      DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, LoFlag);
	  return LowerLabelRef(HighRef, LowRef, IsPIC, DAG);
	}

	// Jump tables that are relative (see isJumpTableRelative) use the more
	// compact 32-bit label-difference encoding.  That trades 32 bits per jump
	// table entry for one or two instructions at the jump site.  All other
	// cases defer to the generic TargetLowering choice.
	unsigned PPCTargetLowering::getJumpTableEncoding() const {
	  if (!isJumpTableRelative())
	    return TargetLowering::getJumpTableEncoding();

	  return MachineJumpTableInfo::EK_LabelDifference32;
	}

	/// Jump tables are always relative on PPC64; other subtargets follow the
	/// generic TargetLowering answer.
	bool PPCTargetLowering::isJumpTableRelative() const {
	  return Subtarget.isPPC64() || TargetLowering::isJumpTableRelative();
	}

	/// Pick the node against which PIC jump table entries are relocated.  A
	/// GlobalBaseReg node is used on PPC64 for every code model other than
	/// Small and Medium; all remaining cases use the generic TargetLowering
	/// base.
	SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  bool UseGlobalBase = false;
	  if (Subtarget.isPPC64()) {
	    const auto CM = getTargetMachine().getCodeModel();
	    UseGlobalBase = CM != CodeModel::Small && CM != CodeModel::Medium;
	  }

	  if (!UseGlobalBase)
	    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);

	  return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
	                     getPointerTy(DAG.getDataLayout()));
	}

	/// MC-level counterpart of getPICJumpTableRelocBase.  A reference to the
	/// function's PIC base symbol is returned on PPC64 for every code model
	/// other than Small and Medium; all remaining cases use the generic
	/// TargetLowering expression.
	const MCExpr *
	PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
	                                                unsigned JTI,
	                                                MCContext &Ctx) const {
	  bool UsePICBaseSymbol = false;
	  if (Subtarget.isPPC64()) {
	    const auto CM = getTargetMachine().getCodeModel();
	    UsePICBaseSymbol = CM != CodeModel::Small && CM != CodeModel::Medium;
	  }

	  if (!UsePICBaseSymbol)
	    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

	  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
	}

	/// Lower a JumpTable node to the address of the table.  Three strategies
	/// are used: a TOC entry on 64-bit SVR4, a TOC-style entry on PIC 32-bit
	/// SVR4, and a hi/lo label reference everywhere else.
	SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
	  const EVT PtrVT = Op.getValueType();

	  // 64-bit SVR4 ABI code is always position-independent.
	  // The actual address of the GlobalValue is stored in the TOC.
	  const bool Is64BitSVR4 = Subtarget.isSVR4ABI() && Subtarget.isPPC64();
	  if (Is64BitSVR4) {
	    setUsesTOCBasePtr(DAG);
	    SDValue Entry = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
	    return getTOCEntry(DAG, SDLoc(JT), true, Entry);
	  }

	  const bool IsPIC = isPositionIndependent();
	  unsigned HiFlag, LoFlag;
	  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag);

	  if (IsPIC && Subtarget.isSVR4ABI()) {
	    SDValue Entry = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
	                                           PPCII::MO_PIC_FLAG);
	    // The location is taken from the freshly created target node here.
	    return getTOCEntry(DAG, SDLoc(Entry), false, Entry);
	  }

	  // Default: reference the table through a hi/lo pair.
	  SDValue HighRef = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, HiFlag);
	  SDValue LowRef = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, LoFlag);
	  return LowerLabelRef(HighRef, LowRef, IsPIC, DAG);
	}

	/// Lower a BlockAddress node: a TOC entry on 64-bit SVR4, a hi/lo label
	/// reference everywhere else.
	SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
	  const BlockAddress *BA = BASDN->getBlockAddress();
	  const EVT PtrVT = Op.getValueType();

	  // 64-bit SVR4 ABI code is always position-independent.
	  // The actual BlockAddress is stored in the TOC.
	  const bool Is64BitSVR4 = Subtarget.isSVR4ABI() && Subtarget.isPPC64();
	  if (Is64BitSVR4) {
	    setUsesTOCBasePtr(DAG);
	    SDValue Entry = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
	    return getTOCEntry(DAG, SDLoc(BASDN), true, Entry);
	  }

	  const bool IsPIC = isPositionIndependent();
	  unsigned HiFlag, LoFlag;
	  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag);

	  // NOTE(review): unlike the TOC path above, these references pass an
	  // offset of 0 rather than BASDN->getOffset() - confirm intended.
	  SDValue HighRef = DAG.getTargetBlockAddress(BA, PtrVT, 0, HiFlag);
	  SDValue LowRef = DAG.getTargetBlockAddress(BA, PtrVT, 0, LoFlag);
	  return LowerLabelRef(HighRef, LowRef, IsPIC, DAG);
	}

	/// Lower the address of a thread-local global.  With emulated TLS enabled
	/// the work is handed to LowerToTLSEmulatedModel; otherwise one node
	/// sequence is built per TLS model (local-exec, initial-exec,
	/// general-dynamic, local-dynamic) as chosen by the target machine for the
	/// global.  Any other model is unreachable.
	SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
	SelectionDAG &DAG) const {
	// FIXME: TLS addresses currently use medium model code sequences,
	// which is the most useful form. Eventually support for small and
	// large models could be added if users need it, at the cost of
	// additional complexity.
	GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	if (DAG.getTarget().Options.EmulatedTLS)
	return LowerToTLSEmulatedModel(GA, DAG);

	SDLoc dl(GA);
	const GlobalValue *GV = GA->getGlobal();
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	bool is64bit = Subtarget.isPPC64();
	// The module's PIC level picks the 32-bit GOT pointer node in the dynamic
	// models below.
	const Module *M = DAG.getMachineFunction().getFunction().getParent();
	PICLevel::Level picLevel = M->getPICLevel();

	TLSModel::Model Model = getTargetMachine().getTLSModel(GV);

	// Local-exec: a Hi/Lo pair with TPREL flags, based on the TLS register
	// (X13 on 64-bit, R2 on 32-bit).
	if (Model == TLSModel::LocalExec) {
	SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
	PPCII::MO_TPREL_HA);
	SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
	PPCII::MO_TPREL_LO);
	SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
	: DAG.getRegister(PPC::R2, MVT::i32);

	SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
	return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
	}

	// Initial-exec: LD_GOT_TPREL_L yields the offset through a GOT pointer,
	// and ADD_TLS combines it with the MO_TLS-flagged address.
	if (Model == TLSModel::InitialExec) {
	SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
	SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
	PPCII::MO_TLS);
	SDValue GOTPtr;
	if (is64bit) {
	setUsesTOCBasePtr(DAG);
	SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
	GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
	PtrVT, GOTReg, TGA);
	} else
	GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
	SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
	PtrVT, TGA, GOTPtr);
	return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
	}

	// General-dynamic: a single ADDI_TLSGD_L_ADDR node on top of the GOT
	// pointer.
	if (Model == TLSModel::GeneralDynamic) {
	SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
	SDValue GOTPtr;
	if (is64bit) {
	setUsesTOCBasePtr(DAG);
	SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
	GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
	GOTReg, TGA);
	} else {
	if (picLevel == PICLevel::SmallPIC)
	GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
	else
	GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
	}
	return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
	GOTPtr, TGA, TGA);
	}

	// Local-dynamic: ADDI_TLSLD_L_ADDR on top of the GOT pointer, followed by
	// the DTPREL high/low adjustment for this variable.
	if (Model == TLSModel::LocalDynamic) {
	SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
	SDValue GOTPtr;
	if (is64bit) {
	setUsesTOCBasePtr(DAG);
	SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
	GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
	GOTReg, TGA);
	} else {
	if (picLevel == PICLevel::SmallPIC)
	GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
	else
	GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
	}
	SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
	PtrVT, GOTPtr, TGA, TGA);
	SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
	PtrVT, TLSAddr, TGA);
	return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
	}

	llvm_unreachable("Unknown TLS model!");
	}

	/// Lower a GlobalAddress node.  Three strategies are used: a TOC entry on
	/// 64-bit SVR4, a TOC-style entry on PIC 32-bit SVR4, and a hi/lo label
	/// reference everywhere else.  In the last case an extra load is emitted
	/// when the reference goes through a non-lazy pointer.
	SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GSDN->getGlobal();
	  const EVT PtrVT = Op.getValueType();
	  SDLoc DL(GSDN);

	  // 64-bit SVR4 ABI code is always position-independent.
	  // The actual address of the GlobalValue is stored in the TOC.
	  const bool Is64BitSVR4 = Subtarget.isSVR4ABI() && Subtarget.isPPC64();
	  if (Is64BitSVR4) {
	    setUsesTOCBasePtr(DAG);
	    SDValue Entry =
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
	    return getTOCEntry(DAG, DL, true, Entry);
	  }

	  const bool IsPIC = isPositionIndependent();
	  unsigned HiFlag, LoFlag;
	  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag, GV);

	  if (IsPIC && Subtarget.isSVR4ABI()) {
	    SDValue Entry = DAG.getTargetGlobalAddress(
	        GV, DL, PtrVT, GSDN->getOffset(), PPCII::MO_PIC_FLAG);
	    return getTOCEntry(DAG, DL, false, Entry);
	  }

	  // Default: reference the global through a hi/lo pair.
	  SDValue HighRef =
	      DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), HiFlag);
	  SDValue LowRef =
	      DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), LoFlag);
	  SDValue Addr = LowerLabelRef(HighRef, LowRef, IsPIC, DAG);

	  // A reference that actually targets a non-lazy-pointer yields the
	  // pointer's location; one more load fetches the address of the global.
	  const bool ViaNonLazyPtr = (HiFlag & PPCII::MO_NLP_FLAG) != 0;
	  if (!ViaNonLazyPtr)
	    return Addr;

	  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
	                     MachinePointerInfo());
	}

	/// Custom-lower a SETCC node.
	///
	/// Returns the lowered value, \p Op itself when the node should be kept as
	/// is, or an empty SDValue when no custom lowering is performed here.
	///
	///  * v2i64 result with v2i64 operands: SETEQ/SETNE are done as a v4i32
	///    comparison bitcast back to v2i64; any other predicate yields an empty
	///    SDValue.
	///  * v2i64 result with other operand types: \p Op is returned unchanged.
	///  * Scalar integer SETEQ/SETNE against a constant other than 0 or -1:
	///    rewritten as (lhs xor rhs) compared against zero.
	SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDLoc dl(Op);
	  const bool IsEqualityCC = (CC == ISD::SETEQ || CC == ISD::SETNE);

	  if (Op.getValueType() == MVT::v2i64) {
	    // We handle most of these in the usual way.
	    if (Op.getOperand(0).getValueType() != MVT::v2i64)
	      return Op;

	    // When the operands themselves are v2i64 values something special is
	    // required, because VSX has no underlying comparison operations for
	    // them. Only equality can be handled (by casting to the legal type for
	    // Altivec comparisons); everything else needs to be expanded.
	    if (!IsEqualityCC)
	      return SDValue();

	    SDValue Cmp = DAG.getSetCC(
	        dl, MVT::v4i32,
	        DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
	        DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), CC);
	    return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Cmp);
	  }

	  // If we're comparing for equality to zero, expose the fact that this is
	  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
	  // fold the new nodes.
	  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
	    return V;

	  // Leave comparisons against 0 and -1 alone for now, since they're usually
	  // optimized. FIXME: revisit this when we can custom lower all setcc
	  // optimizations.
	  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	    if (C->isAllOnesValue() || C->isNullValue())
	      return SDValue();
	  }

	  // An integer seteq/setne becomes a compare against zero by xor'ing the rhs
	  // with the lhs, which is faster than setting a condition register, reading
	  // it back out, and masking the correct bit. The normal approach uses sub
	  // instead of xor; xor exposes the result to other bit-twiddling
	  // opportunities.
	  EVT LHSVT = Op.getOperand(0).getValueType();
	  if (!LHSVT.isInteger() || !IsEqualityCC)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  SDValue Xor =
	      DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), Op.getOperand(1));
	  return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, LHSVT), CC);
	}

	/// Lower a VAARG node for 32-bit PowerPC (asserts that the subtarget is not
	/// PPC64).
	///
	/// The va_list is accessed at fixed byte offsets from its pointer: the GPR
	/// index byte at 0, the FPR index byte at 1, the overflow-area pointer at 4
	/// and the register-save-area pointer at 8. The argument is loaded from the
	/// register save area while the relevant index is below 8, and from the
	/// overflow area otherwise; the index and overflow pointer are written back.
	SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  SDNode *Node = Op.getNode();
	  EVT VT = Node->getValueType(0);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue InChain = Node->getOperand(0);
	  SDValue VAListPtr = Node->getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
	  SDLoc dl(Node);

	  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");

	  // Shorthand for building an i32 constant node.
	  auto I32 = [&DAG, &dl](uint64_t Imm) {
	    return DAG.getConstant(Imm, dl, MVT::i32);
	  };
	  const bool IsIntVT = VT.isInteger();

	  // Load gpr_index (first byte of the va_list).
	  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
	                                    VAListPtr, MachinePointerInfo(SV), MVT::i8);
	  InChain = GprIndex.getValue(1);

	  if (VT == MVT::i64) {
	    // An i64 starts at an even GPR index: bump an odd index by one.
	    SDValue LowBit = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, I32(1));
	    SDValue IsOdd = DAG.getSetCC(dl, MVT::i32, LowBit, I32(0), ISD::SETNE);
	    SDValue NextEven = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, I32(1));
	    GprIndex =
	        DAG.getNode(ISD::SELECT, dl, MVT::i32, IsOdd, NextEven, GprIndex);
	  }

	  // fpr_index sits one byte after gpr_index.
	  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, I32(1));
	  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
	                                    FprPtr, MachinePointerInfo(SV), MVT::i8);
	  InChain = FprIndex.getValue(1);

	  // Addresses of the two area pointers inside the va_list.
	  SDValue RegSaveAreaPtr =
	      DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, I32(8));
	  SDValue OverflowAreaPtr =
	      DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, I32(4));

	  // Load both area pointers, threading the chain through each load.
	  SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
	                                     MachinePointerInfo());
	  InChain = OverflowArea.getValue(1);

	  SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
	                                    MachinePointerInfo());
	  InChain = RegSaveArea.getValue(1);

	  // From here on only the index matching the argument class matters.
	  SDValue RegIndex = IsIntVT ? GprIndex : FprIndex;
	  const unsigned SlotBytes = IsIntVT ? 4 : 8;

	  // True while the argument still comes from the register save area
	  // (index < 8); false selects the overflow area.
	  SDValue CC = DAG.getSetCC(dl, MVT::i32, RegIndex, I32(8), ISD::SETLT);

	  // Byte offset of the slot within its register class: index * 4 or * 8.
	  SDValue RegConstant =
	      DAG.getNode(ISD::MUL, dl, MVT::i32, RegIndex, I32(SlotBytes));

	  // OurReg = RegSaveArea + RegConstant
	  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, RegConstant);

	  // Floating types are 32 bytes into RegSaveArea.
	  if (VT.isFloatingPoint())
	    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, I32(32));

	  // Advance the index by 1, or by 2 when VT is i64, and store it back into
	  // the byte it was read from.
	  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, RegIndex,
	                                   I32(VT == MVT::i64 ? 2 : 1));
	  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
	                              IsIntVT ? VAListPtr : FprPtr,
	                              MachinePointerInfo(SV), MVT::i8);

	  // Pick the address the argument is actually loaded from.
	  SDValue Result =
	      DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);

	  // When the overflow area was used, step it past this argument (4 or 8
	  // bytes); otherwise keep it unchanged. Then write it back.
	  SDValue OverflowAreaPlusN =
	      DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, I32(SlotBytes));
	  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
	                             OverflowAreaPlusN);
	  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
	                              MachinePointerInfo(), MVT::i32);

	  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
	}

	/// Lower a VACOPY node for 32-bit PowerPC (asserts that the subtarget is not
	/// PPC64) as a fixed-size 12-byte memcpy.
	SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
	  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue FirstPtr = Op.getOperand(1);
	  SDValue SecondPtr = Op.getOperand(2);

	  // The entire va_list struct has to be copied:
	  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
	  SDValue StructSize = DAG.getConstant(12, dl, MVT::i32);

	  return DAG.getMemcpy(Chain, dl, FirstPtr, SecondPtr, StructSize, 8, false,
	                       true, false, MachinePointerInfo(),
	                       MachinePointerInfo());
	}

	/// Lower an ADJUST_TRAMPOLINE node: no adjustment is made, the node's first
	/// operand is returned as the result. \p DAG is unused.
	SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SDValue FirstOperand = Op.getOperand(0);
	  return FirstOperand;
	}

	/// Lower an INIT_TRAMPOLINE node to a call to
	/// __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg).
	///
	/// TrampSize is 48 when the pointer type is i64 and 40 otherwise. All four
	/// arguments are passed with the pointer-sized integer IR type. The chain
	/// produced by the call is returned.
	SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Trmp = Op.getOperand(1); // trampoline
	  SDValue FPtr = Op.getOperand(2); // nested function
	  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	  SDLoc dl(Op);

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  bool isPPC64 = (PtrVT == MVT::i64);
	  // IntPtrTy is a pointer to an IR type; the context is dereferenced here
	  // exactly as it is for Type::getVoidTy below.
	  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  // Every argument shares the same IR type, so one entry is reused and only
	  // its node is swapped before each push.
	  Entry.Ty = IntPtrTy;
	  Entry.Node = Trmp;
	  Args.push_back(Entry);

	  // TrampSize == (isPPC64 ? 48 : 40);
	  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
	                               isPPC64 ? MVT::i64 : MVT::i32);
	  Args.push_back(Entry);

	  Entry.Node = FPtr;
	  Args.push_back(Entry);
	  Entry.Node = Nest;
	  Args.push_back(Entry);

	  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
	      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
	      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));

	  // The callee returns void, so only the chain half of the result is used.
	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
	  return CallResult.second;
	}

	/// Lower a VASTART node.
	///
	/// For the Darwin ABI and for PPC64 the address of the VarArgsFrameIndex slot
	/// is stored into the va_list. For the 32-bit SVR4 ABI the four va_list
	/// fields are initialized by a chain of stores; the chain of the last store
	/// is returned.
	SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
	  EVT PtrVT = getPointerTy(MF.getDataLayout());

	  SDLoc dl(Op);

	  SDValue Chain = Op.getOperand(0);
	  SDValue VAListPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

	  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
	    // vastart just stores the address of the VarArgsFrameIndex slot into the
	    // memory location argument.
	    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	    return DAG.getStore(Chain, dl, FR, VAListPtr, MachinePointerInfo(SV));
	  }

	  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
	  // We suppose the given va_list is already allocated.
	  //
	  // typedef struct {
	  //  char gpr;     /* index into the array of 8 GPRs
	  //                 * stored in the register save area
	  //                 * gpr=0 corresponds to r3,
	  //                 * gpr=1 to r4, etc.
	  //                 */
	  //  char fpr;     /* index into the array of 8 FPRs
	  //                 * stored in the register save area
	  //                 * fpr=0 corresponds to f1,
	  //                 * fpr=1 to f2, etc.
	  //                 */
	  //  char *overflow_arg_area;
	  //                /* location on stack that holds
	  //                 * the next overflow argument
	  //                 */
	  //  char *reg_save_area;
	  //                /* where r3:r10 and f1:f8 (if saved)
	  //                 * are stored
	  //                 */
	  // } va_list[1];

	  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
	  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
	  SDValue StackOffsetFI =
	      DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), PtrVT);
	  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

	  // Distances between consecutive fields: gpr -> fpr is one byte, fpr ->
	  // overflow_arg_area is (pointer size - 1) bytes, and overflow_arg_area ->
	  // reg_save_area is one pointer.
	  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
	  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);

	  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
	  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);

	  uint64_t FPROffset = 1;
	  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);

	  // Store first byte : number of int regs
	  SDValue firstStore = DAG.getTruncStore(Chain, dl, ArgGPR, VAListPtr,
	                                         MachinePointerInfo(SV), MVT::i8);
	  uint64_t nextOffset = FPROffset;
	  SDValue nextPtr =
	      DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, ConstFPROffset);

	  // Store second byte : number of float regs
	  SDValue secondStore =
	      DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
	                        MachinePointerInfo(SV, nextOffset), MVT::i8);
	  nextOffset += StackOffset;
	  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

	  // Store second word : arguments given on stack
	  SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
	                                    MachinePointerInfo(SV, nextOffset));
	  nextOffset += FrameOffset;
	  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

	  // Store third word : arguments given in registers
	  return DAG.getStore(thirdStore, dl, FR, nextPtr,
	                      MachinePointerInfo(SV, nextOffset));
	}

	#include "PPCGenCallingConv.inc"

	// Exists only to reference the fast-isel calling-convention functions pulled
	// in from PPCGenCallingConv.inc, silencing the compiler's unused-function
	// warnings. A non-zero Flag yields CC_PPC64_ELF_FIS, zero yields
	// RetCC_PPC64_ELF_FIS.
	CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
	  if (Flag)
	    return CC_PPC64_ELF_FIS;
	  return RetCC_PPC64_ELF_FIS;
	}

	/// Custom calling-convention hook that performs no work: all parameters are
	/// ignored, \p State is left untouched, and true is returned unconditionally.
	bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
	                                      CCValAssign::LocInfo &LocInfo,
	                                      ISD::ArgFlagsTy &ArgFlags,
	                                      CCState &State) {
	  return true;
	}

	/// Custom calling-convention hook that aligns GPR argument allocation to a
	/// register pair.
	///
	/// If argument registers remain and the first free one sits at an odd index
	/// of R3..R10, that register is marked allocated so the next free register
	/// is at an even index. Always returns false, because no register is
	/// allocated for the current argument itself.
	bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
	                                             MVT &LocVT,
	                                             CCValAssign::LocInfo &LocInfo,
	                                             ISD::ArgFlagsTy &ArgFlags,
	                                             CCState &State) {
	  static const MCPhysReg ArgRegs[] = {
	    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
	    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
	  };
	  const unsigned NumArgRegs = array_lengthof(ArgRegs);

	  // Index into ArgRegs, not a hardware register number: R3 is index 0, so an
	  // odd index corresponds to an even-numbered register.
	  const unsigned FirstFree = State.getFirstUnallocated(ArgRegs);
	  const bool RegsRemain = FirstFree != NumArgRegs;
	  const bool OddIndex = (FirstFree & 1u) != 0;

	  // Burn one register so the next allocation starts at an even index.
	  if (RegsRemain && OddIndex)
	    State.AllocateReg(ArgRegs[FirstFree]);

	  return false;
	}

	/// Custom calling-convention hook for long double arguments in soft-float
	/// mode, which need four GPRs.
	///
	/// When between one and three of R3..R10 are still free, all of them are
	/// marked allocated so the argument goes onto the stack instead. Always
	/// returns false.
	bool
	llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
	                                                  MVT &LocVT,
	                                                  CCValAssign::LocInfo &LocInfo,
	                                                  ISD::ArgFlagsTy &ArgFlags,
	                                                  CCState &State) {
	  static const MCPhysReg ArgRegs[] = {
	    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
	    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
	  };
	  const unsigned NumArgRegs = array_lengthof(ArgRegs);

	  unsigned FirstFree = State.getFirstUnallocated(ArgRegs);
	  int RegsLeft = NumArgRegs - FirstFree;

	  // Nothing to skip when no register is free or when enough remain.
	  if (FirstFree == NumArgRegs || RegsLeft >= 4)
	    return false;

	  // Too few registers for the value: consume every remaining one.
	  for (int Skipped = 0; Skipped < RegsLeft; ++Skipped)
	    State.AllocateReg(ArgRegs[FirstFree + Skipped]);

	  return false;
	}

	/// Custom calling-convention hook that keeps the two f64 halves of a split
	/// ppc_fp128 value together.
	///
	/// If F8 is the only floating-point argument register still free, it is
	/// marked allocated so both halves end up on the stack. Always returns
	/// false, because no register is allocated for the current argument itself.
	bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
	                                               MVT &LocVT,
	                                               CCValAssign::LocInfo &LocInfo,
	                                               ISD::ArgFlagsTy &ArgFlags,
	                                               CCState &State) {
	  static const MCPhysReg ArgRegs[] = {
	    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
	    PPC::F8
	  };

	  const unsigned NumArgRegs = array_lengthof(ArgRegs);

	  const unsigned FirstFree = State.getFirstUnallocated(ArgRegs);

	  // All FP argument registers are taken: nothing to do.
	  if (FirstFree == NumArgRegs)
	    return false;

	  // Only the last register is left, which cannot hold both halves.
	  if (ArgRegs[FirstFree] == PPC::F8)
	    State.AllocateReg(ArgRegs[FirstFree]);

	  return false;
	}

	/// FPR - The set of FP registers that should be allocated for arguments,
	/// on Darwin. Lists PPC::F1 through PPC::F13 in ascending order (13
	/// entries).
	static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
	PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
	PPC::F11, PPC::F12, PPC::F13};

	/// QFPR - The set of QPX registers that should be allocated for arguments.
	/// Lists PPC::QF1 through PPC::QF13 in ascending order (13 entries), the
	/// same length as FPR.
	static const MCPhysReg QFPR[] = {
	PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
	PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};

	/// CalculateStackSlotSize - Compute how many bytes of stack are reserved for
	/// one argument.
	///
	/// The base size is the by-value size for byval arguments and the store size
	/// of \p ArgVT otherwise. Members of a consecutive-register group (array
	/// members) are packed and keep that size; everything else is rounded up to
	/// a multiple of \p PtrByteSize.
	static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
	                                       unsigned PtrByteSize) {
	  unsigned SlotBytes = ArgVT.getStoreSize();
	  if (Flags.isByVal())
	    SlotBytes = Flags.getByValSize();

	  // Array members are always packed: no rounding.
	  if (Flags.isInConsecutiveRegs())
	    return SlotBytes;

	  // Round up to a whole number of pointer-sized words.
	  const unsigned NumWords = (SlotBytes + PtrByteSize - 1) / PtrByteSize;
	  return NumWords * PtrByteSize;
	}

	/// CalculateStackSlotAlignment - Calculates the alignment of this argument
	/// on the stack.
	///
	/// The rules are applied in order, later ones overriding earlier ones:
	///   1. Default: \p PtrByteSize.
	///   2. Altivec vector types: 16 bytes; v4f64 and v4i1: 32 bytes.
	///   3. Byval arguments whose requested alignment exceeds \p PtrByteSize use
	///      that alignment, which must be a multiple of \p PtrByteSize.
	///   4. Members of a consecutive-register group use the store size of
	///      \p OrigVT when the member was split (except ppcf128), and the store
	///      size of \p ArgVT otherwise.
	static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
	                                            ISD::ArgFlagsTy Flags,
	                                            unsigned PtrByteSize) {
	  unsigned Align = PtrByteSize;

	  const bool IsAltivecVT =
	      ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
	      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
	      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
	      ArgVT == MVT::v1i128;

	  // Altivec parameters are padded to a 16 byte boundary.
	  if (IsAltivecVT)
	    Align = 16;
	  // QPX vector types stored in double-precision are padded to a 32 byte
	  // boundary.
	  else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
	    Align = 32;

	  // ByVal parameters are aligned as requested.
	  if (Flags.isByVal()) {
	    unsigned BVAlign = Flags.getByValAlign();
	    if (BVAlign > PtrByteSize) {
	      if (BVAlign % PtrByteSize != 0)
	        llvm_unreachable(
	            "ByVal alignment is not a multiple of the pointer size");

	      Align = BVAlign;
	    }
	  }

	  // Array members are always packed to their original alignment.
	  if (Flags.isInConsecutiveRegs()) {
	    // If the array member was split into multiple registers, the first
	    // needs to be aligned to the size of the full type. (Except for
	    // ppcf128, which is only aligned as its f64 components.)
	    const bool UseOrigSize = Flags.isSplit() && OrigVT != MVT::ppcf128;
	    Align = UseOrigSize ? OrigVT.getStoreSize() : ArgVT.getStoreSize();
	  }

	  return Align;
	}

	/// CalculateStackSlotUsed - Return whether this argument will use its
	/// stack slot (instead of being passed in registers). ArgOffset,
	/// AvailableFPRs, and AvailableVRs must hold the current argument
	/// position, and will be updated to account for this argument.
	///
	/// \p ArgOffset is first aligned for the argument and then advanced past it.
	/// The argument is considered to use memory when it starts at or beyond, or
	/// runs past, LinkageSize + ParamAreaSize. A non-byval argument that can
	/// still take an FPR or a VR consumes one and reports false instead.
	static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
	                                   ISD::ArgFlagsTy Flags,
	                                   unsigned PtrByteSize,
	                                   unsigned LinkageSize,
	                                   unsigned ParamAreaSize,
	                                   unsigned &ArgOffset,
	                                   unsigned &AvailableFPRs,
	                                   unsigned &AvailableVRs, bool HasQPX) {
	  bool UseMemory = false;
	  const unsigned ParamAreaEnd = LinkageSize + ParamAreaSize;

	  // Respect alignment of argument on the stack.
	  unsigned Align =
	      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
	  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
	  // If there's no space left in the argument save area, we must
	  // use memory (this check also catches zero-sized arguments).
	  if (ArgOffset >= ParamAreaEnd)
	    UseMemory = true;

	  // Allocate argument on the stack.
	  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
	  if (Flags.isInConsecutiveRegsLast())
	    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	  // If we overran the argument save area, we must use memory
	  // (this check catches arguments passed partially in memory)
	  if (ArgOffset > ParamAreaEnd)
	    UseMemory = true;

	  // Byval arguments never take the FPR/VR shortcut below.
	  if (Flags.isByVal())
	    return UseMemory;

	  // However, if the argument is actually passed in an FPR or a VR,
	  // we don't use memory after all.
	  // QPX registers overlap with the scalar FP registers.
	  const bool IsFPRType =
	      ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
	      (HasQPX && (ArgVT == MVT::v4f32 ||
	                  ArgVT == MVT::v4f64 ||
	                  ArgVT == MVT::v4i1));
	  if (IsFPRType && AvailableFPRs > 0) {
	    --AvailableFPRs;
	    return false;
	  }

	  // An FPR-class type with no FPR left still falls through to this check.
	  const bool IsVRType =
	      ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
	      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
	      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
	      ArgVT == MVT::v1i128;
	  if (IsVRType && AvailableVRs > 0) {
	    --AvailableVRs;
	    return false;
	  }

	  return UseMemory;
	}

	/// EnsureStackAlignment - Round \p NumBytes up to the stack alignment
	/// reported by \p Lowering and return the rounded size. The rounding is done
	/// with a bit mask of (alignment - 1).
	static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
	                                     unsigned NumBytes) {
	  const unsigned Mask = Lowering->getStackAlignment() - 1;
	  return (NumBytes + Mask) & ~Mask;
	}

	/// Dispatch formal-argument lowering to the ABI-specific implementation:
	/// Darwin when the subtarget is not SVR4, otherwise the 64-bit or 32-bit
	/// SVR4 variant. All arguments are forwarded unchanged.
	SDValue PPCTargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  if (!Subtarget.isSVR4ABI())
	    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
	                                       dl, DAG, InVals);

	  if (Subtarget.isPPC64())
	    return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
	                                       dl, DAG, InVals);

	  return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
	                                     dl, DAG, InVals);
	}

	/// Lower the incoming formal arguments of a function for the 32-bit SVR4 ABI.
	///
	/// One value per assigned location is appended to \p InVals: a copy from a
	/// live-in register for register locations, or a load from a fixed stack
	/// object for memory locations. The minimum reserved area is recorded in
	/// PPCFunctionInfo. For vararg functions the GPR and (unless soft-float) FPR
	/// argument registers are additionally spilled to a fresh stack object for
	/// use by va_start. Returns the resulting chain, which is a TokenFactor of
	/// the spill stores when there are any and \p Chain otherwise.
	SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

	// 32-bit SVR4 ABI Stack Frame Layout:
	//              +-----------------------------------+
	//        +-->  |            Back chain             |
	//        |     +-----------------------------------+
	//        |     | Floating-point register save area |
	//        |     +-----------------------------------+
	//        |     |    General register save area     |
	//        |     +-----------------------------------+
	//        |     |          CR save word             |
	//        |     +-----------------------------------+
	//        |     |         VRSAVE save word          |
	//        |     +-----------------------------------+
	//        |     |         Alignment padding         |
	//        |     +-----------------------------------+
	//        |     |     Vector register save area     |
	//        |     +-----------------------------------+
	//        |     |       Local variable space        |
	//        |     +-----------------------------------+
	//        |     |        Parameter list area        |
	//        |     +-----------------------------------+
	//        |     |           LR save word            |
	//        |     +-----------------------------------+
	// SP-->  +---  |            Back chain             |
	//              +-----------------------------------+
	//
	// Specifications:
	// System V Application Binary Interface PowerPC Processor Supplement
	// AltiVec Technology Programming Interface Manual

	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

	EVT PtrVT = getPointerTy(MF.getDataLayout());
	// Potential tail calls could cause overwriting of argument stack slots.
	bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
	(CallConv == CallingConv::Fast));
	unsigned PtrByteSize = 4;

	// Assign locations to all of the incoming arguments.
	SmallVector<CCValAssign, 16> ArgLocs;
	PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());

	// Reserve space for the linkage area on the stack.
	unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
	CCInfo.AllocateStack(LinkageSize, PtrByteSize);
	// In soft-float mode the arguments are pre-analyzed before locations are
	// assigned.
	if (useSoftFloat())
	CCInfo.PreAnalyzeFormalArguments(Ins);

	CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
	CCInfo.clearWasPPCF128();

	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];

	// Arguments stored in registers.
	if (VA.isRegLoc()) {
	const TargetRegisterClass *RC;
	EVT ValVT = VA.getValVT();

	// Choose the register class from the value type (and, for some types, the
	// subtarget features).
	switch (ValVT.getSimpleVT().SimpleTy) {
	default:
	llvm_unreachable("ValVT not supported by formal arguments Lowering");
	case MVT::i1:
	case MVT::i32:
	RC = &PPC::GPRCRegClass;
	break;
	case MVT::f32:
	if (Subtarget.hasP8Vector())
	RC = &PPC::VSSRCRegClass;
	else
	RC = &PPC::F4RCRegClass;
	break;
	case MVT::f64:
	if (Subtarget.hasVSX())
	RC = &PPC::VSFRCRegClass;
	else
	RC = &PPC::F8RCRegClass;
	break;
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	RC = &PPC::VRRCRegClass;
	break;
	case MVT::v4f32:
	RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
	break;
	case MVT::v2f64:
	case MVT::v2i64:
	RC = &PPC::VRRCRegClass;
	break;
	case MVT::v4f64:
	RC = &PPC::QFRCRegClass;
	break;
	case MVT::v4i1:
	RC = &PPC::QBRCRegClass;
	break;
	}

	// Transform the arguments stored in physical registers into virtual ones.
	// An i1 value is copied out as i32 and truncated afterwards.
	unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
	ValVT == MVT::i1 ? MVT::i32 : ValVT);

	if (ValVT == MVT::i1)
	ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);

	InVals.push_back(ArgValue);
	} else {
	// Argument stored in memory.
	assert(VA.isMemLoc());

	unsigned ArgSize = VA.getLocVT().getStoreSize();
	int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(),
	isImmutable);

	// Create load nodes to retrieve arguments from the stack.
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	InVals.push_back(
	DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
	}
	}

	// Assign locations to all of the incoming aggregate by value arguments.
	// Aggregates passed by value are stored in the local variable space of the
	// caller's stack frame, right above the parameter list area.
	SmallVector<CCValAssign, 16> ByValArgLocs;
	CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
	ByValArgLocs, *DAG.getContext());

	// Reserve stack space for the allocations in CCInfo.
	CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

	CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

	// Area that is at least reserved in the caller of this function.
	unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
	MinReservedArea = std::max(MinReservedArea, LinkageSize);

	// Set the size that is at least reserved in caller of this function. Tail
	// call optimized function's reserved stack space needs to be aligned so that
	// taking the difference between two stack areas will result in an aligned
	// stack.
	MinReservedArea =
	EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
	FuncInfo->setMinReservedArea(MinReservedArea);

	// Stores that spill the argument registers of a vararg function; they are
	// joined into a single TokenFactor at the end.
	SmallVector<SDValue, 8> MemOps;

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start.
	if (isVarArg) {
	static const MCPhysReg GPArgRegs[] = {
	PPC::R3, PPC::R4, PPC::R5, PPC::R6,
	PPC::R7, PPC::R8, PPC::R9, PPC::R10,
	};
	const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);

	static const MCPhysReg FPArgRegs[] = {
	PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
	PPC::F8
	};
	unsigned NumFPArgRegs = array_lengthof(FPArgRegs);

	// With soft-float no FP argument registers are spilled.
	if (useSoftFloat())
	NumFPArgRegs = 0;

	// Record how many GPRs/FPRs were taken by the fixed arguments.
	FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
	FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));

	// Make room for NumGPArgRegs and NumFPArgRegs.
	int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
	NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

	// Fixed object at the next free stack offset after the fixed arguments.
	FuncInfo->setVarArgsStackOffset(
	MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
	CCInfo.getNextStackOffset(), true));

	FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
	SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

	// The fixed integer arguments of a variadic function are stored to the
	// VarArgsFrameIndex on the stack so that they may be loaded by
	// dereferencing the result of va_next.
	for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
	// Get an existing live-in vreg, or add a new one.
	unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
	if (!VReg)
	VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
	MemOps.push_back(Store);
	// Increment the address by four for the next argument to store
	SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
	FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
	}

	// FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
	// is set.
	// The double arguments are stored to the VarArgsFrameIndex
	// on the stack.
	for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
	// Get an existing live-in vreg, or add a new one.
	unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
	if (!VReg)
	VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
	MemOps.push_back(Store);
	// Increment the address by eight for the next argument to store
	SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
	PtrVT);
	FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
	}
	}

	// Merge all register-spill stores into one chain.
	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

	return Chain;
	}

	// PPC64 passes i8, i16, and i32 values in i64 registers. When the argument
	// is flagged sign- or zero-extended, the i64 value is first wrapped in the
	// matching AssertSext/AssertZext node (sign extension takes precedence), and
	// is then truncated to ObjectVT.
	SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
	                                             EVT ObjectVT, SelectionDAG &DAG,
	                                             SDValue ArgVal,
	                                             const SDLoc &dl) const {
	  const bool IsSigned = Flags.isSExt();
	  if (IsSigned || Flags.isZExt()) {
	    const unsigned AssertOpc = IsSigned ? ISD::AssertSext : ISD::AssertZext;
	    ArgVal = DAG.getNode(AssertOpc, dl, MVT::i64, ArgVal,
	                         DAG.getValueType(ObjectVT));
	  }

	  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
	}

	SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	// TODO: add description of PPC stack frame format, or at least some docs.
	//
	bool isELFv2ABI = Subtarget.isELFv2ABI();
	bool isLittleEndian = Subtarget.isLittleEndian();
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

	assert(!(CallConv == CallingConv::Fast && isVarArg) &&
	"fastcc not supported on varargs functions");

	EVT PtrVT = getPointerTy(MF.getDataLayout());
	// Potential tail calls could cause overwriting of argument stack slots.
	bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
	(CallConv == CallingConv::Fast));
	unsigned PtrByteSize = 8;
	unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

	static const MCPhysReg GPR[] = {
	PPC::X3, PPC::X4, PPC::X5, PPC::X6,
	PPC::X7, PPC::X8, PPC::X9, PPC::X10,
	};
	static const MCPhysReg VR[] = {
	PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
	PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
	};

	const unsigned Num_GPR_Regs = array_lengthof(GPR);
	const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
	const unsigned Num_VR_Regs = array_lengthof(VR);
	const unsigned Num_QFPR_Regs = Num_FPR_Regs;

	// Do a first pass over the arguments to determine whether the ABI
	// guarantees that our caller has allocated the parameter save area
	// on its stack frame. In the ELFv1 ABI, this is always the case;
	// in the ELFv2 ABI, it is true if this is a vararg function or if
	// any parameter is located in a stack slot.

	bool HasParameterArea = !isELFv2ABI \|\| isVarArg;
	unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
	unsigned NumBytes = LinkageSize;
	unsigned AvailableFPRs = Num_FPR_Regs;
	unsigned AvailableVRs = Num_VR_Regs;
	for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	if (Ins[i].Flags.isNest())
	continue;

	if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
	PtrByteSize, LinkageSize, ParamAreaSize,
	NumBytes, AvailableFPRs, AvailableVRs,
	Subtarget.hasQPX()))
	HasParameterArea = true;
	}

	// Add DAG nodes to load the arguments or copy them out of registers. On
	// entry to a function on PPC, the arguments start after the linkage area,
	// although the first ones are often in registers.

	unsigned ArgOffset = LinkageSize;
	unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
	unsigned &QFPR_idx = FPR_idx;
	SmallVector<SDValue, 8> MemOps;
	Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
	unsigned CurArgIdx = 0;
	for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
	SDValue ArgVal;
	bool needsLoad = false;
	EVT ObjectVT = Ins[ArgNo].VT;
	EVT OrigVT = Ins[ArgNo].ArgVT;
	unsigned ObjSize = ObjectVT.getStoreSize();
	unsigned ArgSize = ObjSize;
	ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
	if (Ins[ArgNo].isOrigArg()) {
	std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
	CurArgIdx = Ins[ArgNo].getOrigArgIndex();
	}
	// We re-align the argument offset for each argument, except when using the
	// fast calling convention, when we need to make sure we do that only when
	// we'll actually use a stack slot.
	unsigned CurArgOffset, Align;
	auto ComputeArgOffset = [&]() {
	/* Respect alignment of argument on the stack. */
	Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
	ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
	CurArgOffset = ArgOffset;
	};

	if (CallConv != CallingConv::Fast) {
	ComputeArgOffset();

	/* Compute GPR index associated with argument offset. */
	GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
	GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
	}

	// FIXME the codegen can be much improved in some cases.
	// We do not have to keep everything in memory.
	if (Flags.isByVal()) {
	assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

	if (CallConv == CallingConv::Fast)
	ComputeArgOffset();

	// ObjSize is the true size, ArgSize rounded up to multiple of registers.
	ObjSize = Flags.getByValSize();
	ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	// Empty aggregate parameters do not take up registers. Examples:
	// struct { } a;
	// union { } b;
	// int c[0];
	// etc. However, we have to provide a place-holder in InVals, so
	// pretend we have an 8-byte item at the current address for that
	// purpose.
	if (!ObjSize) {
	int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	InVals.push_back(FIN);
	continue;
	}

	// Create a stack object covering all stack doublewords occupied
	// by the argument. If the argument is (fully or partially) on
	// the stack, or if the argument is fully in registers but the
	// caller has allocated the parameter save anyway, we can refer
	// directly to the caller's stack frame. Otherwise, create a
	// local copy in our own frame.
	int FI;
	if (HasParameterArea \|\|
	ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
	FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
	else
	FI = MFI.CreateStackObject(ArgSize, Align, false);
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);

	// Handle aggregates smaller than 8 bytes.
	if (ObjSize < PtrByteSize) {
	// The value of the object is its address, which differs from the
	// address of the enclosing doubleword on big-endian systems.
	SDValue Arg = FIN;
	if (!isLittleEndian) {
	SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
	Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
	}
	InVals.push_back(Arg);

	if (GPR_idx != Num_GPR_Regs) {
	unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
	FuncInfo->addLiveInAttr(VReg, Flags);
	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	SDValue Store;

	if (ObjSize==1 \|\| ObjSize==2 \|\| ObjSize==4) {
	EVT ObjType = (ObjSize == 1 ? MVT::i8 :
	(ObjSize == 2 ? MVT::i16 : MVT::i32));
	Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
	MachinePointerInfo(&*FuncArg), ObjType);
	} else {
	// For sizes that don't fit a truncating store (3, 5, 6, 7),
	// store the whole register as-is to the parameter save area
	// slot.
	Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo(&*FuncArg));
	}

	MemOps.push_back(Store);
	}
	// Whether we copied from a register or not, advance the offset
	// into the parameter save area by a full doubleword.
	ArgOffset += PtrByteSize;
	continue;
	}

	// The value of the object is its address, which is the address of
	// its first stack doubleword.
	InVals.push_back(FIN);

	// Store whatever pieces of the object are in registers to memory.
	for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
	if (GPR_idx == Num_GPR_Regs)
	break;

	unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
	FuncInfo->addLiveInAttr(VReg, Flags);
	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	SDValue Addr = FIN;
	if (j) {
	SDValue Off = DAG.getConstant(j, dl, PtrVT);
	Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
	}
	SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
	MachinePointerInfo(&*FuncArg, j));
	MemOps.push_back(Store);
	++GPR_idx;
	}
	ArgOffset += ArgSize;
	continue;
	}

	switch (ObjectVT.getSimpleVT().SimpleTy) {
	default: llvm_unreachable("Unhandled argument type!");
	case MVT::i1:
	case MVT::i32:
	case MVT::i64:
	if (Flags.isNest()) {
	// The 'nest' parameter, if any, is passed in R11.
	unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

	if (ObjectVT == MVT::i32 \|\| ObjectVT == MVT::i1)
	ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

	break;
	}

	// These can be scalar arguments or elements of an integer array type
	// passed directly. Clang may use those instead of "byval" aggregate
	// types to avoid forcing arguments to memory unnecessarily.
	if (GPR_idx != Num_GPR_Regs) {
	unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
	FuncInfo->addLiveInAttr(VReg, Flags);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

	if (ObjectVT == MVT::i32 \|\| ObjectVT == MVT::i1)
	// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
	// value to MVT::i64 and then truncate to the correct register size.
	ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
	} else {
	if (CallConv == CallingConv::Fast)
	ComputeArgOffset();

	needsLoad = true;
	ArgSize = PtrByteSize;
	}
	if (CallConv != CallingConv::Fast \|\| needsLoad)
	ArgOffset += 8;
	break;

	case MVT::f32:
	case MVT::f64:
	// These can be scalar arguments or elements of a float array type
	// passed directly. The latter are used to implement ELFv2 homogenous
	// float aggregates.
	if (FPR_idx != Num_FPR_Regs) {
	unsigned VReg;

	if (ObjectVT == MVT::f32)
	VReg = MF.addLiveIn(FPR[FPR_idx],
	Subtarget.hasP8Vector()
	? &PPC::VSSRCRegClass
	: &PPC::F4RCRegClass);
	else
	VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
	? &PPC::VSFRCRegClass
	: &PPC::F8RCRegClass);

	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
	++FPR_idx;
	} else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
	// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
	// once we support fp <-> gpr moves.

	// This can only ever happen in the presence of f32 array types,
	// since otherwise we never run out of FPRs before running out
	// of GPRs.
	unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
	FuncInfo->addLiveInAttr(VReg, Flags);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

	if (ObjectVT == MVT::f32) {
	if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
	ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
	DAG.getConstant(32, dl, MVT::i32));
	ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
	}

	ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
	} else {
	if (CallConv == CallingConv::Fast)
	ComputeArgOffset();

	needsLoad = true;
	}

	// When passing an array of floats, the array occupies consecutive
	// space in the argument area; only round up to the next doubleword
	// at the end of the array. Otherwise, each float takes 8 bytes.
	if (CallConv != CallingConv::Fast \|\| needsLoad) {
	ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
	ArgOffset += ArgSize;
	if (Flags.isInConsecutiveRegsLast())
	ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	}
	break;
	case MVT::v4f32:
	case MVT::v4i32:
	case MVT::v8i16:
	case MVT::v16i8:
	case MVT::v2f64:
	case MVT::v2i64:
	case MVT::v1i128:
	if (!Subtarget.hasQPX()) {
	// These can be scalar arguments or elements of a vector array type
	// passed directly. The latter are used to implement ELFv2 homogenous
	// vector aggregates.
	if (VR_idx != Num_VR_Regs) {
	unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
	++VR_idx;
	} else {
	if (CallConv == CallingConv::Fast)
	ComputeArgOffset();

	needsLoad = true;
	}
	if (CallConv != CallingConv::Fast \|\| needsLoad)
	ArgOffset += 16;
	break;
	} // not QPX

	assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
	"Invalid QPX parameter type");
	/* fall through */

	case MVT::v4f64:
	case MVT::v4i1:
	// QPX vectors are treated like their scalar floating-point subregisters
	// (except that they're larger).
	unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
	if (QFPR_idx != Num_QFPR_Regs) {
	const TargetRegisterClass *RC;
	switch (ObjectVT.getSimpleVT().SimpleTy) {
	case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
	case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
	default: RC = &PPC::QBRCRegClass; break;
	}

	unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
	++QFPR_idx;
	} else {
	if (CallConv == CallingConv::Fast)
	ComputeArgOffset();
	needsLoad = true;
	}
	if (CallConv != CallingConv::Fast \|\| needsLoad)
	ArgOffset += Sz;
	break;
	}

	// We need to load the argument to a virtual register if we determined
	// above that we ran out of physical registers of the appropriate type.
	if (needsLoad) {
	if (ObjSize < ArgSize && !isLittleEndian)
	CurArgOffset += ArgSize - ObjSize;
	int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
	}

	InVals.push_back(ArgVal);
	}

	// Area that is at least reserved in the caller of this function.
	unsigned MinReservedArea;
	if (HasParameterArea)
	MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
	else
	MinReservedArea = LinkageSize;

	// Set the size that is at least reserved in caller of this function. Tail
	// call optimized functions' reserved stack space needs to be aligned so that
	// taking the difference between two stack areas will result in an aligned
	// stack.
	MinReservedArea =
	EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
	FuncInfo->setMinReservedArea(MinReservedArea);

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start.
	if (isVarArg) {
	int Depth = ArgOffset;

	FuncInfo->setVarArgsFrameIndex(
	MFI.CreateFixedObject(PtrByteSize, Depth, true));
	SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

	// If this function is vararg, store any remaining integer argument regs
	// to their spots on the stack so that they may be loaded by dereferencing
	// the result of va_next.
	for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
	GPR_idx < Num_GPR_Regs; ++GPR_idx) {
	unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
	MemOps.push_back(Store);
	// Increment the address by four for the next argument to store
	SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
	FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
	}
	}

	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

	return Chain;
	}

	/// LowerFormalArguments_Darwin - Lower the incoming formal arguments of the
	/// current function (32-bit and 64-bit pointer sizes are both handled here).
	///
	/// For every entry of \p Ins one value is appended to \p InVals: either a
	/// copy out of the argument register (GPR, FPR or VR) the value arrived in,
	/// or a load from a fixed stack object when the registers of the appropriate
	/// kind are exhausted. byval arguments are represented by the frame index of
	/// their stack slot; any pieces that arrived in GPRs are stored to memory.
	///
	/// Side effects: records the minimum reserved parameter area in
	/// PPCFunctionInfo and, for variadic functions, creates the frame index used
	/// by va_start and stores the remaining argument GPRs to the stack.
	///
	/// \returns the incoming \p Chain, or a TokenFactor of the emitted stores if
	/// any were created.
	SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	// TODO: add description of PPC stack frame format, or at least some docs.
	//
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

	EVT PtrVT = getPointerTy(MF.getDataLayout());
	bool isPPC64 = PtrVT == MVT::i64;
	// Potential tail calls could cause overwriting of argument stack slots.
	bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
	(CallConv == CallingConv::Fast));
	unsigned PtrByteSize = isPPC64 ? 8 : 4;
	unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
	// Running offset of the next argument; arguments start after the linkage
	// area.
	unsigned ArgOffset = LinkageSize;
	// Area that is at least reserved in caller of this function.
	unsigned MinReservedArea = ArgOffset;

	static const MCPhysReg GPR_32[] = { // 32-bit registers.
	PPC::R3, PPC::R4, PPC::R5, PPC::R6,
	PPC::R7, PPC::R8, PPC::R9, PPC::R10,
	};
	static const MCPhysReg GPR_64[] = { // 64-bit registers.
	PPC::X3, PPC::X4, PPC::X5, PPC::X6,
	PPC::X7, PPC::X8, PPC::X9, PPC::X10,
	};
	static const MCPhysReg VR[] = {
	PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
	PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
	};

	// GPR_32 and GPR_64 have the same number of entries, so either table gives
	// the GPR count.
	const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
	// With soft-float no FPRs are available for argument passing.
	const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
	const unsigned Num_VR_Regs = array_lengthof( VR);

	// Index of the next unused argument register of each kind.
	unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

	const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;

	// In 32-bit non-varargs functions, the stack space for vectors is after the
	// stack space for non-vectors. We do not use this space unless we have
	// too many vectors to fit in registers, something that only occurs in
	// constructed examples:), but we have to walk the arglist to figure
	// that out...for the pathological case, compute VecArgOffset as the
	// start of the vector parameter area. Computing VecArgOffset is the
	// entire point of the following loop.
	unsigned VecArgOffset = ArgOffset;
	if (!isVarArg && !isPPC64) {
	for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
	++ArgNo) {
	EVT ObjectVT = Ins[ArgNo].VT;
	ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;

	if (Flags.isByVal()) {
	// ObjSize is the true size, ArgSize rounded up to multiple of regs.
	unsigned ObjSize = Flags.getByValSize();
	unsigned ArgSize =
	((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	VecArgOffset += ArgSize;
	continue;
	}

	switch(ObjectVT.getSimpleVT().SimpleTy) {
	default: llvm_unreachable("Unhandled argument type!");
	case MVT::i1:
	case MVT::i32:
	case MVT::f32:
	VecArgOffset += 4;
	break;
	case MVT::i64: // PPC64
	case MVT::f64:
	// FIXME: We are guaranteed to be !isPPC64 at this point.
	// Does MVT::i64 apply?
	VecArgOffset += 8;
	break;
	case MVT::v4f32:
	case MVT::v4i32:
	case MVT::v8i16:
	case MVT::v16i8:
	// Nothing to do, we're only looking at Nonvector args here.
	break;
	}
	}
	}
	// We've found where the vector parameter area in memory is. Skip the
	// first 12 parameters; these don't use that memory. The area is first
	// rounded up to a 16-byte boundary, then 12 slots of 16 bytes are skipped.
	VecArgOffset = ((VecArgOffset+15)/16)*16;
	VecArgOffset += 12*16;

	// Add DAG nodes to load the arguments or copy them out of registers. On
	// entry to a function on PPC, the arguments start after the linkage area,
	// although the first ones are often in registers.

	// Stores emitted while spilling register-passed data to the stack; they are
	// merged into the returned chain at the end.
	SmallVector<SDValue, 8> MemOps;
	unsigned nAltivecParamsAtEnd = 0;
	Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
	unsigned CurArgIdx = 0;
	for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
	SDValue ArgVal;
	bool needsLoad = false;
	EVT ObjectVT = Ins[ArgNo].VT;
	unsigned ObjSize = ObjectVT.getSizeInBits()/8;
	unsigned ArgSize = ObjSize;
	ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
	// Keep FuncArg pointing at the IR argument this input value originates
	// from; it is used to build MachinePointerInfo for byval stores.
	if (Ins[ArgNo].isOrigArg()) {
	std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
	CurArgIdx = Ins[ArgNo].getOrigArgIndex();
	}
	unsigned CurArgOffset = ArgOffset;

	// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
	if (ObjectVT==MVT::v4f32 \|\| ObjectVT==MVT::v4i32 \|\|
	ObjectVT==MVT::v8i16 \|\| ObjectVT==MVT::v16i8) {
	if (isVarArg \|\| isPPC64) {
	MinReservedArea = ((MinReservedArea+15)/16)*16;
	MinReservedArea += CalculateStackSlotSize(ObjectVT,
	Flags,
	PtrByteSize);
	} else nAltivecParamsAtEnd++;
	} else
	// Calculate min reserved area.
	MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
	Flags,
	PtrByteSize);

	// FIXME the codegen can be much improved in some cases.
	// We do not have to keep everything in memory.
	if (Flags.isByVal()) {
	assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

	// ObjSize is the true size, ArgSize rounded up to multiple of registers.
	ObjSize = Flags.getByValSize();
	ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	// Objects of size 1 and 2 are right justified, everything else is
	// left justified. This means the memory address is adjusted forwards.
	// NOTE(review): the adjustment uses a literal 4 rather than PtrByteSize;
	// confirm this is intended when isPPC64 is true.
	if (ObjSize==1 \|\| ObjSize==2) {
	CurArgOffset = CurArgOffset + (4 - ObjSize);
	}
	// The value of the object is its address.
	int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	InVals.push_back(FIN);
	if (ObjSize==1 \|\| ObjSize==2) {
	// A 1- or 2-byte object occupies a single register; if one is still
	// available, store its low bytes to the object's slot with a truncating
	// store.
	if (GPR_idx != Num_GPR_Regs) {
	unsigned VReg;
	if (isPPC64)
	VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
	else
	VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
	SDValue Store =
	DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo(&*FuncArg), ObjType);
	MemOps.push_back(Store);
	++GPR_idx;
	}

	ArgOffset += PtrByteSize;

	continue;
	}
	for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
	// Store whatever pieces of the object are in registers
	// to memory. ArgOffset will be the address of the beginning
	// of the object.
	if (GPR_idx != Num_GPR_Regs) {
	unsigned VReg;
	if (isPPC64)
	VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
	else
	VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
	int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo(&*FuncArg, j));
	MemOps.push_back(Store);
	++GPR_idx;
	ArgOffset += PtrByteSize;
	} else {
	// Out of GPRs: the rest of the object is already in memory, so just
	// advance ArgOffset to the end of the object (CurArgOffset + ArgSize).
	ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
	break;
	}
	}
	continue;
	}

	switch (ObjectVT.getSimpleVT().SimpleTy) {
	default: llvm_unreachable("Unhandled argument type!");
	case MVT::i1:
	case MVT::i32:
	if (!isPPC64) {
	if (GPR_idx != Num_GPR_Regs) {
	unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);

	if (ObjectVT == MVT::i1)
	ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);

	++GPR_idx;
	} else {
	needsLoad = true;
	ArgSize = PtrByteSize;
	}
	// All int arguments reserve stack space in the Darwin ABI.
	ArgOffset += PtrByteSize;
	break;
	}
	// On PPC64, i1/i32 are handled together with i64 below.
	LLVM_FALLTHROUGH;
	case MVT::i64: // PPC64
	if (GPR_idx != Num_GPR_Regs) {
	unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

	if (ObjectVT == MVT::i32 \|\| ObjectVT == MVT::i1)
	// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
	// value to MVT::i64 and then truncate to the correct register size.
	ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

	++GPR_idx;
	} else {
	needsLoad = true;
	ArgSize = PtrByteSize;
	}
	// All int arguments reserve stack space in the Darwin ABI.
	ArgOffset += 8;
	break;

	case MVT::f32:
	case MVT::f64:
	// Every 4 bytes of argument space consumes one of the GPRs available for
	// argument passing.
	// The GPR index is advanced even though the value itself is read from an
	// FPR (or memory) below: one GPR, or two for an 8-byte value when
	// !isPPC64 and a second GPR is still available.
	if (GPR_idx != Num_GPR_Regs) {
	++GPR_idx;
	if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
	++GPR_idx;
	}
	if (FPR_idx != Num_FPR_Regs) {
	unsigned VReg;

	if (ObjectVT == MVT::f32)
	VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
	else
	VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);

	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
	++FPR_idx;
	} else {
	needsLoad = true;
	}

	// All FP arguments reserve stack space in the Darwin ABI.
	ArgOffset += isPPC64 ? 8 : ObjSize;
	break;
	case MVT::v4f32:
	case MVT::v4i32:
	case MVT::v8i16:
	case MVT::v16i8:
	// Note that vector arguments in registers don't reserve stack space,
	// except in varargs functions.
	if (VR_idx != Num_VR_Regs) {
	unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
	ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
	if (isVarArg) {
	// Pad ArgOffset up to a 16-byte boundary, consuming one GPR per
	// pointer-sized step of padding while GPRs remain.
	while ((ArgOffset % 16) != 0) {
	ArgOffset += PtrByteSize;
	if (GPR_idx != Num_GPR_Regs)
	GPR_idx++;
	}
	ArgOffset += 16;
	GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
	}
	++VR_idx;
	} else {
	if (!isVarArg && !isPPC64) {
	// Vectors go after all the nonvectors.
	CurArgOffset = VecArgOffset;
	VecArgOffset += 16;
	} else {
	// Vectors are aligned.
	ArgOffset = ((ArgOffset+15)/16)*16;
	CurArgOffset = ArgOffset;
	ArgOffset += 16;
	}
	needsLoad = true;
	}
	break;
	}

	// We need to load the argument to a virtual register if we determined above
	// that we ran out of physical registers of the appropriate type.
	// When the value is smaller than its slot (ObjSize < ArgSize) it is read
	// from the last ObjSize bytes of the slot.
	if (needsLoad) {
	int FI = MFI.CreateFixedObject(ObjSize,
	CurArgOffset + (ArgSize - ObjSize),
	isImmutable);
	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
	}

	InVals.push_back(ArgVal);
	}

	// Allow for Altivec parameters at the end, if needed.
	if (nAltivecParamsAtEnd) {
	MinReservedArea = ((MinReservedArea+15)/16)*16;
	MinReservedArea += 16*nAltivecParamsAtEnd;
	}

	// Area that is at least reserved in the caller of this function.
	MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);

	// Set the size that is at least reserved in caller of this function. Tail
	// call optimized functions' reserved stack space needs to be aligned so that
	// taking the difference between two stack areas will result in an aligned
	// stack.
	MinReservedArea =
	EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
	FuncInfo->setMinReservedArea(MinReservedArea);

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start.
	if (isVarArg) {
	int Depth = ArgOffset;

	FuncInfo->setVarArgsFrameIndex(
	MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
	Depth, true));
	SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

	// If this function is vararg, store any remaining integer argument regs
	// to their spots on the stack so that they may be loaded by dereferencing
	// the result of va_next.
	for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
	unsigned VReg;

	if (isPPC64)
	VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
	else
	VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);

	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
	MemOps.push_back(Store);
	// Increment the address by the pointer size (in bytes) for the next
	// argument to store.
	SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
	FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
	}
	}

	// Tie all emitted stores into the chain so they are not dropped or
	// reordered past users of the returned chain.
	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

	return Chain;
	}

	/// CalculateTailCallSPDiff - Compute the stack pointer adjustment needed to
	/// make room for the arguments of a tail call.
	///
	/// \param DAG        Gives access to the current function's PPCFunctionInfo.
	/// \param isTailCall When false, no adjustment is needed and 0 is returned.
	/// \param ParamSize  Number of bytes the callee's parameters occupy.
	/// \returns the caller's minimum reserved area minus \p ParamSize. As a side
	/// effect the smallest such difference seen so far is recorded in
	/// PPCFunctionInfo via setTailCallSPDelta().
	static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
	                                   unsigned ParamSize) {
	  if (isTailCall) {
	    PPCFunctionInfo *FuncInfo =
	        DAG.getMachineFunction().getInfo<PPCFunctionInfo>();

	    const unsigned ReservedByCaller = FuncInfo->getMinReservedArea();
	    const int Delta =
	        static_cast<int>(ReservedByCaller) - static_cast<int>(ParamSize);

	    // Only overwrite the stored delta when the new one is smaller (i.e. it
	    // demands a larger adjustment).
	    if (Delta < FuncInfo->getTailCallSPDelta())
	      FuncInfo->setTailCallSPDelta(Delta);

	    return Delta;
	  }

	  return 0;
	}

	static bool isFunctionGlobalAddress(SDValue Callee);

	static bool
	callsShareTOCBase(const Function *Caller, SDValue Callee,
	const TargetMachine &TM) {
	// If !G, Callee can be an external symbol.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G)
	return false;

	// The medium and large code models are expected to provide a sufficiently
	// large TOC to provide all data addressing needs of a module with a
	// single TOC. Since each module will be addressed with a single TOC then we
	// only need to check that caller and callee don't cross dso boundaries.
	if (CodeModel::Medium == TM.getCodeModel() \|\|
	CodeModel::Large == TM.getCodeModel())
	return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());

	// Otherwise we need to ensure callee and caller are in the same section,
	// since the linker may allocate multiple TOCs, and we don't know which
	// sections will belong to the same TOC base.

	const GlobalValue *GV = G->getGlobal();
	if (!GV->isStrongDefinitionForLinker())
	return false;

	// Any explicitly-specified sections and section prefixes must also match.
	// Also, if we're using -ffunction-sections, then each function is always in
	// a different section (the same is true for COMDAT functions).
	if (TM.getFunctionSections() \|\| GV->hasComdat() \|\| Caller->hasComdat() \|\|
	GV->getSection() != Caller->getSection())
	return false;
	if (const auto *F = dyn_cast<Function>(GV)) {
	if (F->getSectionPrefix() != Caller->getSectionPrefix())
	return false;
	}

	// If the callee might be interposed, then we can't assume the ultimate call
	// target will be in the same section. Even in cases where we can assume that
	// interposition won't happen, in any case where the linker might insert a
	// stub to allow for interposition, we must generate code as though
	// interposition might occur. To understand why this matters, consider a
	// situation where: a -> b -> c where the arrows indicate calls. b and c are
	// in the same section, but a is in a different module (i.e. has a different
	// TOC base pointer). If the linker allows for interposition between b and c,
	// then it will generate a stub for the call edge between b and c which will
	// save the TOC pointer into the designated stack slot allocated by b. If we
	// return true here, and therefore allow a tail call between b and c, that
	// stack slot won't exist and the b -> c stub will end up saving b'c TOC base
	// pointer into the stack slot allocated by a (where the a -> b stub saved
	// a's TOC base pointer). If we're not considering a tail call, but rather,
	// whether a nop is needed after the call instruction in b, because the linker
	// will insert a stub, it might complain about a missing nop if we omit it
	// (although many don't complain in this case).
	if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
	return false;

	return true;
	}

	static bool
	needStackSlotPassParameters(const PPCSubtarget &Subtarget,
	const SmallVectorImpl<ISD::OutputArg> &Outs) {
	assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());

	const unsigned PtrByteSize = 8;
	const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

	static const MCPhysReg GPR[] = {
	PPC::X3, PPC::X4, PPC::X5, PPC::X6,
	PPC::X7, PPC::X8, PPC::X9, PPC::X10,
	};
	static const MCPhysReg VR[] = {
	PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
	PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
	};

	const unsigned NumGPRs = array_lengthof(GPR);
	const unsigned NumFPRs = 13;
	const unsigned NumVRs = array_lengthof(VR);
	const unsigned ParamAreaSize = NumGPRs * PtrByteSize;

	unsigned NumBytes = LinkageSize;
	unsigned AvailableFPRs = NumFPRs;
	unsigned AvailableVRs = NumVRs;

	for (const ISD::OutputArg& Param : Outs) {
	if (Param.Flags.isNest()) continue;

	if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
	PtrByteSize, LinkageSize, ParamAreaSize,
	NumBytes, AvailableFPRs, AvailableVRs,
	Subtarget.hasQPX()))
	return true;
	}
	return false;
	}

	static bool
	hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
	if (CS.arg_size() != CallerFn->arg_size())
	return false;

	ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
	ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
	Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();

	for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
	const Value* CalleeArg = *CalleeArgIter;
	const Value* CallerArg = &(*CallerArgIter);
	if (CalleeArg == CallerArg)
	continue;

	// e.g. @caller([4 x i64] %a, [4 x i64] %b) {
	// tail call @callee([4 x i64] undef, [4 x i64] %b)
	// }
	// 1st argument of callee is undef and has the same type as caller.
	if (CalleeArg->getType() == CallerArg->getType() &&
	isa<UndefValue>(CalleeArg))
	continue;

	return false;
	}

	return true;
	}

	// Returns true if the combination of the caller's and the callee's calling
	// conventions permits tail call optimization.
	static bool
	areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
	                                    CallingConv::ID CalleeCC) {
	  // Only ccc and fastcc take part in tail calls.
	  const auto SupportsTailCall = [](CallingConv::ID CC) {
	    switch (CC) {
	    case CallingConv::C:
	    case CallingConv::Fast:
	      return true;
	    default:
	      return false;
	    }
	  };

	  if (!(SupportsTailCall(CallerCC) && SupportsTailCall(CalleeCC)))
	    return false;

	  // A ccc caller may tail call both ccc and fastcc callees. A fastcc caller
	  // may have less stack space than a non-fastcc caller with the same
	  // signature, so it is only allowed to tail call another fastcc function.
	  if (CallerCC == CallingConv::C)
	    return true;
	  return CallerCC == CalleeCC;
	}

	/// Decide whether the call described by the arguments may be emitted as a
	/// tail call (guaranteed TCO) or sibling call (SCO) under the 64-bit SVR4
	/// ABI. Returns true only if every check below passes.
	bool
	PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
	                                    SDValue Callee,
	                                    CallingConv::ID CalleeCC,
	                                    ImmutableCallSite CS,
	                                    bool isVarArg,
	                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                    const SmallVectorImpl<ISD::InputArg> &Ins,
	                                    SelectionDAG& DAG) const {
	  const bool GuaranteedTCO = getTargetMachine().Options.GuaranteedTailCallOpt;

	  // With SCO switched off only guaranteed TCO could still apply, and variadic
	  // functions are never supported.
	  if ((DisableSCO && !GuaranteedTCO) || isVarArg)
	    return false;

	  const Function &CallerFn = DAG.getMachineFunction().getFunction();
	  const CallingConv::ID CallerCC = CallerFn.getCallingConv();

	  // The two calling conventions have to be compatible for TCO.
	  if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
	    return false;

	  // A caller that itself receives a byval parameter is not supported.
	  for (const ISD::InputArg &In : Ins)
	    if (In.Flags.isByVal())
	      return false;

	  // Neither is a callee taking a byval parameter.
	  // Note: this is a quick workaround. In some situations, e.g. when the
	  // caller's stack size > callee's stack size, sibling call optimization
	  // would still be possible. For example, gcc performs SCO for caller1 in the
	  // following example, but not for caller2.
	  //   struct test {
	  //     long int a;
	  //     char ary[56];
	  //   } gTest;
	  //   __attribute__((noinline)) int callee(struct test v, struct test *b) {
	  //     b->a = v.a;
	  //     return 0;
	  //   }
	  //   void caller1(struct test a, struct test c, struct test *b) {
	  //     callee(gTest, b); }
	  //   void caller2(struct test *b) { callee(gTest, b); }
	  for (const ISD::OutputArg &Out : Outs)
	    if (Out.Flags.isByVal())
	      return false;

	  // With differing calling conventions the parameter area offsets may differ,
	  // so nothing may be passed on the stack.
	  if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
	    return false;

	  // No TCO/SCO on indirect calls, because the caller has to restore its TOC.
	  const bool IsDirectCall =
	      isFunctionGlobalAddress(Callee) || isa<ExternalSymbolSDNode>(Callee);
	  if (!IsDirectCall)
	    return false;

	  // If caller and callee potentially use different TOC bases, the TOC pointer
	  // would have to be restored after the call, which rules out a tail call.
	  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
	  if (!callsShareTOCBase(&CallerFn, Callee, getTargetMachine()))
	    return false;

	  // Guaranteed TCO is allowed to alter the callee ABI; no further checks.
	  if (GuaranteedTCO && CalleeCC == CallingConv::Fast)
	    return true;

	  if (DisableSCO)
	    return false;

	  // SCO applies when the callee reuses the caller's argument list. Otherwise
	  // it applies only if the callee needs no stack for its arguments.
	  return hasSameArgumentList(&CallerFn, CS) ||
	         !needStackSlotPassParameters(Subtarget, Outs);
	}

	/// IsEligibleForTailCallOptimization - Decide whether a call qualifies for
	/// guaranteed tail call optimization.
	///
	/// The answer is true only when GuaranteedTailCallOpt is enabled, the call is
	/// not variadic, caller and callee both use fastcc, no incoming argument is
	/// byval, and either the relocation model is not PIC or the callee is a
	/// global with hidden or protected visibility.
	bool
	PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
	                                                     CallingConv::ID CalleeCC,
	                                                     bool isVarArg,
	                                      const SmallVectorImpl<ISD::InputArg> &Ins,
	                                                     SelectionDAG& DAG) const {
	  // Requires the guaranteed-TCO option; variadic functions are unsupported.
	  if (!getTargetMachine().Options.GuaranteedTailCallOpt || isVarArg)
	    return false;

	  const CallingConv::ID CallerCC =
	      DAG.getMachineFunction().getFunction().getCallingConv();
	  if (CalleeCC != CallingConv::Fast || CallerCC != CalleeCC)
	    return false;

	  // Functions that take byval parameters are not supported.
	  for (const ISD::InputArg &In : Ins)
	    if (In.Flags.isByVal())
	      return false;

	  // Non-PIC/GOT tail calls are supported.
	  if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
	    return true;

	  // When generating PIC, only local tail calls (same module, hidden or
	  // protected visibility) are possible at the moment.
	  const auto *CalleeNode = dyn_cast<GlobalAddressSDNode>(Callee);
	  if (CalleeNode == nullptr)
	    return false;

	  const GlobalValue *CalleeGV = CalleeNode->getGlobal();
	  return CalleeGV->hasHiddenVisibility() ||
	         CalleeGV->hasProtectedVisibility();
	}

	/// isBLACompatibleAddress - If \p Op is a constant that is representable in
	/// the immediate field of a BxA instruction, return a constant node holding
	/// the value shifted right by two (the encoded immediate); otherwise return
	/// nullptr.
	static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	  if (!C) return nullptr;

	  // Inspect the full sign-extended constant. Narrowing it to 'int' first
	  // would discard the upper half of a 64-bit constant and could accept an
	  // address whose low 32 bits merely look encodable.
	  int64_t Addr = C->getSExtValue();
	  if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
	      !isInt<26>(Addr))  // Remaining top bits have to be sext of immediate.
	    return nullptr;

	  return DAG
	      .getConstant(
	          Addr >> 2, SDLoc(Op),
	          DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
	      .getNode();
	}

	namespace {

	/// Record describing one outgoing argument that has to be written to a
	/// stack slot when performing a tail call. Entries are created by
	/// CalculateTailCallArgDest and consumed by StoreTailCallArgumentsToStackSlot.
	struct TailCallArgumentInfo {
	// The argument value to store.
	SDValue Arg;
	// Frame-index node used as the store address.
	SDValue FrameIdxOp;
	// Index of the fixed stack object backing FrameIdxOp.
	int FrameIdx = 0;

	TailCallArgumentInfo() = default;
	};

	} // end anonymous namespace

	/// StoreTailCallArgumentsToStackSlot - Emit one store per recorded tail-call
	/// argument, writing the argument to its fixed stack slot. Every store is
	/// chained on the incoming \p Chain and appended to \p MemOpChains.
	static void StoreTailCallArgumentsToStackSlot(
	    SelectionDAG &DAG, SDValue Chain,
	    const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
	    SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
	  for (const TailCallArgumentInfo &Info : TailCallArgs) {
	    // The destination is addressed through the slot's frame index.
	    MachinePointerInfo SlotInfo = MachinePointerInfo::getFixedStack(
	        DAG.getMachineFunction(), Info.FrameIdx);
	    SDValue Store =
	        DAG.getStore(Chain, dl, Info.Arg, Info.FrameIdxOp, SlotInfo);
	    MemOpChains.push_back(Store);
	  }
	}

	/// EmitTailCallStoreFPAndRetAddr - When the tail call shifts the stack
	/// (SPDiff != 0), store the old return address -- and, on the Darwin ABI, the
	/// old frame pointer -- into newly created fixed stack objects placed SPDiff
	/// bytes away from their regular save offsets. Returns the updated chain; if
	/// SPDiff is zero the chain is returned unchanged.
	static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
	                                             SDValue OldRetAddr, SDValue OldFP,
	                                             int SPDiff, const SDLoc &dl) {
	  // Nothing needs relocating when the stack pointer does not move.
	  if (!SPDiff)
	    return Chain;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
	  const PPCFrameLowering *FL = Subtarget.getFrameLowering();
	  bool Is64Bit = Subtarget.isPPC64();
	  int SlotBytes = Is64Bit ? 8 : 4;

	  // Calculate the new stack slot for the return address and store into it.
	  int RetAddrOffset = SPDiff + FL->getReturnSaveOffset();
	  int RetAddrFI =
	      MF.getFrameInfo().CreateFixedObject(SlotBytes, RetAddrOffset, true);
	  EVT SlotVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue RetAddrFIN = DAG.getFrameIndex(RetAddrFI, SlotVT);
	  Chain = DAG.getStore(Chain, dl, OldRetAddr, RetAddrFIN,
	                       MachinePointerInfo::getFixedStack(MF, RetAddrFI));

	  // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
	  // slot as the FP is never overwritten.
	  if (!Subtarget.isDarwinABI())
	    return Chain;

	  int FPOffset = SPDiff + FL->getFramePointerSaveOffset();
	  int FPFI = MF.getFrameInfo().CreateFixedObject(SlotBytes, FPOffset, true);
	  SDValue FPFIN = DAG.getFrameIndex(FPFI, SlotVT);
	  Chain = DAG.getStore(
	      Chain, dl, OldFP, FPFIN,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FPFI));
	  return Chain;
	}

	/// CalculateTailCallArgDest - Create a fixed stack object for \p Arg at
	/// offset ArgOffset + SPDiff, sized to hold the argument's value, and append
	/// a record of it to \p TailCallArguments for later processing.
	static void
	CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
	                         SDValue Arg, int SPDiff, unsigned ArgOffset,
	                         SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
	  // Size of the argument in bytes, rounded up from its bit width.
	  uint32_t ArgBytes = (Arg.getValueSizeInBits() + 7) / 8;
	  int SlotOffset = ArgOffset + SPDiff;
	  int SlotFI = MF.getFrameInfo().CreateFixedObject(ArgBytes, SlotOffset, true);

	  EVT PtrVT = isPPC64 ? MVT::i64 : MVT::i32;

	  TailCallArgumentInfo Entry;
	  Entry.Arg = Arg;
	  Entry.FrameIdxOp = DAG.getFrameIndex(SlotFI, PtrVT);
	  Entry.FrameIdx = SlotFI;
	  TailCallArguments.push_back(Entry);
	}

	/// EmitTailCallLoadFPAndRetAddr - Emit loads from the return address stack
	/// slot and, on the Darwin ABI, the frame pointer stack slot. The loaded
	/// values are returned through LROpOut / FPOpOut and the updated chain is the
	/// function result. When SPDiff is zero nothing is emitted and the outputs
	/// are left untouched. Used when tail calling.
	SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
	    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
	    SDValue &FPOpOut, const SDLoc &dl) const {
	  if (!SPDiff)
	    return Chain;

	  // Load the LR and FP stack slot for later adjusting.
	  EVT SlotVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;

	  LROpOut = getReturnAddrFrameIndex(DAG);
	  LROpOut = DAG.getLoad(SlotVT, dl, Chain, LROpOut, MachinePointerInfo());
	  // Result #1 of the load is its output chain.
	  Chain = SDValue(LROpOut.getNode(), 1);

	  // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
	  // slot as the FP is never overwritten.
	  if (!Subtarget.isDarwinABI())
	    return Chain;

	  FPOpOut = getFramePointerFrameIndex(DAG);
	  FPOpOut = DAG.getLoad(SlotVT, dl, Chain, FPOpOut, MachinePointerInfo());
	  Chain = SDValue(FPOpOut.getNode(), 1);
	  return Chain;
	}

	/// CreateCopyOfByValArgument - Emit a memcpy of the aggregate at "Src" to
	/// "Dst". The byte count and the alignment are both taken from the byval
	/// information in "Flags". The copy will be passed as a byval function
	/// parameter.
	/// Sometimes what we are copying is the end of a larger object, the part that
	/// does not fit in registers.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  unsigned ByValBytes = Flags.getByValSize();
	  unsigned ByValAlignment = Flags.getByValAlign();
	  SDValue ByteCount = DAG.getConstant(ByValBytes, dl, MVT::i32);
	  return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, ByValAlignment,
	                       /*isVol=*/false, /*AlwaysInline=*/false,
	                       /*isTailCall=*/false, MachinePointerInfo(),
	                       MachinePointerInfo());
	}

	/// LowerMemOpCallTo - For a regular call, store the argument to the stack and
	/// append the store to MemOpChains. For a tail call, only record where the
	/// argument has to go (in TailCallArguments) so it can be stored later.
	static void LowerMemOpCallTo(
	    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
	    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
	    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
	    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
	  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  if (isTailCall) {
	    // Calculate and remember argument location.
	    CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
	                             TailCallArguments);
	    return;
	  }

	  if (isVector) {
	    // Vector arguments ignore the incoming PtrOff and are addressed as
	    // stack pointer + ArgOffset instead.
	    SDValue StackPtr = isPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
	                               : DAG.getRegister(PPC::R1, MVT::i32);
	    SDValue OffsetNode = DAG.getConstant(ArgOffset, dl, PtrVT);
	    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetNode);
	  }

	  SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
	  MemOpChains.push_back(Store);
	}

	/// PrepareTailCall - Emit everything that has to precede the tail call node:
	/// the stores of the recorded tail-call arguments, the relocation of the
	/// return address (and frame pointer where applicable), and the CALLSEQ_END.
	/// \p Chain and \p InFlag are updated in place; on return InFlag is the glue
	/// result of the CALLSEQ_END node.
	static void
	PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
	                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
	                SDValue FPOp,
	                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
	  // Do not flag preceding copytoreg stuff together with the following stuff.
	  InFlag = SDValue();

	  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
	  // might overwrite each other in case of tail call optimization.
	  SmallVector<SDValue, 8> ArgStores;
	  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, ArgStores,
	                                    dl);
	  if (!ArgStores.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArgStores);

	  // Store the return address to the appropriate stack slot.
	  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);

	  // Emit callseq_end just before tailcall node.
	  SDValue FrameBytes = DAG.getIntPtrConstant(NumBytes, dl, true);
	  SDValue NoCalleePop = DAG.getIntPtrConstant(0, dl, true);
	  Chain = DAG.getCALLSEQ_END(Chain, FrameBytes, NoCalleePop, InFlag, dl);
	  InFlag = Chain.getValue(1);
	}

	// Is this global address that of a function that can be called by name? (as
	// opposed to something that must hold a descriptor for an indirect call).
	// Returns false for anything that is not a global address node, for TLS
	// global addresses, and for globals whose value type is not a function type.
	static bool isFunctionGlobalAddress(SDValue Callee) {
	  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	  if (!G)
	    return false;

	  // TLS addresses are never callable by name.
	  const unsigned Opc = Callee.getOpcode();
	  if (Opc == ISD::GlobalTLSAddress || Opc == ISD::TargetGlobalTLSAddress)
	    return false;

	  return G->getGlobal()->getValueType()->isFunctionTy();
	}

	/// PrepareCall - Fill in the operand list (\p Ops) and result types
	/// (\p NodeTys) for the call node and return the opcode to use for it.
	///
	/// A callee that is an encodable absolute address (only tried when not on
	/// 64-bit SVR4), a function global address or an external symbol is rewritten
	/// to its direct-call form and PPCISD::CALL is returned. Patchpoints are
	/// always treated as direct calls. Every other callee is an indirect call:
	/// it is moved to CTR with PPCISD::MTCTR, \p Callee is reset to a null node
	/// and PPCISD::BCTRL is returned.
	///
	/// \p Callee, \p InFlag and \p Chain are updated in place.
	static unsigned
	PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
	            SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall,
	            bool isPatchPoint, bool hasNest,
	            SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
	            SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
	            ImmutableCallSite CS, const PPCSubtarget &Subtarget) {
	  bool isPPC64 = Subtarget.isPPC64();
	  bool isSVR4ABI = Subtarget.isSVR4ABI();
	  bool isELFv2ABI = Subtarget.isELFv2ABI();

	  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  NodeTys.push_back(MVT::Other); // Returns a chain
	  NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use.

	  unsigned CallOpc = PPCISD::CALL;

	  bool needIndirectCall = true;
	  if (!isSVR4ABI || !isPPC64) {
	    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
	      // If this is an absolute destination address, use the munged value.
	      Callee = SDValue(Dest, 0);
	      needIndirectCall = false;
	    }
	  }

	  // PC-relative references to external symbols should go through $stub, unless
	  // we're building with the leopard linker or later, which automatically
	  // synthesizes these stubs.
	  const TargetMachine &TM = DAG.getTarget();
	  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
	  const GlobalValue *GV = nullptr;
	  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
	    GV = G->getGlobal();
	  bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
	  bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;

	  if (isFunctionGlobalAddress(Callee)) {
	    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
	    // A call to a TLS address is actually an indirect call to a
	    // thread-specific pointer.
	    unsigned OpFlags = 0;
	    if (UsePlt)
	      OpFlags = PPCII::MO_PLT;

	    // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
	    // every direct call is) turn it into a TargetGlobalAddress /
	    // TargetExternalSymbol node so that legalize doesn't hack it.
	    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
	                                        Callee.getValueType(), 0, OpFlags);
	    needIndirectCall = false;
	  }

	  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	    unsigned char OpFlags = 0;

	    if (UsePlt)
	      OpFlags = PPCII::MO_PLT;

	    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
	                                         OpFlags);
	    needIndirectCall = false;
	  }

	  if (isPatchPoint) {
	    // We'll form an invalid direct call when lowering a patchpoint; the full
	    // sequence for an indirect call is complicated, and many of the
	    // instructions introduced might have side effects (and, thus, can't be
	    // removed later). The call itself will be removed as soon as the
	    // argument/return lowering is complete, so the fact that it has the wrong
	    // kind of operands should not really matter.
	    needIndirectCall = false;
	  }

	  if (needIndirectCall) {
	    // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair
	    // to do the call, we can't use PPCISD::CALL.
	    SDValue MTCTROps[] = {Chain, Callee, InFlag};

	    if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
	      // Function pointers in the 64-bit SVR4 ABI do not point to the function
	      // entry point, but to the function descriptor (the function entry point
	      // address is part of the function descriptor though).
	      // The function descriptor is a three doubleword structure with the
	      // following fields: function entry point, TOC base address and
	      // environment pointer.
	      // Thus for a call through a function pointer, the following actions need
	      // to be performed:
	      // 1. Save the TOC of the caller in the TOC save area of its stack
	      // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
	      // 2. Load the address of the function entry point from the function
	      // descriptor.
	      // 3. Load the TOC of the callee from the function descriptor into r2.
	      // 4. Load the environment pointer from the function descriptor into
	      // r11.
	      // 5. Branch to the function entry point address.
	      // 6. On return of the callee, the TOC of the caller needs to be
	      // restored (this is done in FinishCall()).
	      //
	      // The loads are scheduled at the beginning of the call sequence, and the
	      // register copies are flagged together to ensure that no other
	      // operations can be scheduled in between. E.g. without flagging the
	      // copies together, a TOC access in the caller could be scheduled between
	      // the assignment of the callee TOC and the branch to the callee, which
	      // results in the TOC access going through the TOC of the callee instead
	      // of going through the TOC of the caller, which leads to incorrect code.

	      // Load the address of the function entry point from the function
	      // descriptor.
	      SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1);
	      if (LDChain.getValueType() == MVT::Glue)
	        LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2);

	      auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
	                          ? (MachineMemOperand::MODereferenceable |
	                             MachineMemOperand::MOInvariant)
	                          : MachineMemOperand::MONone;

	      MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
	      SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
	                                        /* Alignment = */ 8, MMOFlags);

	      // Load environment pointer into r11.
	      SDValue PtrOff = DAG.getIntPtrConstant(16, dl);
	      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
	      SDValue LoadEnvPtr =
	          DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16),
	                      /* Alignment = */ 8, MMOFlags);

	      // Load the callee's TOC base, stored 8 bytes into the descriptor.
	      SDValue TOCOff = DAG.getIntPtrConstant(8, dl);
	      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
	      SDValue TOCPtr =
	          DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8),
	                      /* Alignment = */ 8, MMOFlags);

	      setUsesTOCBasePtr(DAG);
	      SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr,
	                                        InFlag);
	      Chain = TOCVal.getValue(0);
	      InFlag = TOCVal.getValue(1);

	      // If the function call has an explicit 'nest' parameter, it takes the
	      // place of the environment pointer.
	      if (!hasNest) {
	        SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
	                                          InFlag);

	        Chain = EnvVal.getValue(0);
	        InFlag = EnvVal.getValue(1);
	      }

	      MTCTROps[0] = Chain;
	      MTCTROps[1] = LoadFuncPtr;
	      MTCTROps[2] = InFlag;
	    }

	    // Only pass the glue operand when there is one.
	    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
	                        makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
	    InFlag = Chain.getValue(1);

	    NodeTys.clear();
	    NodeTys.push_back(MVT::Other);
	    NodeTys.push_back(MVT::Glue);
	    Ops.push_back(Chain);
	    CallOpc = PPCISD::BCTRL;
	    Callee.setNode(nullptr);
	    // Add use of X11 (holding environment pointer)
	    if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest)
	      Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
	    // Add CTR register as callee so a bctr can be emitted later.
	    if (isTailCall)
	      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
	  }

	  // If this is a direct call, pass the chain and the callee.
	  if (Callee.getNode()) {
	    Ops.push_back(Chain);
	    Ops.push_back(Callee);
	  }
	  // If this is a tail call add stack pointer delta.
	  if (isTailCall)
	    Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));

	  // Add argument registers to the end of the list so that they are known live
	  // into the call.
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	                                  RegsToPass[i].second.getValueType()));

	  // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
	  // into the call.
	  if (isSVR4ABI && isPPC64 && !isPatchPoint) {
	    setUsesTOCBasePtr(DAG);
	    Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
	  }

	  return CallOpc;
	}

	/// LowerCallResult - Copy the values returned by a call out of the physical
	/// registers assigned by RetCC_PPC, convert each from its location type to
	/// its value type, and append them to \p InVals. Returns the updated chain.
	SDValue PPCTargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                    *DAG.getContext());
	  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);

	  // Copy all of the result registers out of their specified physreg.
	  for (CCValAssign &VA : RVLocs) {
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    // The copy yields the value (0), the chain (1) and the glue (2).
	    SDValue Val = DAG.getCopyFromReg(Chain, dl,
	                                     VA.getLocReg(), VA.getLocVT(), InFlag);
	    Chain = Val.getValue(1);
	    InFlag = Val.getValue(2);

	    const CCValAssign::LocInfo Info = VA.getLocInfo();
	    switch (Info) {
	    default:
	      llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::AExt:
	      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	      break;
	    case CCValAssign::ZExt:
	    case CCValAssign::SExt: {
	      // Record how the value was extended, then narrow it to the value type.
	      const unsigned AssertOpc =
	          Info == CCValAssign::ZExt ? ISD::AssertZext : ISD::AssertSext;
	      Val = DAG.getNode(AssertOpc, dl, VA.getLocVT(), Val,
	                        DAG.getValueType(VA.getValVT()));
	      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	      break;
	    }
	    }

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	/// FinishCall - Emit the actual call node and everything that follows it.
	///
	/// PrepareCall() builds the operand list and selects the opcode. This
	/// function then appends the implicit CR1EQ use (32-bit SVR4 vararg calls),
	/// the call-preserved register mask and the incoming glue. For a tail call a
	/// PPCISD::TC_RETURN node is returned. Otherwise the call node is emitted
	/// (switching to BCTRL_LOAD_TOC or CALL_NOP on 64-bit SVR4 where required),
	/// followed by CALLSEQ_END, and the results are lowered via LowerCallResult().
	SDValue PPCTargetLowering::FinishCall(
	    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
	    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
	    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
	    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
	    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
	    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
	  std::vector<EVT> NodeTys;
	  SmallVector<SDValue, 8> Ops;
	  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
	                                 SPDiff, isTailCall, isPatchPoint, hasNest,
	                                 RegsToPass, Ops, NodeTys, CS, Subtarget);

	  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
	  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
	    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

	  // When performing tail call optimization the callee pops its arguments off
	  // the stack. Account for this here so these bytes can be pushed back on in
	  // PPCFrameLowering::eliminateCallFramePseudoInstr.
	  int BytesCalleePops =
	      (CallConv == CallingConv::Fast &&
	       getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;

	  // Add a register mask operand representing the call-preserved registers.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *Mask =
	      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  if (InFlag.getNode())
	    Ops.push_back(InFlag);

	  // Emit tail call.
	  if (isTailCall) {
	    assert(((Callee.getOpcode() == ISD::Register &&
	             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
	            Callee.getOpcode() == ISD::TargetExternalSymbol ||
	            Callee.getOpcode() == ISD::TargetGlobalAddress ||
	            isa<ConstantSDNode>(Callee)) &&
	    "Expecting an global address, external symbol, absolute value or register");

	    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
	    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
	  }

	  // Add a NOP immediately after the branch instruction when using the 64-bit
	  // SVR4 ABI. At link time, if caller and callee are in a different module and
	  // thus have a different TOC, the call will be replaced with a call to a stub
	  // function which saves the current TOC, loads the TOC of the callee and
	  // branches to the callee. The NOP will be replaced with a load instruction
	  // which restores the TOC of the caller from the TOC save slot of the current
	  // stack frame. If caller and callee belong to the same module (and have the
	  // same TOC), the NOP will remain unchanged.

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
	      !isPatchPoint) {
	    if (CallOpc == PPCISD::BCTRL) {
	      // This is a call through a function pointer.
	      // Restore the caller TOC from the save area into R2.
	      // See PrepareCall() for more information about calls through function
	      // pointers in the 64-bit SVR4 ABI.
	      // We are using a target-specific load with r2 hard coded, because the
	      // result of a target-independent load would never go directly into r2,
	      // since r2 is a reserved register (which prevents the register allocator
	      // from allocating it), resulting in an additional register being
	      // allocated and an unnecessary move instruction being generated.
	      CallOpc = PPCISD::BCTRL_LOAD_TOC;

	      EVT PtrVT = getPointerTy(DAG.getDataLayout());
	      SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
	      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
	      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
	      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);

	      // The address needs to go after the chain input but before the flag (or
	      // any other variadic arguments).
	      Ops.insert(std::next(Ops.begin()), AddTOC);
	    } else if (CallOpc == PPCISD::CALL &&
	               !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) {
	      // Otherwise insert NOP for non-local calls.
	      CallOpc = PPCISD::CALL_NOP;
	    }
	  }

	  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
	  InFlag = Chain.getValue(1);

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
	                             DAG.getIntPtrConstant(BytesCalleePops, dl, true),
	                             InFlag, dl);
	  // The glue of CALLSEQ_END is only needed when results have to be copied out.
	  if (!Ins.empty())
	    InFlag = Chain.getValue(1);

	  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
	                         Ins, dl, DAG, InVals);
	}

	/// LowerCall - Entry point for lowering a call. Decides whether a requested
	/// tail call can actually be performed (writing the decision back into
	/// CLI.IsTailCall), reports a fatal error if a musttail call site cannot be
	/// tail called, and then dispatches to the ABI-specific lowering routine:
	/// LowerCall_64SVR4, LowerCall_32SVR4 or LowerCall_Darwin.
	SDValue
	PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	// Bound by reference: assignments below update CLI.IsTailCall itself.
	bool &isTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool isVarArg = CLI.IsVarArg;
	bool isPatchPoint = CLI.IsPatchPoint;
	ImmutableCallSite CS = CLI.CS;

	if (isTailCall) {
	// With long calls enabled, tail calls are dropped unless the call site is
	// marked musttail.
	if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
	isTailCall = false;
	else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
	isTailCall =
	IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
	isVarArg, Outs, Ins, DAG);
	else
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
	Ins, DAG);
	if (isTailCall) {
	// Update the statistics; without GuaranteedTailCallOpt the tail call is
	// also counted as a sibling call.
	++NumTailCalls;
	if (!getTargetMachine().Options.GuaranteedTailCallOpt)
	++NumSiblingCalls;

	assert(isa<GlobalAddressSDNode>(Callee) &&
	"Callee should be an llvm::Function object.");
	// Debug output only: print the caller name and the callee's visibility
	// and linkage.
	DEBUG(
	const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
	const unsigned Width = 80 - strlen("TCO caller: ")
	- strlen(", callee linkage: 0, 0");
	dbgs() << "TCO caller: "
	<< left_justify(DAG.getMachineFunction().getName(), Width)
	<< ", callee linkage: "
	<< GV->getVisibility() << ", " << GV->getLinkage() << "\n"
	);
	}
	}

	// A musttail call site that could not be turned into a tail call is a hard
	// error.
	if (!isTailCall && CS && CS.isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	// When long calls (i.e. indirect calls) are always used, calls are always
	// made via function pointer. If we have a function name, first translate it
	// into a pointer.
	if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
	!isTailCall)
	Callee = LowerGlobalAddress(Callee, DAG);

	// Dispatch on the ABI: 64-bit SVR4, 32-bit SVR4, otherwise Darwin.
	if (Subtarget.isSVR4ABI()) {
	if (Subtarget.isPPC64())
	return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
	isTailCall, isPatchPoint, Outs, OutVals, Ins,
	dl, DAG, InVals, CS);
	else
	return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
	isTailCall, isPatchPoint, Outs, OutVals, Ins,
	dl, DAG, InVals, CS);
	}

	return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
	isTailCall, isPatchPoint, Outs, OutVals, Ins,
	dl, DAG, InVals, CS);
	}

	/// LowerCall_32SVR4 - Lower a call for the 32-bit SVR4 ABI.
	///
	/// Assigns a location (register or stack) to every outgoing argument, copies
	/// byval aggregates into the caller's frame outside the call sequence, emits
	/// the argument stores and register copies, sets or clears CR bit 6 for
	/// vararg calls, and hands over to PrepareTailCall() / FinishCall() to emit
	/// the call itself. Only the C and Fast calling conventions are accepted.
	SDValue PPCTargetLowering::LowerCall_32SVR4(
	    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
	    bool isTailCall, bool isPatchPoint,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    ImmutableCallSite CS) const {
	  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
	  // of the 32-bit SVR4 ABI stack frame layout.

	  assert((CallConv == CallingConv::C ||
	          CallConv == CallingConv::Fast) && "Unknown calling convention!");

	  unsigned PtrByteSize = 4;

	  MachineFunction &MF = DAG.getMachineFunction();

	  // Mark this function as potentially containing a function that contains a
	  // tail call. As a consequence the frame pointer will be used for dynamicalloc
	  // and restoring the callers stack pointer in this functions epilog. This is
	  // done because by tail calling the called function might overwrite the value
	  // in this function's (MF) stack pointer stack slot 0(SP).
	  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
	      CallConv == CallingConv::Fast)
	    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

	  // Count how many bytes are to be pushed on the stack, including the linkage
	  // area, parameter list area and the part of the local variable space which
	  // contains copies of aggregates which are passed by value.

	  // Assign locations to all of the outgoing arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	  // Reserve space for the linkage area on the stack.
	  CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
	                       PtrByteSize);
	  if (useSoftFloat())
	    CCInfo.PreAnalyzeCallOperands(Outs);

	  if (isVarArg) {
	    // Handle fixed and variable vector arguments differently.
	    // Fixed vector arguments go into registers as long as registers are
	    // available. Variable vector arguments always go into memory.
	    unsigned NumArgs = Outs.size();

	    for (unsigned i = 0; i != NumArgs; ++i) {
	      MVT ArgVT = Outs[i].VT;
	      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	      bool Result;

	      if (Outs[i].IsFixed) {
	        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
	                               CCInfo);
	      } else {
	        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
	                                      ArgFlags, CCInfo);
	      }

	      // A true result means the assignment function could not handle the
	      // operand.
	      if (Result) {
	#ifndef NDEBUG
	        errs() << "Call operand #" << i << " has unhandled type "
	               << EVT(ArgVT).getEVTString() << "\n";
	#endif
	        llvm_unreachable(nullptr);
	      }
	    }
	  } else {
	    // All arguments are treated the same.
	    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
	  }
	  CCInfo.clearWasPPCF128();

	  // Assign locations to all of the outgoing aggregate by value arguments.
	  SmallVector<CCValAssign, 16> ByValArgLocs;
	  CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());

	  // Reserve stack space for the allocations in CCInfo.
	  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

	  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);

	  // Size of the linkage area, parameter list area and the part of the local
	  // space variable where copies of aggregates which are passed by value are
	  // stored.
	  unsigned NumBytes = CCByValInfo.getNextStackOffset();

	  // Calculate by how many bytes the stack has to be adjusted in case of tail
	  // call optimization.
	  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

	  // Adjust the stack pointer for the new arguments...
	  // These operations are automatically eliminated by the prolog/epilog pass
	  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
	  SDValue CallSeqStart = Chain;

	  // Load the return address and frame pointer so it can be moved somewhere else
	  // later.
	  SDValue LROp, FPOp;
	  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

	  // Set up a copy of the stack pointer for use loading and storing any
	  // arguments that may not fit in the registers available for argument
	  // passing.
	  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

	  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
	  SmallVector<SDValue, 8> MemOpChains;

	  bool seenFloatArg = false;
	  // Walk the register/memloc assignments, inserting copies/loads.
	  // 'i' indexes ArgLocs/OutVals/Outs; 'j' indexes ByValArgLocs and advances
	  // only for byval arguments.
	  for (unsigned i = 0, j = 0, e = ArgLocs.size();
	       i != e;
	       ++i) {
	    CCValAssign &VA = ArgLocs[i];
	    SDValue Arg = OutVals[i];
	    ISD::ArgFlagsTy Flags = Outs[i].Flags;

	    if (Flags.isByVal()) {
	      // Argument is an aggregate which is passed by value, thus we need to
	      // create a copy of it in the local variable space of the current stack
	      // frame (which is the stack frame of the caller) and pass the address of
	      // this copy to the callee.
	      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
	      CCValAssign &ByValVA = ByValArgLocs[j++];
	      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");

	      // Memory reserved in the local variable space of the callers stack frame.
	      unsigned LocMemOffset = ByValVA.getLocMemOffset();

	      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
	      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
	                           StackPtr, PtrOff);

	      // Create a copy of the argument in the local area of the current
	      // stack frame.
	      SDValue MemcpyCall =
	          CreateCopyOfByValArgument(Arg, PtrOff,
	                                    CallSeqStart.getNode()->getOperand(0),
	                                    Flags, DAG, dl);

	      // This must go outside the CALLSEQ_START..END.
	      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
	                                                     SDLoc(MemcpyCall));
	      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
	                             NewCallSeqStart.getNode());
	      Chain = CallSeqStart = NewCallSeqStart;

	      // Pass the address of the aggregate copy on the stack either in a
	      // physical register or in the parameter list area of the current stack
	      // frame to the callee.
	      Arg = PtrOff;
	    }

	    if (VA.isRegLoc()) {
	      if (Arg.getValueType() == MVT::i1)
	        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);

	      seenFloatArg |= VA.getLocVT().isFloatingPoint();
	      // Put argument in a physical register.
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	    } else {
	      // Put argument in the parameter list area of the current stack frame.
	      assert(VA.isMemLoc());
	      unsigned LocMemOffset = VA.getLocMemOffset();

	      if (!isTailCall) {
	        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
	        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
	                             StackPtr, PtrOff);

	        MemOpChains.push_back(
	            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
	      } else {
	        // Calculate and remember argument location.
	        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
	                                 TailCallArguments);
	      }
	    }
	  }

	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	  // Build a sequence of copy-to-reg nodes chained together with token chain
	  // and flag operands which copy the outgoing args into the appropriate regs.
	  SDValue InFlag;
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	                             RegsToPass[i].second, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  // Set CR bit 6 to true if this is a vararg call with floating args passed in
	  // registers.
	  if (isVarArg) {
	    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	    SDValue Ops[] = { Chain, InFlag };

	    // Only pass the glue operand when there is one.
	    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
	                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));

	    InFlag = Chain.getValue(1);
	  }

	  if (isTailCall)
	    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
	                    TailCallArguments);

	  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
	                    /* unused except on PPC64 ELFv1 */ false, DAG,
	                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
	                    NumBytes, Ins, InVals, CS);
	}

	// Copy a byval argument into memory at PtrOff while keeping the copy outside
	// the call sequence of the call the argument belongs to: the memcpy is
	// chained on the input chain of the existing CALLSEQ_START, a new
	// CALLSEQ_START with the same frame size is created after the memcpy, and
	// all uses of the old one are redirected to it. Returns the new CALLSEQ_START.
	SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
	    SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
	    SelectionDAG &DAG, const SDLoc &dl) const {
	  SDNode *OldStart = CallSeqStart.getNode();

	  // Operand 0 of CALLSEQ_START is its incoming chain.
	  SDValue Copy = CreateCopyOfByValArgument(Arg, PtrOff, OldStart->getOperand(0),
	                                           Flags, DAG, dl);

	  // The MEMCPY must go outside the CALLSEQ_START..END.
	  int64_t FrameBytes = CallSeqStart.getConstantOperandVal(1);
	  SDValue NewStart = DAG.getCALLSEQ_START(Copy, FrameBytes, 0, SDLoc(Copy));
	  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewStart.getNode());
	  return NewStart;
	}

	/// Lower an outgoing call for the 64-bit SVR4 ABI (ELFv1 or ELFv2).
	///
	/// The lowering proceeds in these steps:
	///  1. Decide whether a parameter save area is needed and compute the number
	///     of stack bytes (linkage area plus parameter area) the call uses.
	///  2. Open the call sequence (skipped for sibling calls) and, for tail
	///     calls, load the return address / frame pointer.
	///  3. Walk the outgoing arguments, assigning each one to GPRs, FPRs, VRs
	///     (or QPX registers) and/or a stack slot; by-value aggregates are copied
	///     outside the call sequence via createMemcpyOutsideCallSeq().
	///  4. For indirect calls, save r2 to the TOC save slot and, on ELFv2, pass
	///     the callee address in X12.
	///  5. Emit the copy-to-reg chain and hand over to FinishCall().
	///
	/// \param Chain         Incoming token chain.
	/// \param Callee        The call target.
	/// \param CallConv      Calling convention of the call; fastcc must not be
	///                      combined with varargs (asserted below).
	/// \param isVarArg      True if the callee is variadic.
	/// \param isTailCall    True if the call is to be emitted as a tail call.
	/// \param isPatchPoint  True if the call originates from a patchpoint.
	/// \param Outs,OutVals  Flags/types and values of the outgoing arguments.
	/// \param Ins,InVals    Description of, and storage for, the call results;
	///                      both are forwarded to FinishCall().
	/// \param dl            Debug location used for all created nodes.
	/// \param DAG           The selection DAG being built.
	/// \param CS            The originating call site, forwarded to FinishCall().
	/// \returns the value produced by FinishCall().
	SDValue PPCTargetLowering::LowerCall_64SVR4(
	    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
	    bool isTailCall, bool isPatchPoint,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    ImmutableCallSite CS) const {
	  bool isELFv2ABI = Subtarget.isELFv2ABI();
	  bool isLittleEndian = Subtarget.isLittleEndian();
	  unsigned NumOps = Outs.size();
	  bool hasNest = false;
	  bool IsSibCall = false;

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  unsigned PtrByteSize = 8;

	  MachineFunction &MF = DAG.getMachineFunction();

	  // A tail call without GuaranteedTailCallOpt is lowered as a sibling call.
	  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
	    IsSibCall = true;

	  // Mark this function as potentially containing a function that contains a
	  // tail call. As a consequence the frame pointer will be used for dynamicalloc
	  // and restoring the callers stack pointer in this functions epilog. This is
	  // done because by tail calling the called function might overwrite the value
	  // in this function's (MF) stack pointer stack slot 0(SP).
	  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
	      CallConv == CallingConv::Fast)
	    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

	  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
	         "fastcc not supported on varargs functions");

	  // Count how many bytes are to be pushed on the stack, including the linkage
	  // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
	  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
	  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
	  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
	  unsigned NumBytes = LinkageSize;
	  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
	  // QPX registers share the FPR allocation index.
	  unsigned &QFPR_idx = FPR_idx;

	  static const MCPhysReg GPR[] = {
	    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
	    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
	  };
	  static const MCPhysReg VR[] = {
	    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
	    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
	  };

	  const unsigned NumGPRs = array_lengthof(GPR);
	  const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
	  const unsigned NumVRs = array_lengthof(VR);
	  const unsigned NumQFPRs = NumFPRs;

	  // On ELFv2, we can avoid allocating the parameter area if all the arguments
	  // can be passed to the callee in registers.
	  // For the fast calling convention, there is another check below.
	  // Note: We should keep consistent with LowerFormalArguments_64SVR4()
	  bool HasParameterArea =
	      !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
	  if (!HasParameterArea) {
	    unsigned ParamAreaSize = NumGPRs * PtrByteSize;
	    unsigned AvailableFPRs = NumFPRs;
	    unsigned AvailableVRs = NumVRs;
	    unsigned NumBytesTmp = NumBytes;
	    for (unsigned i = 0; i != NumOps; ++i) {
	      if (Outs[i].Flags.isNest()) continue;
	      if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
	                                 PtrByteSize, LinkageSize, ParamAreaSize,
	                                 NumBytesTmp, AvailableFPRs, AvailableVRs,
	                                 Subtarget.hasQPX()))
	        HasParameterArea = true;
	    }
	  }

	  // When using the fast calling convention, we don't provide backing for
	  // arguments that will be in registers.
	  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;

	  // Add up all the space actually used.
	  for (unsigned i = 0; i != NumOps; ++i) {
	    ISD::ArgFlagsTy Flags = Outs[i].Flags;
	    EVT ArgVT = Outs[i].VT;
	    EVT OrigVT = Outs[i].ArgVT;

	    if (Flags.isNest())
	      continue;

	    if (CallConv == CallingConv::Fast) {
	      if (Flags.isByVal())
	        NumGPRsUsed += (Flags.getByValSize()+7)/8;
	      else
	        // Arguments that still fit in a register take no stack space
	        // ('continue'); the rest fall out of the switch and are counted.
	        switch (ArgVT.getSimpleVT().SimpleTy) {
	        default: llvm_unreachable("Unexpected ValueType for argument!");
	        case MVT::i1:
	        case MVT::i32:
	        case MVT::i64:
	          if (++NumGPRsUsed <= NumGPRs)
	            continue;
	          break;
	        case MVT::v4i32:
	        case MVT::v8i16:
	        case MVT::v16i8:
	        case MVT::v2f64:
	        case MVT::v2i64:
	        case MVT::v1i128:
	          if (++NumVRsUsed <= NumVRs)
	            continue;
	          break;
	        case MVT::v4f32:
	          // When using QPX, this is handled like a FP register, otherwise, it
	          // is an Altivec register.
	          if (Subtarget.hasQPX()) {
	            if (++NumFPRsUsed <= NumFPRs)
	              continue;
	          } else {
	            if (++NumVRsUsed <= NumVRs)
	              continue;
	          }
	          break;
	        case MVT::f32:
	        case MVT::f64:
	        case MVT::v4f64: // QPX
	        case MVT::v4i1: // QPX
	          if (++NumFPRsUsed <= NumFPRs)
	            continue;
	          break;
	        }
	    }

	    /* Respect alignment of argument on the stack. */
	    unsigned Align =
	        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
	    NumBytes = ((NumBytes + Align - 1) / Align) * Align;

	    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
	    if (Flags.isInConsecutiveRegsLast())
	      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	  }

	  // Remembered so the argument loop below can be cross-checked against it.
	  unsigned NumBytesActuallyUsed = NumBytes;

	  // In the old ELFv1 ABI,
	  // the prolog code of the callee may store up to 8 GPR argument registers to
	  // the stack, allowing va_start to index over them in memory if it's varargs.
	  // Because we cannot tell if this is needed on the caller side, we have to
	  // conservatively assume that it is needed. As such, make sure we have at
	  // least enough stack space for the caller to store the 8 GPRs.
	  // In the ELFv2 ABI, we allocate the parameter area iff a callee
	  // really requires memory operands, e.g. a vararg function.
	  if (HasParameterArea)
	    NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
	  else
	    NumBytes = LinkageSize;

	  // Tail call needs the stack to be aligned.
	  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
	      CallConv == CallingConv::Fast)
	    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

	  int SPDiff = 0;

	  // Calculate by how many bytes the stack has to be adjusted in case of tail
	  // call optimization.
	  if (!IsSibCall)
	    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

	  // To protect arguments on the stack from being clobbered in a tail call,
	  // force all the loads to happen before doing any other lowering.
	  if (isTailCall)
	    Chain = DAG.getStackArgumentTokenFactor(Chain);

	  // Adjust the stack pointer for the new arguments...
	  // These operations are automatically eliminated by the prolog/epilog pass
	  if (!IsSibCall)
	    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
	  SDValue CallSeqStart = Chain;

	  // Load the return address and frame pointer so they can be moved somewhere
	  // else later.
	  SDValue LROp, FPOp;
	  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

	  // Set up a copy of the stack pointer for use loading and storing any
	  // arguments that may not fit in the registers available for argument
	  // passing.
	  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);

	  // Figure out which arguments are going to go in registers, and which in
	  // memory. Also, if this is a vararg function, floating point operations
	  // must be stored to our stack, and loaded into integer regs as well, if
	  // any integer regs are available for argument passing.
	  unsigned ArgOffset = LinkageSize;

	  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

	  SmallVector<SDValue, 8> MemOpChains;
	  for (unsigned i = 0; i != NumOps; ++i) {
	    SDValue Arg = OutVals[i];
	    ISD::ArgFlagsTy Flags = Outs[i].Flags;
	    EVT ArgVT = Outs[i].VT;
	    EVT OrigVT = Outs[i].ArgVT;

	    // PtrOff will be used to store the current argument to the stack if a
	    // register cannot be found for it.
	    SDValue PtrOff;

	    // We re-align the argument offset for each argument, except when using the
	    // fast calling convention, when we need to make sure we do that only when
	    // we'll actually use a stack slot.
	    auto ComputePtrOff = [&]() {
	      /* Respect alignment of argument on the stack. */
	      unsigned Align =
	          CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
	      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;

	      PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

	      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
	    };

	    if (CallConv != CallingConv::Fast) {
	      ComputePtrOff();

	      /* Compute GPR index associated with argument offset. */
	      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
	      GPR_idx = std::min(GPR_idx, NumGPRs);
	    }

	    // Promote integers to 64-bit values.
	    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
	      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
	      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
	    }

	    // FIXME memcpy is used way more than necessary. Correctness first.
	    // Note: "by value" is code for passing a structure by value, not
	    // basic types.
	    if (Flags.isByVal()) {
	      // Note: Size includes alignment padding, so
	      // struct x { short a; char b; }
	      // will have Size = 4. With #pragma pack(1), it will have Size = 3.
	      // These are the proper values we need for right-justifying the
	      // aggregate in a parameter register.
	      unsigned Size = Flags.getByValSize();

	      // An empty aggregate parameter takes up no storage and no
	      // registers.
	      if (Size == 0)
	        continue;

	      if (CallConv == CallingConv::Fast)
	        ComputePtrOff();

	      // All aggregates smaller than 8 bytes must be passed right-justified.
	      if (Size==1 || Size==2 || Size==4) {
	        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
	        if (GPR_idx != NumGPRs) {
	          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
	                                        MachinePointerInfo(), VT);
	          MemOpChains.push_back(Load.getValue(1));
	          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

	          ArgOffset += PtrByteSize;
	          continue;
	        }
	      }

	      // No GPR left: copy the small aggregate into its stack slot,
	      // offset within the doubleword on big-endian targets.
	      if (GPR_idx == NumGPRs && Size < 8) {
	        SDValue AddPtr = PtrOff;
	        if (!isLittleEndian) {
	          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
	                                          PtrOff.getValueType());
	          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
	        }
	        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
	                                                          CallSeqStart,
	                                                          Flags, DAG, dl);
	        ArgOffset += PtrByteSize;
	        continue;
	      }
	      // Copy entire object into memory. There are cases where gcc-generated
	      // code assumes it is there, even if it could be put entirely into
	      // registers. (This is not what the doc says.)

	      // FIXME: The above statement is likely due to a misunderstanding of the
	      // documents. All arguments must be copied into the parameter area BY
	      // THE CALLEE in the event that the callee takes the address of any
	      // formal argument. That has not yet been implemented. However, it is
	      // reasonable to use the stack area as a staging area for the register
	      // load.

	      // Skip this for small aggregates, as we will use the same slot for a
	      // right-justified copy, below.
	      if (Size >= 8)
	        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
	                                                          CallSeqStart,
	                                                          Flags, DAG, dl);

	      // When a register is available, pass a small aggregate right-justified.
	      if (Size < 8 && GPR_idx != NumGPRs) {
	        // The easiest way to get this right-justified in a register
	        // is to copy the structure into the rightmost portion of a
	        // local variable slot, then load the whole slot into the
	        // register.
	        // FIXME: The memcpy seems to produce pretty awful code for
	        // small aggregates, particularly for packed ones.
	        // FIXME: It would be preferable to use the slot in the
	        // parameter save area instead of a new local variable.
	        SDValue AddPtr = PtrOff;
	        if (!isLittleEndian) {
	          SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
	          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
	        }
	        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
	                                                          CallSeqStart,
	                                                          Flags, DAG, dl);

	        // Load the slot into the register.
	        SDValue Load =
	            DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
	        MemOpChains.push_back(Load.getValue(1));
	        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

	        // Done with this argument.
	        ArgOffset += PtrByteSize;
	        continue;
	      }

	      // For aggregates larger than PtrByteSize, copy the pieces of the
	      // object that fit into registers from the parameter save area.
	      for (unsigned j=0; j<Size; j+=PtrByteSize) {
	        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
	        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
	        if (GPR_idx != NumGPRs) {
	          SDValue Load =
	              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
	          MemOpChains.push_back(Load.getValue(1));
	          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	          ArgOffset += PtrByteSize;
	        } else {
	          // Out of GPRs: account for the rest of the aggregate, rounded up
	          // to whole doublewords, and stop.
	          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
	          break;
	        }
	      }
	      continue;
	    }

	    switch (Arg.getSimpleValueType().SimpleTy) {
	    default: llvm_unreachable("Unexpected ValueType for argument!");
	    case MVT::i1:
	    case MVT::i32:
	    case MVT::i64:
	      if (Flags.isNest()) {
	        // The 'nest' parameter, if any, is passed in R11.
	        RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
	        hasNest = true;
	        break;
	      }

	      // These can be scalar arguments or elements of an integer array type
	      // passed directly. Clang may use those instead of "byval" aggregate
	      // types to avoid forcing arguments to memory unnecessarily.
	      if (GPR_idx != NumGPRs) {
	        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
	      } else {
	        if (CallConv == CallingConv::Fast)
	          ComputePtrOff();

	        assert(HasParameterArea &&
	               "Parameter area must exist to pass an argument in memory.");
	        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	                         true, isTailCall, false, MemOpChains,
	                         TailCallArguments, dl);
	        if (CallConv == CallingConv::Fast)
	          ArgOffset += PtrByteSize;
	      }
	      if (CallConv != CallingConv::Fast)
	        ArgOffset += PtrByteSize;
	      break;
	    case MVT::f32:
	    case MVT::f64: {
	      // These can be scalar arguments or elements of a float array type
	      // passed directly. The latter are used to implement ELFv2 homogenous
	      // float aggregates.

	      // Named arguments go into FPRs first, and once they overflow, the
	      // remaining arguments go into GPRs and then the parameter save area.
	      // Unnamed arguments for vararg functions always go to GPRs and
	      // then the parameter save area. For now, put all arguments to vararg
	      // routines always in both locations (FPR and GPR or stack slot).
	      bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
	      bool NeededLoad = false;

	      // First load the argument into the next available FPR.
	      if (FPR_idx != NumFPRs)
	        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

	      // Next, load the argument into GPR or stack slot if needed.
	      if (!NeedGPROrStack)
	        ;
	      else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
	        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
	        // once we support fp <-> gpr moves.

	        // In the non-vararg case, this can only ever happen in the
	        // presence of f32 array types, since otherwise we never run
	        // out of FPRs before running out of GPRs.
	        SDValue ArgVal;

	        // Double values are always passed in a single GPR.
	        if (Arg.getValueType() != MVT::f32) {
	          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);

	        // Non-array float values are extended and passed in a GPR.
	        } else if (!Flags.isInConsecutiveRegs()) {
	          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
	          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);

	        // If we have an array of floats, we collect every odd element
	        // together with its predecessor into one GPR.
	        } else if (ArgOffset % PtrByteSize != 0) {
	          SDValue Lo, Hi;
	          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
	          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
	          if (!isLittleEndian)
	            std::swap(Lo, Hi);
	          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);

	        // The final element, if even, goes into the first half of a GPR.
	        } else if (Flags.isInConsecutiveRegsLast()) {
	          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
	          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
	          if (!isLittleEndian)
	            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
	                                 DAG.getConstant(32, dl, MVT::i32));

	        // Non-final even elements are skipped; they will be handled
	        // together with the subsequent argument on the next go-around.
	        } else
	          ArgVal = SDValue();

	        if (ArgVal.getNode())
	          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
	      } else {
	        if (CallConv == CallingConv::Fast)
	          ComputePtrOff();

	        // Single-precision floating-point values are mapped to the
	        // second (rightmost) word of the stack doubleword.
	        if (Arg.getValueType() == MVT::f32 &&
	            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
	          SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
	          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
	        }

	        assert(HasParameterArea &&
	               "Parameter area must exist to pass an argument in memory.");
	        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	                         true, isTailCall, false, MemOpChains,
	                         TailCallArguments, dl);

	        NeededLoad = true;
	      }
	      // When passing an array of floats, the array occupies consecutive
	      // space in the argument area; only round up to the next doubleword
	      // at the end of the array. Otherwise, each float takes 8 bytes.
	      if (CallConv != CallingConv::Fast || NeededLoad) {
	        ArgOffset += (Arg.getValueType() == MVT::f32 &&
	                      Flags.isInConsecutiveRegs()) ? 4 : 8;
	        if (Flags.isInConsecutiveRegsLast())
	          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
	      }
	      break;
	    }
	    case MVT::v4f32:
	    case MVT::v4i32:
	    case MVT::v8i16:
	    case MVT::v16i8:
	    case MVT::v2f64:
	    case MVT::v2i64:
	    case MVT::v1i128:
	      if (!Subtarget.hasQPX()) {
	        // These can be scalar arguments or elements of a vector array type
	        // passed directly. The latter are used to implement ELFv2 homogenous
	        // vector aggregates.

	        // For a varargs call, named arguments go into VRs or on the stack as
	        // usual; unnamed arguments always go to the stack or the corresponding
	        // GPRs when within range. For now, we always put the value in both
	        // locations (or even all three).
	        if (isVarArg) {
	          assert(HasParameterArea &&
	                 "Parameter area must exist if we have a varargs call.");
	          // We could elide this store in the case where the object fits
	          // entirely in R registers. Maybe later.
	          SDValue Store =
	              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
	          MemOpChains.push_back(Store);
	          if (VR_idx != NumVRs) {
	            SDValue Load =
	                DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
	            MemOpChains.push_back(Load.getValue(1));
	            RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
	          }
	          ArgOffset += 16;
	          // Shadow the 16 stored bytes in GPRs while any remain. The index is
	          // named ByteOff so it does not shadow the argument index 'i'.
	          for (unsigned ByteOff = 0; ByteOff < 16; ByteOff += PtrByteSize) {
	            if (GPR_idx == NumGPRs)
	              break;
	            SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
	                                     DAG.getConstant(ByteOff, dl, PtrVT));
	            SDValue Load =
	                DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
	            MemOpChains.push_back(Load.getValue(1));
	            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	          }
	          break;
	        }

	        // Non-varargs Altivec params go into VRs or on the stack.
	        if (VR_idx != NumVRs) {
	          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
	        } else {
	          if (CallConv == CallingConv::Fast)
	            ComputePtrOff();

	          assert(HasParameterArea &&
	                 "Parameter area must exist to pass an argument in memory.");
	          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	                           true, isTailCall, true, MemOpChains,
	                           TailCallArguments, dl);
	          if (CallConv == CallingConv::Fast)
	            ArgOffset += 16;
	        }

	        if (CallConv != CallingConv::Fast)
	          ArgOffset += 16;
	        break;
	      } // not QPX

	      assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
	             "Invalid QPX parameter type");

	      /* fall through */
	    case MVT::v4f64:
	    case MVT::v4i1: {
	      bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
	      if (isVarArg) {
	        assert(HasParameterArea &&
	               "Parameter area must exist if we have a varargs call.");
	        // We could elide this store in the case where the object fits
	        // entirely in R registers. Maybe later.
	        SDValue Store =
	            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
	        MemOpChains.push_back(Store);
	        if (QFPR_idx != NumQFPRs) {
	          SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
	                                     PtrOff, MachinePointerInfo());
	          MemOpChains.push_back(Load.getValue(1));
	          RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
	        }
	        ArgOffset += (IsF32 ? 16 : 32);
	        // Shadow the stored bytes in GPRs while any remain. The index is
	        // named ByteOff so it does not shadow the argument index 'i'.
	        for (unsigned ByteOff = 0; ByteOff < (IsF32 ? 16U : 32U);
	             ByteOff += PtrByteSize) {
	          if (GPR_idx == NumGPRs)
	            break;
	          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
	                                   DAG.getConstant(ByteOff, dl, PtrVT));
	          SDValue Load =
	              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
	          MemOpChains.push_back(Load.getValue(1));
	          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	        }
	        break;
	      }

	      // Non-varargs QPX params go into registers or on the stack.
	      if (QFPR_idx != NumQFPRs) {
	        RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
	      } else {
	        if (CallConv == CallingConv::Fast)
	          ComputePtrOff();

	        assert(HasParameterArea &&
	               "Parameter area must exist to pass an argument in memory.");
	        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	                         true, isTailCall, true, MemOpChains,
	                         TailCallArguments, dl);
	        if (CallConv == CallingConv::Fast)
	          ArgOffset += (IsF32 ? 16 : 32);
	      }

	      if (CallConv != CallingConv::Fast)
	        ArgOffset += (IsF32 ? 16 : 32);
	      break;
	    }
	    }
	  }

	  // The offset reached by the loop above must agree with the size computed
	  // up front whenever a parameter area is actually allocated.
	  assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
	         "mismatch in size of parameter area");
	  (void)NumBytesActuallyUsed;

	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	  // Check if this is an indirect call (MTCTR/BCTRL).
	  // See PrepareCall() for more information about calls through function
	  // pointers in the 64-bit SVR4 ABI.
	  if (!isTailCall && !isPatchPoint &&
	      !isFunctionGlobalAddress(Callee) &&
	      !isa<ExternalSymbolSDNode>(Callee)) {
	    // Load r2 into a virtual register and store it to the TOC save area.
	    setUsesTOCBasePtr(DAG);
	    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
	    // TOC save area offset.
	    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
	    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
	    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
	    Chain = DAG.getStore(
	        Val.getValue(1), dl, Val, AddPtr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
	    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
	    // This does not mean the MTCTR instruction must use R12; it's easier
	    // to model this as an extra parameter, so do that.
	    if (isELFv2ABI && !isPatchPoint)
	      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
	  }

	  // Build a sequence of copy-to-reg nodes chained together with token chain
	  // and flag operands which copy the outgoing args into the appropriate regs.
	  SDValue InFlag;
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	                             RegsToPass[i].second, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  if (isTailCall && !IsSibCall)
	    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
	                    TailCallArguments);

	  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
	                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
	                    SPDiff, NumBytes, Ins, InVals, CS);
	}

	SDValue PPCTargetLowering::LowerCall_Darwin(
	SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
	bool isTailCall, bool isPatchPoint,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	ImmutableCallSite CS) const {
	unsigned NumOps = Outs.size();

	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	bool isPPC64 = PtrVT == MVT::i64;
	unsigned PtrByteSize = isPPC64 ? 8 : 4;

	MachineFunction &MF = DAG.getMachineFunction();

	// Mark this function as potentially containing a function that contains a
	// tail call. As a consequence the frame pointer will be used for dynamicalloc
	// and restoring the callers stack pointer in this functions epilog. This is
	// done because by tail calling the called function might overwrite the value
	// in this function's (MF) stack pointer stack slot 0(SP).
	if (getTargetMachine().Options.GuaranteedTailCallOpt &&
	CallConv == CallingConv::Fast)
	MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

	// Count how many bytes are to be pushed on the stack, including the linkage
	// area, and parameter passing area. We start with 24/48 bytes, which is
	// prereserved space for [SP][CR][LR][3 x unused].
	unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
	unsigned NumBytes = LinkageSize;

	// Add up all the space actually used.
	// In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
	// they all go in registers, but we must reserve stack space for them for
	// possible use by the caller. In varargs or 64-bit calls, parameters are
	// assigned stack space in order, with padding so Altivec parameters are
	// 16-byte aligned.
	unsigned nAltivecParamsAtEnd = 0;
	for (unsigned i = 0; i != NumOps; ++i) {
	ISD::ArgFlagsTy Flags = Outs[i].Flags;
	EVT ArgVT = Outs[i].VT;
	// Varargs Altivec parameters are padded to a 16 byte boundary.
	if (ArgVT == MVT::v4f32 \|\| ArgVT == MVT::v4i32 \|\|
	ArgVT == MVT::v8i16 \|\| ArgVT == MVT::v16i8 \|\|
	ArgVT == MVT::v2f64 \|\| ArgVT == MVT::v2i64) {
	if (!isVarArg && !isPPC64) {
	// Non-varargs Altivec parameters go after all the non-Altivec
	// parameters; handle those later so we know how much padding we need.
	nAltivecParamsAtEnd++;
	continue;
	}
	// Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
	NumBytes = ((NumBytes+15)/16)*16;
	}
	NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
	}

	// Allow for Altivec parameters at the end, if needed.
	if (nAltivecParamsAtEnd) {
	NumBytes = ((NumBytes+15)/16)*16;
	NumBytes += 16*nAltivecParamsAtEnd;
	}

	// The prolog code of the callee may store up to 8 GPR argument registers to
	// the stack, allowing va_start to index over them in memory if it is varargs.
	// Because we cannot tell if this is needed on the caller side, we have to
	// conservatively assume that it is needed. As such, make sure we have at
	// least enough stack space for the callee to store the 8 GPRs.
	NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);

	// Tail call needs the stack to be aligned.
	if (getTargetMachine().Options.GuaranteedTailCallOpt &&
	CallConv == CallingConv::Fast)
	NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

	// Calculate by how many bytes the stack has to be adjusted in case of tail
	// call optimization.
	int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

	// To protect arguments on the stack from being clobbered in a tail call,
	// force all the loads to happen before doing any other lowering.
	if (isTailCall)
	Chain = DAG.getStackArgumentTokenFactor(Chain);

	// Adjust the stack pointer for the new arguments...
	// These operations are automatically eliminated by the prolog/epilog pass
	Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
	SDValue CallSeqStart = Chain;

	// Load the return address and frame pointer so they can be moved somewhere
	// else later.
	SDValue LROp, FPOp;
	Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

	// Set up a copy of the stack pointer for use loading and storing any
	// arguments that may not fit in the registers available for argument
	// passing.
	SDValue StackPtr;
	if (isPPC64)
	StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
	else
	StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

	// Figure out which arguments are going to go in registers, and which in
	// memory. Also, if this is a vararg function, floating point operations
	// must be stored to our stack, and loaded into integer regs as well, if
	// any integer regs are available for argument passing.
	unsigned ArgOffset = LinkageSize;
	unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

	static const MCPhysReg GPR_32[] = { // 32-bit registers.
	PPC::R3, PPC::R4, PPC::R5, PPC::R6,
	PPC::R7, PPC::R8, PPC::R9, PPC::R10,
	};
	static const MCPhysReg GPR_64[] = { // 64-bit registers.
	PPC::X3, PPC::X4, PPC::X5, PPC::X6,
	PPC::X7, PPC::X8, PPC::X9, PPC::X10,
	};
	static const MCPhysReg VR[] = {
	PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
	PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
	};
	const unsigned NumGPRs = array_lengthof(GPR_32);
	const unsigned NumFPRs = 13;
	const unsigned NumVRs = array_lengthof(VR);

	const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

	SmallVector<SDValue, 8> MemOpChains;
	for (unsigned i = 0; i != NumOps; ++i) {
	SDValue Arg = OutVals[i];
	ISD::ArgFlagsTy Flags = Outs[i].Flags;

	// PtrOff will be used to store the current argument to the stack if a
	// register cannot be found for it.
	SDValue PtrOff;

	PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

	PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

	// On PPC64, promote integers to 64-bit values.
	if (isPPC64 && Arg.getValueType() == MVT::i32) {
	// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
	unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
	}

	// FIXME memcpy is used way more than necessary. Correctness first.
	// Note: "by value" is code for passing a structure by value, not
	// basic types.
	if (Flags.isByVal()) {
	unsigned Size = Flags.getByValSize();
	// Very small objects are passed right-justified. Everything else is
	// passed left-justified.
	if (Size==1 \|\| Size==2) {
	EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
	if (GPR_idx != NumGPRs) {
	SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
	MachinePointerInfo(), VT);
	MemOpChains.push_back(Load.getValue(1));
	RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

	ArgOffset += PtrByteSize;
	} else {
	SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
	PtrOff.getValueType());
	SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
	Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
	CallSeqStart,
	Flags, DAG, dl);
	ArgOffset += PtrByteSize;
	}
	continue;
	}
	// Copy entire object into memory. There are cases where gcc-generated
	// code assumes it is there, even if it could be put entirely into
	// registers. (This is not what the doc says.)
	Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
	CallSeqStart,
	Flags, DAG, dl);

	// For small aggregates (Darwin only) and aggregates >= PtrByteSize,
	// copy the pieces of the object that fit into registers from the
	// parameter save area.
	for (unsigned j=0; j<Size; j+=PtrByteSize) {
	SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
	SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
	if (GPR_idx != NumGPRs) {
	SDValue Load =
	DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
	MemOpChains.push_back(Load.getValue(1));
	RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	ArgOffset += PtrByteSize;
	} else {
	ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
	break;
	}
	}
	continue;
	}

	switch (Arg.getSimpleValueType().SimpleTy) {
	default: llvm_unreachable("Unexpected ValueType for argument!");
	case MVT::i1:
	case MVT::i32:
	case MVT::i64:
	if (GPR_idx != NumGPRs) {
	if (Arg.getValueType() == MVT::i1)
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);

	RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
	} else {
	LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	isPPC64, isTailCall, false, MemOpChains,
	TailCallArguments, dl);
	}
	ArgOffset += PtrByteSize;
	break;
	case MVT::f32:
	case MVT::f64:
	if (FPR_idx != NumFPRs) {
	RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

	if (isVarArg) {
	SDValue Store =
	DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
	MemOpChains.push_back(Store);

	// Float varargs are always shadowed in available integer registers
	if (GPR_idx != NumGPRs) {
	SDValue Load =
	DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
	MemOpChains.push_back(Load.getValue(1));
	RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	}
	if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
	SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
	PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
	SDValue Load =
	DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
	MemOpChains.push_back(Load.getValue(1));
	RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	}
	} else {
	// If we have any FPRs remaining, we may also have GPRs remaining.
	// Args passed in FPRs consume either 1 (f32) or 2 (f64) available
	// GPRs.
	if (GPR_idx != NumGPRs)
	++GPR_idx;
	if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
	!isPPC64) // PPC64 has 64-bit GPR's obviously :)
	++GPR_idx;
	}
	} else
	LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	isPPC64, isTailCall, false, MemOpChains,
	TailCallArguments, dl);
	if (isPPC64)
	ArgOffset += 8;
	else
	ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
	break;
	case MVT::v4f32:
	case MVT::v4i32:
	case MVT::v8i16:
	case MVT::v16i8:
	if (isVarArg) {
	// These go aligned on the stack, or in the corresponding R registers
	// when within range. The Darwin PPC ABI doc claims they also go in
	// V registers; in fact gcc does this only for arguments that are
	// prototyped, not for those that match the ... We do it for all
	// arguments, seems to work.
	while (ArgOffset % 16 !=0) {
	ArgOffset += PtrByteSize;
	if (GPR_idx != NumGPRs)
	GPR_idx++;
	}
	// We could elide this store in the case where the object fits
	// entirely in R registers. Maybe later.
	PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
	DAG.getConstant(ArgOffset, dl, PtrVT));
	SDValue Store =
	DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
	MemOpChains.push_back(Store);
	if (VR_idx != NumVRs) {
	SDValue Load =
	DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
	MemOpChains.push_back(Load.getValue(1));
	RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
	}
	ArgOffset += 16;
	for (unsigned i=0; i<16; i+=PtrByteSize) {
	if (GPR_idx == NumGPRs)
	break;
	SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
	DAG.getConstant(i, dl, PtrVT));
	SDValue Load =
	DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
	MemOpChains.push_back(Load.getValue(1));
	RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
	}
	break;
	}

	// Non-varargs Altivec params generally go in registers, but have
	// stack space allocated at the end.
	if (VR_idx != NumVRs) {
	// Doesn't have GPR space allocated.
	RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
	} else if (nAltivecParamsAtEnd==0) {
	// We are emitting Altivec params in order.
	LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	isPPC64, isTailCall, true, MemOpChains,
	TailCallArguments, dl);
	ArgOffset += 16;
	}
	break;
	}
	}
	// If all Altivec parameters fit in registers, as they usually do,
	// they get stack space following the non-Altivec parameters. We
	// don't track this here because nobody below needs it.
	// If there are more Altivec parameters than fit in registers emit
	// the stores here.
	if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
	unsigned j = 0;
	// Offset is aligned; skip 1st 12 params which go in V registers.
	ArgOffset = ((ArgOffset+15)/16)*16;
	ArgOffset += 12*16;
	for (unsigned i = 0; i != NumOps; ++i) {
	SDValue Arg = OutVals[i];
	EVT ArgType = Outs[i].VT;
	if (ArgType==MVT::v4f32 \|\| ArgType==MVT::v4i32 \|\|
	ArgType==MVT::v8i16 \|\| ArgType==MVT::v16i8) {
	if (++j > NumVRs) {
	SDValue PtrOff;
	// We are emitting Altivec params in order.
	LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
	isPPC64, isTailCall, true, MemOpChains,
	TailCallArguments, dl);
	ArgOffset += 16;
	}
	}
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	// On Darwin, R12 must contain the address of an indirect callee. This does
	// not mean the MTCTR instruction must use R12; it's easier to model this as
	// an extra parameter, so do that.
	if (!isTailCall &&
	!isFunctionGlobalAddress(Callee) &&
	!isa<ExternalSymbolSDNode>(Callee) &&
	!isBLACompatibleAddress(Callee, DAG))
	RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
	PPC::R12), Callee));

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into the appropriate regs.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (isTailCall)
	PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
	TailCallArguments);

	return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
	/* unused except on PPC64 ELFv1 */ false, DAG,
	RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
	NumBytes, Ins, InVals, CS);
	}

	/// Report whether every return value described by \p Outs can be assigned a
	/// location by the PPC return calling convention (RetCC_PPC).
	bool PPCTargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  // The assignment list is scratch space; only the verdict is returned.
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
	  const bool Assignable = ReturnState.CheckReturn(Outs, RetCC_PPC);
	  return Assignable;
	}

	/// Lower a function return.
	///
	/// The return values are assigned registers by RetCC_PPC, extended as their
	/// assigned location requires, and copied into those registers with the
	/// copies glued together. Registers reported by getCalleeSavedRegsViaCopy()
	/// are appended as extra operands. The result is a PPCISD::RET_FLAG node
	/// whose operand 0 is the updated chain.
	SDValue PPCTargetLowering::LowerReturn(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
	    SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();

	  SmallVector<CCValAssign, 16> RetLocs;
	  CCState RetState(CallConv, isVarArg, MF, RetLocs, *DAG.getContext());
	  RetState.AnalyzeReturn(Outs, RetCC_PPC);

	  // Slot 0 holds the chain; it is refreshed after all copies are emitted.
	  SmallVector<SDValue, 4> RetOps(1, Chain);
	  SDValue Glue;

	  // Copy the result values into the output registers.
	  for (unsigned Idx = 0; Idx != RetLocs.size(); ++Idx) {
	    CCValAssign &Loc = RetLocs[Idx];
	    assert(Loc.isRegLoc() && "Can only return in registers!");

	    // Decide which extension, if any, the assigned location asks for.
	    bool NeedsExtend = true;
	    unsigned ExtOpc = ISD::ANY_EXTEND;
	    switch (Loc.getLocInfo()) {
	    default:
	      llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full:
	      NeedsExtend = false;
	      break;
	    case CCValAssign::AExt:
	      ExtOpc = ISD::ANY_EXTEND;
	      break;
	    case CCValAssign::ZExt:
	      ExtOpc = ISD::ZERO_EXTEND;
	      break;
	    case CCValAssign::SExt:
	      ExtOpc = ISD::SIGN_EXTEND;
	      break;
	    }

	    SDValue Val = OutVals[Idx];
	    if (NeedsExtend)
	      Val = DAG.getNode(ExtOpc, dl, Loc.getLocVT(), Val);

	    Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), Val, Glue);
	    Glue = Chain.getValue(1);
	    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
	  }

	  // Append the callee-saved registers that are handled via copies. The list
	  // may be absent (null); otherwise it is terminated by a zero entry.
	  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  for (const MCPhysReg *CSR = RegInfo->getCalleeSavedRegsViaCopy(&MF);
	       CSR && *CSR; ++CSR) {
	    MVT RegVT;
	    if (PPC::G8RCRegClass.contains(*CSR))
	      RegVT = MVT::i64;
	    else if (PPC::F8RCRegClass.contains(*CSR))
	      RegVT = MVT::getFloatingPointVT(64);
	    else if (PPC::CRRCRegClass.contains(*CSR))
	      RegVT = MVT::i1;
	    else if (PPC::VRRCRegClass.contains(*CSR))
	      RegVT = MVT::Other;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    RetOps.push_back(DAG.getRegister(*CSR, RegVT));
	  }

	  RetOps[0] = Chain; // Update chain.

	  // Add the glue if any copy produced one.
	  if (Glue.getNode())
	    RetOps.push_back(Glue);

	  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
	}

	/// Lower GET_DYNAMIC_AREA_OFFSET to a PPCISD::DYNAREAOFFSET node.
	///
	/// The new node takes operand 0 of \p Op and the frame-pointer save slot
	/// frame index as operands and yields a single value of \p Op's type.
	SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
	    SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  // The result has the same integer type as the node being lowered.
	  EVT ResultVT = Op.getValueType();

	  SDValue Operands[2] = {Op.getOperand(0), getFramePointerFrameIndex(DAG)};
	  SDVTList ResultTys = DAG.getVTList(ResultVT);
	  return DAG.getNode(PPCISD::DYNAREAOFFSET, Loc, ResultTys, Operands);
	}

	/// Lower STACKRESTORE.
	///
	/// Popping a dynamic allocation must keep the SP link intact, so the word
	/// addressed by the current stack pointer is loaded, the stack pointer is
	/// set to the saved value (operand 1), and the loaded word is stored back
	/// through the stack pointer.
	SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // The stack pointer register is X1 on PPC64 and R1 otherwise.
	  unsigned SPReg = PPC::R1;
	  if (Subtarget.isPPC64())
	    SPReg = PPC::X1;
	  SDValue SPNode = DAG.getRegister(SPReg, PtrVT);

	  SDValue InChain = Op.getOperand(0);
	  SDValue SavedSP = Op.getOperand(1);

	  // Load the old link SP.
	  SDValue LinkWord =
	      DAG.getLoad(PtrVT, Loc, InChain, SPNode, MachinePointerInfo());

	  // Restore the stack pointer, ordered after the load above.
	  SDValue AfterCopy =
	      DAG.getCopyToReg(LinkWord.getValue(1), Loc, SPReg, SavedSP);

	  // Write the link word back through the stack pointer.
	  return DAG.getStore(AfterCopy, Loc, LinkWord, SPNode, MachinePointerInfo());
	}

	/// Return a frame-index node for the return-address save slot, creating the
	/// fixed stack object on first use and caching its index in
	/// PPCFunctionInfo.
	SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

	  // An index of zero means the save slot has not been created yet.
	  int SaveIdx = FuncInfo->getReturnAddrSaveIndex();
	  if (SaveIdx == 0) {
	    // The slot is one pointer wide and lives at the fixed return-save
	    // offset reported by the frame lowering.
	    const int SlotBytes = Subtarget.isPPC64() ? 8 : 4;
	    const int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
	    SaveIdx = MF.getFrameInfo().CreateFixedObject(SlotBytes, LROffset, false);
	    // Remember the index so later requests reuse the same object.
	    FuncInfo->setReturnAddrSaveIndex(SaveIdx);
	  }

	  EVT PtrVT = getPointerTy(MF.getDataLayout());
	  return DAG.getFrameIndex(SaveIdx, PtrVT);
	}

	/// Return a frame-index node for the frame-pointer save slot, creating the
	/// fixed stack object on first use and caching its index in
	/// PPCFunctionInfo. The dynamic stack allocation lowerings in this file are
	/// the users of this index.
	SDValue
	PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

	  // An index of zero means the save slot has not been created yet.
	  int SaveIdx = FuncInfo->getFramePointerSaveIndex();
	  if (SaveIdx == 0) {
	    // The slot is one pointer wide and lives at the fixed frame-pointer
	    // save offset reported by the frame lowering.
	    const int SlotBytes = Subtarget.isPPC64() ? 8 : 4;
	    const int FPOffset =
	        Subtarget.getFrameLowering()->getFramePointerSaveOffset();
	    SaveIdx = MF.getFrameInfo().CreateFixedObject(SlotBytes, FPOffset, true);
	    // Remember the index so later requests reuse the same object.
	    FuncInfo->setFramePointerSaveIndex(SaveIdx);
	  }

	  EVT PtrVT = getPointerTy(MF.getDataLayout());
	  return DAG.getFrameIndex(SaveIdx, PtrVT);
	}

	/// Lower DYNAMIC_STACKALLOC to a PPCISD::DYNALLOC node.
	///
	/// The node receives the incoming chain, the negated allocation size and
	/// the frame-pointer save slot index, and produces a pointer-typed value
	/// plus a chain.
	SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue InChain = Op.getOperand(0);
	  SDValue AllocSize = Op.getOperand(1);

	  // DYNALLOC takes the size negated, computed as 0 - Size.
	  SDValue Zero = DAG.getConstant(0, Loc, PtrVT);
	  SDValue NegatedSize = DAG.getNode(ISD::SUB, Loc, PtrVT, Zero, AllocSize);

	  SDValue Operands[3] = {InChain, NegatedSize,
	                         getFramePointerFrameIndex(DAG)};
	  SDVTList ResultTys = DAG.getVTList(PtrVT, MVT::Other);
	  return DAG.getNode(PPCISD::DYNALLOC, Loc, ResultTys, Operands);
	}

	/// Lower EH_DWARF_CFA to the frame index of a newly created pointer-sized
	/// fixed stack object at offset 0.
	SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();

	  // One pointer wide: 8 bytes on PPC64, 4 bytes otherwise.
	  const int SlotBytes = Subtarget.isPPC64() ? 8 : 4;
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  const int CFAIndex = MF.getFrameInfo().CreateFixedObject(SlotBytes, 0, false);
	  return DAG.getFrameIndex(CFAIndex, PtrVT);
	}

	/// Lower EH_SJLJ_SETJMP to PPCISD::EH_SJLJ_SETJMP, forwarding the first two
	/// operands of \p Op. The new node yields an i32 value and a chain.
	SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, Loc, ResultTys, First, Second);
	}

	/// Lower EH_SJLJ_LONGJMP to PPCISD::EH_SJLJ_LONGJMP, forwarding the first
	/// two operands of \p Op. The new node yields only a chain.
	SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, First, Second);
	}

	/// Custom-lower a load.
	///
	/// Vector loads are delegated to LowerVectorLoad. The only other case
	/// handled is an i1 load, which becomes an any-extending 8-bit load into a
	/// pointer-typed integer followed by a truncate to i1. The returned merge
	/// carries the truncated value and the new load's chain.
	SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
	  if (Op.getValueType().isVector())
	    return LowerVectorLoad(Op, DAG);

	  assert(Op.getValueType() == MVT::i1 &&
	         "Custom lowering only for i1 loads");

	  SDLoc Loc(Op);
	  LoadSDNode *Load = cast<LoadSDNode>(Op);

	  // Reuse the original chain, address and memory operand; only the loaded
	  // width changes (8 bits in memory, pointer-typed in the register).
	  EVT WideVT = getPointerTy(DAG.getDataLayout());
	  SDValue WideLoad =
	      DAG.getExtLoad(ISD::EXTLOAD, Loc, WideVT, Load->getChain(),
	                     Load->getBasePtr(), MVT::i8, Load->getMemOperand());
	  SDValue Bit = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i1, WideLoad);

	  // Result 1 of the new load is its chain.
	  SDValue Results[] = { Bit, SDValue(WideLoad.getNode(), 1) };
	  return DAG.getMergeValues(Results, Loc);
	}

	/// Custom-lower a store.
	///
	/// Vector stores are delegated to LowerVectorStore. The only other case
	/// handled is an i1 store, where the value is zero-extended to a
	/// pointer-typed integer and written with a truncating 8-bit store.
	SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
	  if (Op.getOperand(1).getValueType().isVector())
	    return LowerVectorStore(Op, DAG);

	  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
	         "Custom lowering only for i1 stores");

	  SDLoc Loc(Op);
	  StoreSDNode *Store = cast<StoreSDNode>(Op);

	  // Widen the bit first; the truncating store then writes its low byte.
	  EVT WideVT = getPointerTy(DAG.getDataLayout());
	  SDValue Widened =
	      DAG.getNode(ISD::ZERO_EXTEND, Loc, WideVT, Store->getValue());

	  // Chain, address and memory operand come from the original store.
	  return DAG.getTruncStore(Store->getChain(), Loc, Widened,
	                           Store->getBasePtr(), MVT::i8,
	                           Store->getMemOperand());
	}

	// FIXME: Remove this once the ANDI glue bug is fixed:
	/// Custom-lower a truncate that produces i1 into a PPCISD::ANDIo_1_GT_BIT
	/// node applied to the truncate's source operand.
	SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	  assert(Op.getValueType() == MVT::i1 &&
	         "Custom lowering only for i1 results");

	  SDLoc Loc(Op);
	  SDValue Source = Op.getOperand(0);
	  return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, Loc, MVT::i1, Source);
	}

	/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
	/// possible.
	///
	/// The operands of \p Op are read as (LHS, RHS, true value, false value,
	/// condition code). \p Op is returned unchanged when the comparison or the
	/// selected values are not floating point, when NoInfsFPMath and
	/// NoNaNsFPMath are not both enabled, or when the condition code is not one
	/// of the handled ones.
	SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
	  // Not FP? Not a fsel.
	  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
	      !Op.getOperand(2).getValueType().isFloatingPoint())
	    return Op;

	  // We might be able to do better than this under some circumstances, but in
	  // general, fsel-based lowering of select is a finite-math-only optimization.
	  // For more information, see section F.3 of the 2.06 ISA specification.
	  if (!DAG.getTarget().Options.NoInfsFPMath ||
	      !DAG.getTarget().Options.NoNaNsFPMath)
	    return Op;
	  // TODO: Propagate flags from the select rather than global settings.
	  SDNodeFlags Flags;
	  Flags.setNoInfs(true);
	  Flags.setNoNaNs(true);

	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();

	  EVT ResVT = Op.getValueType();
	  EVT CmpVT = Op.getOperand(0).getValueType();
	  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
	  SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
	  SDLoc dl(Op);

	  // The fsel comparison is always 64-bits, so f32 values feeding it are
	  // extended to f64 first; other values pass through untouched.
	  auto WidenToF64 = [&](SDValue V) -> SDValue {
	    if (V.getValueType() == MVT::f32)
	      return DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, V);
	    return V;
	  };

	  // If the RHS of the comparison is a 0.0, we don't need to do the
	  // subtraction at all.
	  SDValue Sel1;
	  if (isFloatingPointZero(RHS)) {
	    switch (CC) {
	    default: break; // SETUO etc aren't handled by fsel.
	    case ISD::SETNE:
	      std::swap(TV, FV);
	      LLVM_FALLTHROUGH;
	    case ISD::SETEQ:
	      // Equality is two chained fsels: one on LHS and one on -LHS.
	      LHS = WidenToF64(LHS);
	      Sel1 = WidenToF64(DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV));
	      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
	                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
	    case ISD::SETULT:
	    case ISD::SETLT:
	      std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
	      LLVM_FALLTHROUGH;
	    case ISD::SETOGE:
	    case ISD::SETGE:
	      LHS = WidenToF64(LHS);
	      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
	    case ISD::SETUGT:
	    case ISD::SETGT:
	      // fsel on the negated operand is natively setle, swap operands for
	      // setgt.
	      std::swap(TV, FV);
	      LLVM_FALLTHROUGH;
	    case ISD::SETOLE:
	    case ISD::SETLE:
	      LHS = WidenToF64(LHS);
	      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
	                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
	    }
	  }

	  // General case: select on the sign of a difference of the two operands.
	  SDValue Cmp;
	  switch (CC) {
	  default: break; // SETUO etc aren't handled by fsel.
	  case ISD::SETNE:
	    std::swap(TV, FV);
	    LLVM_FALLTHROUGH;
	  case ISD::SETEQ:
	    Cmp = WidenToF64(DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags));
	    Sel1 = WidenToF64(DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV));
	    return DAG.getNode(PPCISD::FSEL, dl, ResVT,
	                       DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
	  case ISD::SETULT:
	  case ISD::SETLT:
	    // LHS - RHS >= 0 selects the false value.
	    Cmp = WidenToF64(DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags));
	    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    // LHS - RHS >= 0 selects the true value.
	    Cmp = WidenToF64(DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags));
	    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
	  case ISD::SETUGT:
	  case ISD::SETGT:
	    // RHS - LHS >= 0 selects the false value.
	    Cmp = WidenToF64(DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags));
	    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
	  case ISD::SETOLE:
	  case ISD::SETLE:
	    // RHS - LHS >= 0 selects the true value.
	    Cmp = WidenToF64(DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags));
	    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
	  }
	  return Op;
	}

	/// Convert the floating-point operand of \p Op to an integer held in a
	/// floating-point value, store it to a fresh stack temporary, and describe
	/// in \p RLI (Chain, Ptr, MPI) where the integer result can be loaded from.
	///
	/// \param Op  An FP_TO_SINT or FP_TO_UINT node with an i32 or i64 result.
	/// \param RLI Receives the store's chain, the address to load from and the
	///            matching MachinePointerInfo.
	void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
	                                               SelectionDAG &DAG,
	                                               const SDLoc &dl) const {
	  assert(Op.getOperand(0).getValueType().isFloatingPoint());
	  SDValue Src = Op.getOperand(0);
	  if (Src.getValueType() == MVT::f32)
	    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);

	  SDValue Tmp;
	  switch (Op.getSimpleValueType().SimpleTy) {
	  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
	  case MVT::i32:
	    // Unsigned i32 without FPCVT falls back to the 64-bit signed convert.
	    Tmp = DAG.getNode(
	        Op.getOpcode() == ISD::FP_TO_SINT
	            ? PPCISD::FCTIWZ
	            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
	        dl, MVT::f64, Src);
	    break;
	  case MVT::i64:
	    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
	           "i64 FP_TO_UINT is supported only with FPCVT");
	    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
	                                                        PPCISD::FCTIDUZ,
	                      dl, MVT::f64, Src);
	    break;
	  }

	  // Convert the FP value to an int value through memory.
	  // A 4-byte slot is used when the word can be stored directly with STFIWX;
	  // otherwise the whole 8-byte value is stored.
	  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
	    (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
	  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
	  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
	  MachinePointerInfo MPI =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

	  // Emit a store to the stack slot.
	  SDValue Chain;
	  if (i32Stack) {
	    MachineFunction &MF = DAG.getMachineFunction();
	    MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
	    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
	    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
	              DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
	  } else
	    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI);

	  // Result is a load from the stack slot. If loading 4 bytes out of the
	  // 8-byte slot, the wanted low-order word sits at byte offset 4 on big
	  // endian and at offset 0 on little endian. Bias the pointer and the
	  // pointer info together so they always describe the same address.
	  if (Op.getValueType() == MVT::i32 && !i32Stack &&
	      !Subtarget.isLittleEndian()) {
	    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
	                        DAG.getConstant(4, dl, FIPtr.getValueType()));
	    MPI = MPI.getWithOffset(4);
	  }

	  RLI.Chain = Chain;
	  RLI.Ptr = FIPtr;
	  RLI.MPI = MPI;
	}

	/// \brief Custom lowers floating point to integer conversions to use
	/// the direct move instructions available in ISA 2.07 to avoid the
	/// need for load/store combinations.
	///
	/// The floating-point operand (extended to f64 when it is f32) is converted
	/// by the matching PPCISD::FCTI* node, and the result is moved out with
	/// PPCISD::MFVSR typed as \p Op's integer result type (i32 or i64).
	SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
	                                                    SelectionDAG &DAG,
	                                                    const SDLoc &dl) const {
	  assert(Op.getOperand(0).getValueType().isFloatingPoint());
	  SDValue Src = Op.getOperand(0);

	  if (Src.getValueType() == MVT::f32)
	    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);

	  const bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;

	  // Pick the conversion node for the requested result width.
	  unsigned ConvOpc;
	  MVT IntVT;
	  switch (Op.getSimpleValueType().SimpleTy) {
	  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
	  case MVT::i32:
	    // Unsigned i32 without FPCVT falls back to the 64-bit signed convert.
	    ConvOpc = IsSigned
	                  ? PPCISD::FCTIWZ
	                  : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);
	    IntVT = MVT::i32;
	    break;
	  case MVT::i64:
	    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
	           "i64 FP_TO_UINT is supported only with FPCVT");
	    ConvOpc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
	    IntVT = MVT::i64;
	    break;
	  }

	  // Convert, then move the result out as an integer of the chosen width.
	  SDValue Converted = DAG.getNode(ConvOpc, dl, MVT::f64, Src);
	  return DAG.getNode(PPCISD::MFVSR, dl, IntVT, Converted);
	}

	/// Lower FP_TO_SINT / FP_TO_UINT.
	///
	/// PPC64 subtargets with direct moves use LowerFP_TO_INTDirectMove. All
	/// others go through a stack slot: LowerFP_TO_INTForReuse emits the store
	/// and this function emits the load of the integer result.
	SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
	                                          const SDLoc &dl) const {
	  const bool UseDirectMove = Subtarget.hasDirectMove() && Subtarget.isPPC64();
	  if (UseDirectMove)
	    return LowerFP_TO_INTDirectMove(Op, DAG, dl);

	  // Store the converted value and learn where to read it back from.
	  ReuseLoadInfo Slot;
	  LowerFP_TO_INTForReuse(Op, Slot, DAG, dl);

	  EVT ResultVT = Op.getValueType();
	  return DAG.getLoad(ResultVT, dl, Slot.Chain, Slot.Ptr, Slot.MPI,
	                     Slot.Alignment, Slot.MMOFlags(), Slot.AAInfo,
	                     Slot.Ranges);
	}

	// We're trying to insert a regular store, S, and then a load, L. If the
	// incoming value, O, is a load, we might just be able to have our load use the
	// address used by O. However, we don't know if anything else will store to
	// that address before we can load from it. To prevent this situation, we need
	// to insert our load, L, into the chain as a peer of O. To do this, we give L
	// the same chain operand as O, we create a token factor from the chain results
	// of O and L, and we replace all uses of O's chain result with that token
	// factor (see spliceIntoChain below for this last part).
	//
	// Returns true and fills in RLI when the address can be reused. Two cases
	// qualify: Op is a legal-or-custom FP_TO_SINT/FP_TO_UINT and a
	// non-extending load was requested (the value is then spilled by
	// LowerFP_TO_INTForReuse), or Op is a non-volatile, non-temporal load with
	// extension type ET and memory type MemVT.
	bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
	                                            ReuseLoadInfo &RLI,
	                                            SelectionDAG &DAG,
	                                            ISD::LoadExtType ET) const {
	  SDLoc dl(Op);
	  if (ET == ISD::NON_EXTLOAD &&
	      (Op.getOpcode() == ISD::FP_TO_UINT ||
	       Op.getOpcode() == ISD::FP_TO_SINT) &&
	      isOperationLegalOrCustom(Op.getOpcode(),
	                               Op.getOperand(0).getValueType())) {

	    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
	    return true;
	  }

	  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
	  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
	      LD->isNonTemporal())
	    return false;
	  if (LD->getMemoryVT() != MemVT)
	    return false;

	  // For an indexed load with a defined offset, the effective address is
	  // base + offset.
	  RLI.Ptr = LD->getBasePtr();
	  if (LD->isIndexed() && !LD->getOffset().isUndef()) {
	    assert(LD->getAddressingMode() == ISD::PRE_INC &&
	           "Non-pre-inc AM on PPC?");
	    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
	                          LD->getOffset());
	  }

	  // Carry over the original load's memory description.
	  RLI.Chain = LD->getChain();
	  RLI.MPI = LD->getPointerInfo();
	  RLI.IsDereferenceable = LD->isDereferenceable();
	  RLI.IsInvariant = LD->isInvariant();
	  RLI.Alignment = LD->getAlignment();
	  RLI.AAInfo = LD->getAAInfo();
	  RLI.Ranges = LD->getRanges();

	  // The chain result is value 2 of an indexed load and value 1 otherwise.
	  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
	  return true;
	}

	// Given the head of the old chain, ResChain, insert a token factor containing
	// it and NewResChain, and make users of ResChain now be users of that token
	// factor. Does nothing when ResChain is empty.
	// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
	void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
	                                        SDValue NewResChain,
	                                        SelectionDAG &DAG) const {
	  if (!ResChain)
	    return;

	  SDLoc Loc(NewResChain);

	  // Build the token factor with an undef placeholder where ResChain will go.
	  // NOTE(review): presumably this keeps the replace-all-uses below from
	  // rewriting the token factor's own operand - confirm.
	  SDValue Placeholder = DAG.getUNDEF(MVT::Other);
	  SDValue Joined = DAG.getNode(ISD::TokenFactor, Loc, MVT::Other,
	                               NewResChain, Placeholder);
	  assert(Joined.getNode() != NewResChain.getNode() &&
	         "A new TF really is required here");

	  // Redirect every user of the old chain, then give the token factor its
	  // real operands.
	  DAG.ReplaceAllUsesOfValueWith(ResChain, Joined);
	  DAG.UpdateNodeOperands(Joined.getNode(), ResChain, NewResChain);
	}

	/// \brief Analyze profitability of direct move
	/// prefer float load to int load plus direct move
	/// when there is no integer use of int load
	bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
	SDNode *Origin = Op.getOperand(0).getNode();
	if (Origin->getOpcode() != ISD::LOAD)
	return true;

	// If there is no LXSIBZX/LXSIHZX, like Power8,
	// prefer direct move if the memory size is 1 or 2 bytes.
	MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
	if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
	return true;

	for (SDNode::use_iterator UI = Origin->use_begin(),
	UE = Origin->use_end();
	UI != UE; ++UI) {

	// Only look at the users of the loaded value.
	if (UI.getUse().get().getResNo() != 0)
	continue;

	if (UI->getOpcode() != ISD::SINT_TO_FP &&
	UI->getOpcode() != ISD::UINT_TO_FP)
	return true;
	}

	return false;
	}

	/// \brief Custom lowers integer to floating point conversions to use
	/// the direct move instructions available in ISA 2.07 to avoid the
	/// need for load/store combinations.
	///
	/// The integer operand is moved into an f64-typed value with
	/// PPCISD::MTVSRA, or with PPCISD::MTVSRZ for an unsigned i32 source, and
	/// then converted by the FCFID-family node matching the signedness and the
	/// f32/f64 result type. Requires FPCVT.
	SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
	                                                    SelectionDAG &DAG,
	                                                    const SDLoc &dl) const {
	  assert((Op.getValueType() == MVT::f32 ||
	          Op.getValueType() == MVT::f64) &&
	         "Invalid floating point type as target of conversion");
	  assert(Subtarget.hasFPCVT() &&
	         "Int to FP conversions with direct moves require FPCVT");
	  SDValue Src = Op.getOperand(0);
	  bool SinglePrec = Op.getValueType() == MVT::f32;
	  bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
	  bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
	  unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
	                             (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU);

	  // Only an unsigned 32-bit source uses MTVSRZ; every other combination
	  // uses MTVSRA.
	  unsigned MoveOp = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;

	  SDValue FP = DAG.getNode(MoveOp, dl, MVT::f64, Src);
	  return DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
	}

	SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);

	if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
	if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
	return SDValue();

	SDValue Value = Op.getOperand(0);
	// The values are now known to be -1 (false) or 1 (true). To convert this
	// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
	// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
	Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

	SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

	Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

	if (Op.getValueType() != MVT::v4f64)
	Value = DAG.getNode(ISD::FP_ROUND, dl,
	Op.getValueType(), Value,
	DAG.getIntPtrConstant(1, dl));
	return Value;
	}

	// Don't handle ppc_fp128 here; let it be lowered to a libcall.
	if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
	return SDValue();

	if (Op.getOperand(0).getValueType() == MVT::i1)
	return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
	DAG.getConstantFP(1.0, dl, Op.getValueType()),
	DAG.getConstantFP(0.0, dl, Op.getValueType()));

	// If we have direct moves, we can do all the conversion, skip the store/load
	// however, without FPCVT we can't do most conversions.
	if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
	Subtarget.isPPC64() && Subtarget.hasFPCVT())
	return LowerINT_TO_FPDirectMove(Op, DAG, dl);

	assert((Op.getOpcode() == ISD::SINT_TO_FP \|\| Subtarget.hasFPCVT()) &&
	"UINT_TO_FP is supported only with FPCVT");

	// If we have FCFIDS, then use it when converting to single-precision.
	// Otherwise, convert to double-precision and then round.
	unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
	? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
	: PPCISD::FCFIDS)
	: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
	: PPCISD::FCFID);
	MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
	? MVT::f32
	: MVT::f64;

	if (Op.getOperand(0).getValueType() == MVT::i64) {
	SDValue SINT = Op.getOperand(0);
	// When converting to single-precision, we actually need to convert
	// to double-precision first and then round to single-precision.
	// To avoid double-rounding effects during that operation, we have
	// to prepare the input operand. Bits that might be truncated when
	// converting to double-precision are replaced by a bit that won't
	// be lost at this stage, but is below the single-precision rounding
	// position.
	//
	// However, if -enable-unsafe-fp-math is in effect, accept double
	// rounding to avoid the extra overhead.
	if (Op.getValueType() == MVT::f32 &&
	!Subtarget.hasFPCVT() &&
	!DAG.getTarget().Options.UnsafeFPMath) {

	// Twiddle input to make sure the low 11 bits are zero. (If this
	// is the case, we are guaranteed the value will fit into the 53 bit
	// mantissa of an IEEE double-precision value without rounding.)
	// If any of those low 11 bits were not zero originally, make sure
	// bit 12 (value 2048) is set instead, so that the final rounding
	// to single-precision gets the correct result.
	SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
	SINT, DAG.getConstant(2047, dl, MVT::i64));
	Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
	Round, DAG.getConstant(2047, dl, MVT::i64));
	Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
	Round = DAG.getNode(ISD::AND, dl, MVT::i64,
	Round, DAG.getConstant(-2048, dl, MVT::i64));

	// However, we cannot use that value unconditionally: if the magnitude
	// of the input value is small, the bit-twiddling we did above might
	// end up visibly changing the output. Fortunately, in that case, we
	// don't need to twiddle bits since the original input will convert
	// exactly to double-precision floating-point already. Therefore,
	// construct a conditional to use the original value if the top 11
	// bits are all sign-bit copies, and use the rounded value computed
	// above otherwise.
	SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
	SINT, DAG.getConstant(53, dl, MVT::i32));
	Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
	Cond, DAG.getConstant(1, dl, MVT::i64));
	Cond = DAG.getSetCC(dl, MVT::i32,
	Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);

	SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
	}

	ReuseLoadInfo RLI;
	SDValue Bits;

	MachineFunction &MF = DAG.getMachineFunction();
	if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
	Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
	RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
	spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
	} else if (Subtarget.hasLFIWAX() &&
	canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
	MachineMemOperand *MMO =
	MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
	RLI.Alignment, RLI.AAInfo, RLI.Ranges);
	SDValue Ops[] = { RLI.Chain, RLI.Ptr };
	Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
	DAG.getVTList(MVT::f64, MVT::Other),
	Ops, MVT::i32, MMO);
	spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
	} else if (Subtarget.hasFPCVT() &&
	canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
	MachineMemOperand *MMO =
	MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
	RLI.Alignment, RLI.AAInfo, RLI.Ranges);
	SDValue Ops[] = { RLI.Chain, RLI.Ptr };
	Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
	DAG.getVTList(MVT::f64, MVT::Other),
	Ops, MVT::i32, MMO);
	spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
	} else if (((Subtarget.hasLFIWAX() &&
	SINT.getOpcode() == ISD::SIGN_EXTEND) \|\|
	(Subtarget.hasFPCVT() &&
	SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
	SINT.getOperand(0).getValueType() == MVT::i32) {
	MachineFrameInfo &MFI = MF.getFrameInfo();
	EVT PtrVT = getPointerTy(DAG.getDataLayout());

	int FrameIdx = MFI.CreateStackObject(4, 4, false);
	SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

	SDValue Store =
	DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(), FrameIdx));

	assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
	"Expected an i32 store");

	RLI.Ptr = FIdx;
	RLI.Chain = Store;
	RLI.MPI =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
	RLI.Alignment = 4;

	MachineMemOperand *MMO =
	MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
	RLI.Alignment, RLI.AAInfo, RLI.Ranges);
	SDValue Ops[] = { RLI.Chain, RLI.Ptr };
	Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
	PPCISD::LFIWZX : PPCISD::LFIWAX,
	dl, DAG.getVTList(MVT::f64, MVT::Other),
	Ops, MVT::i32, MMO);
	} else
	Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);

	SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);

	if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
	FP = DAG.getNode(ISD::FP_ROUND, dl,
	MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
	return FP;
	}

	assert(Op.getOperand(0).getValueType() == MVT::i32 &&
	"Unhandled INT_TO_FP type in custom expander!");
	// Since we only generate this in 64-bit mode, we can take advantage of
	// 64-bit registers. In particular, sign extend the input value into the
	// 64-bit register with extsw, store the WHOLE 64-bit value into the stack
	// then lfd it and fcfid it.
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	EVT PtrVT = getPointerTy(MF.getDataLayout());

	SDValue Ld;
	if (Subtarget.hasLFIWAX() \|\| Subtarget.hasFPCVT()) {
	ReuseLoadInfo RLI;
	bool ReusingLoad;
	if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
	DAG))) {
	int FrameIdx = MFI.CreateStackObject(4, 4, false);
	SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

	SDValue Store =
	DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(), FrameIdx));

	assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
	"Expected an i32 store");

	RLI.Ptr = FIdx;
	RLI.Chain = Store;
	RLI.MPI =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
	RLI.Alignment = 4;
	}

	MachineMemOperand *MMO =
	MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
	RLI.Alignment, RLI.AAInfo, RLI.Ranges);
	SDValue Ops[] = { RLI.Chain, RLI.Ptr };
	Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
	PPCISD::LFIWZX : PPCISD::LFIWAX,
	dl, DAG.getVTList(MVT::f64, MVT::Other),
	Ops, MVT::i32, MMO);
	if (ReusingLoad)
	spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
	} else {
	assert(Subtarget.isPPC64() &&
	"i32->FP without LFIWAX supported only on PPC64");

	int FrameIdx = MFI.CreateStackObject(8, 8, false);
	SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

	SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
	Op.getOperand(0));

	// STD the extended value into the stack slot.
	SDValue Store = DAG.getStore(
	DAG.getEntryNode(), dl, Ext64, FIdx,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));

	// Load the value as a double.
	Ld = DAG.getLoad(
	MVT::f64, dl, Store, FIdx,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
	}

	// FCFID it and return it.
	SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
	if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
	FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
	DAG.getIntPtrConstant(0, dl));
	return FP;
	}

	/// Lower FLT_ROUNDS_ by reading the FPSCR with MFFS, spilling the resulting
	/// f64 to a stack slot, reloading its low 32 bits as an integer and remapping
	/// the two rounding-mode bits to the FLT_ROUNDS encoding. The i32 result is
	/// then zero-extended or truncated to the node's result type.
	SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  /*
	   The rounding mode is in bits 30:31 of FPSCR, and has the following
	   settings:
	     00 Round to nearest
	     01 Round to 0
	     10 Round to +inf
	     11 Round to -inf

	  FLT_ROUNDS, on the other hand, expects the following:
	    -1 Undefined
	     0 Round to 0
	     1 Round to nearest
	     2 Round to +inf
	     3 Round to -inf

	  To perform the conversion, we do:
	    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
	  */

	  MachineFunction &MF = DAG.getMachineFunction();
	  EVT VT = Op.getValueType();
	  EVT PtrVT = getPointerTy(MF.getDataLayout());

	  // Save FP Control Word to register
	  EVT NodeTys[] = {
	    MVT::f64,    // return register
	    MVT::Glue    // unused in this context
	  };
	  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);

	  // Save FP register to stack slot
	  int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
	                               MachinePointerInfo());

	  // Load FP Control Word from low 32 bits of stack slot. The low-order word
	  // of the 8-byte slot sits at byte offset 4 on big-endian subtargets but at
	  // byte offset 0 on little-endian ones.
	  unsigned LowWordOffset = Subtarget.isLittleEndian() ? 0 : 4;
	  SDValue Offset = DAG.getConstant(LowWordOffset, dl, PtrVT);
	  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Offset);
	  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());

	  // Transform as necessary
	  SDValue CWD1 =
	    DAG.getNode(ISD::AND, dl, MVT::i32,
	                CWD, DAG.getConstant(3, dl, MVT::i32));
	  SDValue CWD2 =
	    DAG.getNode(ISD::SRL, dl, MVT::i32,
	                DAG.getNode(ISD::AND, dl, MVT::i32,
	                            DAG.getNode(ISD::XOR, dl, MVT::i32,
	                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
	                            DAG.getConstant(3, dl, MVT::i32)),
	                DAG.getConstant(1, dl, MVT::i32));

	  SDValue RetVal =
	    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);

	  // RetVal is an i32; pick extension or truncation by comparing against the
	  // requested result width rather than a fixed 16-bit threshold, which would
	  // try to "zero extend" i32 to a 16-bit result.
	  return DAG.getZExtOrTrunc(RetVal, dl, VT);
	}

	/// Custom-lower SHL_PARTS (operands: low part, high part, shift amount) into
	/// PPC shift nodes and return the {low, high} result pair.
	SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  const EVT VT = Op.getValueType();
	  const unsigned BitWidth = VT.getSizeInBits();
	  assert(Op.getNumOperands() == 3 &&
	         VT == Op.getOperand(1).getValueType() &&
	         "Unexpected SHL!");

	  // The sequence below is purely logical ops; it relies on how the PPC
	  // shift nodes behave when handed an oversized shift amount.
	  const SDValue LoIn = Op.getOperand(0);
	  const SDValue HiIn = Op.getOperand(1);
	  const SDValue ShAmt = Op.getOperand(2);
	  const EVT AmtVT = ShAmt.getValueType();

	  // BitWidth - Amt
	  SDValue WidthCst = DAG.getConstant(BitWidth, dl, AmtVT);
	  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT, WidthCst, ShAmt);

	  // (Hi << Amt) | (Lo >> (BitWidth - Amt))
	  SDValue HiShifted = DAG.getNode(PPCISD::SHL, dl, VT, HiIn, ShAmt);
	  SDValue LoCarry = DAG.getNode(PPCISD::SRL, dl, VT, LoIn, InvAmt);
	  SDValue HiMerged = DAG.getNode(ISD::OR, dl, VT, HiShifted, LoCarry);

	  // Amt - BitWidth, and Lo shifted left by that amount.
	  SDValue NegWidthCst = DAG.getConstant(-BitWidth, dl, AmtVT);
	  SDValue OverAmt = DAG.getNode(ISD::ADD, dl, AmtVT, ShAmt, NegWidthCst);
	  SDValue LoOver = DAG.getNode(PPCISD::SHL, dl, VT, LoIn, OverAmt);

	  SDValue Results[] = {
	    DAG.getNode(PPCISD::SHL, dl, VT, LoIn, ShAmt),   // low result
	    DAG.getNode(ISD::OR, dl, VT, HiMerged, LoOver)   // high result
	  };
	  return DAG.getMergeValues(Results, dl);
	}

	/// Custom-lower SRL_PARTS (operands: low part, high part, shift amount) into
	/// PPC shift nodes and return the {low, high} result pair.
	SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  const EVT VT = Op.getValueType();
	  const unsigned BitWidth = VT.getSizeInBits();
	  assert(Op.getNumOperands() == 3 &&
	         VT == Op.getOperand(1).getValueType() &&
	         "Unexpected SRL!");

	  // The sequence below is purely logical ops; it relies on how the PPC
	  // shift nodes behave when handed an oversized shift amount.
	  const SDValue LoIn = Op.getOperand(0);
	  const SDValue HiIn = Op.getOperand(1);
	  const SDValue ShAmt = Op.getOperand(2);
	  const EVT AmtVT = ShAmt.getValueType();

	  // BitWidth - Amt
	  SDValue WidthCst = DAG.getConstant(BitWidth, dl, AmtVT);
	  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT, WidthCst, ShAmt);

	  // (Lo >> Amt) | (Hi << (BitWidth - Amt))
	  SDValue LoShifted = DAG.getNode(PPCISD::SRL, dl, VT, LoIn, ShAmt);
	  SDValue HiCarry = DAG.getNode(PPCISD::SHL, dl, VT, HiIn, InvAmt);
	  SDValue LoMerged = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiCarry);

	  // Amt - BitWidth, and Hi shifted right by that amount.
	  SDValue NegWidthCst = DAG.getConstant(-BitWidth, dl, AmtVT);
	  SDValue OverAmt = DAG.getNode(ISD::ADD, dl, AmtVT, ShAmt, NegWidthCst);
	  SDValue HiOver = DAG.getNode(PPCISD::SRL, dl, VT, HiIn, OverAmt);

	  SDValue LoResult = DAG.getNode(ISD::OR, dl, VT, LoMerged, HiOver);
	  SDValue HiResult = DAG.getNode(PPCISD::SRL, dl, VT, HiIn, ShAmt);
	  SDValue Results[] = { LoResult, HiResult };
	  return DAG.getMergeValues(Results, dl);
	}

	/// Custom-lower SRA_PARTS (operands: low part, high part, shift amount) into
	/// PPC shift nodes plus a select_cc, and return the {low, high} result pair.
	SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();
	  const unsigned BitWidth = VT.getSizeInBits();
	  SDLoc dl(Op);
	  assert(Op.getNumOperands() == 3 &&
	         VT == Op.getOperand(1).getValueType() &&
	         "Unexpected SRA!");

	  // Logical ops first; the low half is finally chosen with a select_cc.
	  const SDValue LoIn = Op.getOperand(0);
	  const SDValue HiIn = Op.getOperand(1);
	  const SDValue ShAmt = Op.getOperand(2);
	  const EVT AmtVT = ShAmt.getValueType();

	  // BitWidth - Amt
	  SDValue WidthCst = DAG.getConstant(BitWidth, dl, AmtVT);
	  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT, WidthCst, ShAmt);

	  // (Lo >> Amt) | (Hi << (BitWidth - Amt))
	  SDValue LoShifted = DAG.getNode(PPCISD::SRL, dl, VT, LoIn, ShAmt);
	  SDValue HiCarry = DAG.getNode(PPCISD::SHL, dl, VT, HiIn, InvAmt);
	  SDValue LoMerged = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiCarry);

	  // Amt - BitWidth, and Hi arithmetically shifted right by that amount.
	  SDValue NegWidthCst = DAG.getConstant(-BitWidth, dl, AmtVT);
	  SDValue OverAmt = DAG.getNode(ISD::ADD, dl, AmtVT, ShAmt, NegWidthCst);
	  SDValue HiOver = DAG.getNode(PPCISD::SRA, dl, VT, HiIn, OverAmt);

	  SDValue HiResult = DAG.getNode(PPCISD::SRA, dl, VT, HiIn, ShAmt);
	  // Low half: the merged value when (Amt - BitWidth) <= 0, otherwise the
	  // over-shifted high half.
	  SDValue LoResult = DAG.getSelectCC(dl, OverAmt,
	                                     DAG.getConstant(0, dl, AmtVT),
	                                     LoMerged, HiOver, ISD::SETLE);
	  SDValue Results[] = { LoResult, HiResult };
	  return DAG.getMergeValues(Results, dl);
	}

	//===----------------------------------------------------------------------===//
	// Vector related lowering.
	//

	/// BuildSplatI - Produce the canonical splat-immediate of Val for elements
	/// that are SplatSize bytes wide, and bitcast it to VT. When VT is MVT::Other
	/// the result keeps the canonical vector type for the requested SplatSize.
	static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
	                           SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");

	  // Canonical vector type for each element size, indexed by (bytes - 1).
	  static const MVT VTys[] = {
	    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
	  };

	  // The requested type is derived from the caller's SplatSize, before the
	  // -1 canonicalization below narrows the element size.
	  EVT ReqVT = VT;
	  if (VT == MVT::Other)
	    ReqVT = VTys[SplatSize-1];

	  // A splat of -1 is all-ones at any width, so always build it bytewise
	  // (vspltis[hw] -1 becomes vspltisb -1).
	  const unsigned CanonSize = (Val == -1) ? 1 : SplatSize;
	  EVT CanonicalVT = VTys[CanonSize-1];

	  SDValue Splat = DAG.getConstant(Val, dl, CanonicalVT);
	  return DAG.getBitcast(ReqVT, Splat);
	}

	/// BuildIntrinsicOp - Wrap a single operand in an INTRINSIC_WO_CHAIN node for
	/// intrinsic IID. The result type is DestVT, or the operand's own type when
	/// DestVT is left as MVT::Other.
	static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
	                                const SDLoc &dl, EVT DestVT = MVT::Other) {
	  const EVT ResultVT = DestVT == MVT::Other ? Op.getValueType() : DestVT;
	  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID, Op);
	}

	/// BuildIntrinsicOp - Wrap two operands in an INTRINSIC_WO_CHAIN node for
	/// intrinsic IID. The result type is DestVT, or the type of LHS when DestVT
	/// is left as MVT::Other.
	static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
	                                SelectionDAG &DAG, const SDLoc &dl,
	                                EVT DestVT = MVT::Other) {
	  const EVT ResultVT = DestVT == MVT::Other ? LHS.getValueType() : DestVT;
	  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID,
	                     LHS, RHS);
	}

	/// BuildIntrinsicOp - Wrap three operands in an INTRINSIC_WO_CHAIN node for
	/// intrinsic IID. The result type is DestVT, or the type of Op0 when DestVT
	/// is left as MVT::Other.
	static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
	                                SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
	                                EVT DestVT = MVT::Other) {
	  const EVT ResultVT = DestVT == MVT::Other ? Op0.getValueType() : DestVT;
	  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID,
	                     Op0, Op1, Op2);
	}

	/// BuildVSLDOI - Build the v16i8 VECTOR_SHUFFLE of LHS and RHS whose mask is
	/// Amt, Amt+1, ..., Amt+15 (i.e. a vsldoi by Amt bytes), then bitcast the
	/// shuffle to VT.
	static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
	                           SelectionDAG &DAG, const SDLoc &dl) {
	  // The shuffle is expressed on bytes, so view both inputs as v16i8.
	  SDValue LHSBytes = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
	  SDValue RHSBytes = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);

	  // Sixteen consecutive byte indices starting at Amt.
	  int ByteMask[16];
	  unsigned NextIdx = Amt;
	  for (int &Elt : ByteMask)
	    Elt = NextIdx++;

	  SDValue Shuffled =
	    DAG.getVectorShuffle(MVT::v16i8, dl, LHSBytes, RHSBytes, ByteMask);
	  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffled);
	}

	/// Do we have an efficient pattern in a .td file for this node?
	///
	/// \param V - pointer to the BuildVectorSDNode being matched
	/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
	/// \param HasP8Vector - does this subtarget have the P8 vector feature?
	///        (required here for v4f32 to be considered)
	///
	/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
	/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
	/// the opposite is true (expansion is beneficial) are:
	/// - The node builds a vector out of integers that are not 32 or 64-bits
	/// - The node builds a vector out of constants
	/// - The node is a "load-and-splat"
	/// In all other cases, we will choose to keep the BUILD_VECTOR.
	///
	/// \returns true if the node should be kept as a BUILD_VECTOR, false if it
	/// should be expanded.
	static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
	                                            bool HasDirectMove,
	                                            bool HasP8Vector) {
	  EVT VecVT = V->getValueType(0);
	  bool RightType = VecVT == MVT::v2f64 ||
	    (HasP8Vector && VecVT == MVT::v4f32) ||
	    (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
	  if (!RightType)
	    return false;

	  bool IsSplat = true;
	  bool IsLoad = false;
	  SDValue Op0 = V->getOperand(0);

	  // This function is called in a block that confirms the node is not a constant
	  // splat. So a constant BUILD_VECTOR here means the vector is built out of
	  // different constants.
	  if (V->isConstant())
	    return false;
	  for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
	    SDValue Elt = V->getOperand(i);
	    if (Elt.isUndef())
	      return false;
	    // We want to expand nodes that represent load-and-splat even if the
	    // loaded value is a floating point truncation or conversion to int.
	    unsigned EltOpc = Elt.getOpcode();
	    if (EltOpc == ISD::LOAD ||
	        ((EltOpc == ISD::FP_ROUND || EltOpc == ISD::FP_TO_SINT ||
	          EltOpc == ISD::FP_TO_UINT) &&
	         Elt.getOperand(0).getOpcode() == ISD::LOAD))
	      IsLoad = true;
	    // If the operands are different or the input is not a load and has more
	    // uses than just this BV node, then it isn't a splat.
	    if (Elt != Op0 ||
	        (!IsLoad && !V->isOnlyUserOf(Elt.getNode())))
	      IsSplat = false;
	  }
	  return !(IsSplat && IsLoad);
	}

	// If this is a case we can't handle, return null and let the default
	// expansion code take care of it.  If we CAN select this case, and if it
	// selects to a single instruction, return Op.  Otherwise, if we can codegen
	// this case more efficiently than a constant pool load, lower it to the
	// sequence of ops that should be used.
	SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");

	  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
	    // We first build an i32 vector, load it into a QPX register,
	    // then convert it to a floating-point vector and compare it
	    // to a zero vector to get the boolean result.
	    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    int FrameIdx = MFI.CreateStackObject(16, 16, false);
	    MachinePointerInfo PtrInfo =
	        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
	    EVT PtrVT = getPointerTy(DAG.getDataLayout());
	    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

	    assert(BVN->getNumOperands() == 4 &&
	      "BUILD_VECTOR for v4i1 does not have 4 operands");

	    bool IsConst = true;
	    for (unsigned i = 0; i < 4; ++i) {
	      if (BVN->getOperand(i).isUndef()) continue;
	      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
	        IsConst = false;
	        break;
	      }
	    }

	    if (IsConst) {
	      // All-constant (or undef) operands: materialize the booleans as a
	      // constant-pool vector of floats, -1.0 for a zero operand and 1.0 for
	      // any other constant, and load it with QVLFSb.
	      Constant *One =
	        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
	      Constant *NegOne =
	        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);

	      Constant *CV[4];
	      for (unsigned i = 0; i < 4; ++i) {
	        if (BVN->getOperand(i).isUndef())
	          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
	        else if (isNullConstant(BVN->getOperand(i)))
	          CV[i] = NegOne;
	        else
	          CV[i] = One;
	      }

	      Constant *CP = ConstantVector::get(CV);
	      SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
	                                          16 /* alignment */);

	      SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
	      SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
	      return DAG.getMemIntrinsicNode(
	          PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	    }

	    // Non-constant operands: store each defined element as an i32 into its
	    // 4-byte lane of the stack slot.
	    SmallVector<SDValue, 4> Stores;
	    for (unsigned i = 0; i < 4; ++i) {
	      if (BVN->getOperand(i).isUndef()) continue;

	      unsigned Offset = 4*i;
	      SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
	      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

	      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
	      if (StoreSize > 4) {
	        Stores.push_back(
	            DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
	                              PtrInfo.getWithOffset(Offset), MVT::i32));
	      } else {
	        SDValue StoreValue = BVN->getOperand(i);
	        if (StoreSize < 4)
	          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);

	        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
	                                      PtrInfo.getWithOffset(Offset)));
	      }
	    }

	    SDValue StoreChain;
	    if (!Stores.empty())
	      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
	    else
	      StoreChain = DAG.getEntryNode();

	    // Now load from v4i32 into the QPX register; this will extend it to
	    // v4i64 but not yet convert it to a floating point. Nevertheless, this
	    // is typed as v4f64 because the QPX register integer states are not
	    // explicitly represented.

	    SDValue Ops[] = {StoreChain,
	                     DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
	                     FIdx};
	    SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});

	    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
	      dl, VTs, Ops, MVT::v4i32, PtrInfo);
	    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
	      DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
	      LoadedVect);

	    SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);

	    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
	  }

	  // All other QPX vectors are handled by generic code.
	  if (Subtarget.hasQPX())
	    return SDValue();

	  // Check if this is a splat of a constant value.
	  APInt APSplatBits, APSplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
	                             HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
	      SplatBitSize > 32) {
	    // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
	    // lowered to VSX instructions under certain conditions.
	    // Without VSX, there is no pattern more efficient than expanding the node.
	    if (Subtarget.hasVSX() &&
	        haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
	                                        Subtarget.hasP8Vector()))
	      return Op;
	    return SDValue();
	  }

	  unsigned SplatBits = APSplatBits.getZExtValue();
	  unsigned SplatUndef = APSplatUndef.getZExtValue();
	  unsigned SplatSize = SplatBitSize / 8;

	  // First, handle single instruction cases.

	  // All zeros?
	  if (SplatBits == 0) {
	    // Canonicalize all zero vectors to be v4i32.
	    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
	      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
	      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
	    }
	    return Op;
	  }

	  // We have XXSPLTIB for constant splats one byte wide
	  if (Subtarget.hasP9Vector() && SplatSize == 1) {
	    // This is a splat of 1-byte elements with some elements potentially undef.
	    // Rather than trying to match undef in the SDAG patterns, ensure that all
	    // elements are the same constant.
	    if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) {
	      SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits,
	                                                       dl, MVT::i32));
	      SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops);
	      if (Op.getValueType() != MVT::v16i8)
	        return DAG.getBitcast(Op.getValueType(), NewBV);
	      return NewBV;
	    }

	    // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll
	    // detect that constant splats like v8i16: 0xABAB are really just splats
	    // of a 1-byte constant. In this case, we need to convert the node to a
	    // splat of v16i8 and a bitcast.
	    if (Op.getValueType() != MVT::v16i8)
	      return DAG.getBitcast(Op.getValueType(),
	                            DAG.getConstant(SplatBits, dl, MVT::v16i8));

	    return Op;
	  }

	  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
	  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
	                    (32-SplatBitSize));
	  if (SextVal >= -16 && SextVal <= 15)
	    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);

	  // Two instruction sequences.

	  // If this value is in the range [-32,30] and is even, use:
	  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
	  // If this value is in the range [17,31] and is odd, use:
	  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
	  // If this value is in the range [-31,-17] and is odd, use:
	  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
	  // Note the last two are three-instruction sequences.
	  if (SextVal >= -32 && SextVal <= 31) {
	    // To avoid having these optimizations undone by constant folding,
	    // we convert to a pseudo that will be expanded later into one of
	    // the above forms.
	    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
	    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
	              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
	    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
	    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
	    if (VT == Op.getValueType())
	      return RetVal;
	    else
	      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
	  }

	  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
	  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
	  // for fneg/fabs.
	  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
	    // Make -1 and vspltisw -1:
	    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);

	    // Make the VSLW intrinsic, computing 0x8000_0000.
	    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
	                                   OnesV, DAG, dl);

	    // xor by OnesV to invert it.
	    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
	    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
	  }

	  // Check to see if this is a wide variety of vsplti*, binop self cases.
	  static const signed char SplatCsts[] = {
	    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
	    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
	  };

	  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
	    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
	    // cases which are ambiguous (e.g. formation of 0x8000_0000): -1 is the
	    // first entry of the table and is therefore tried first.
	    int i = SplatCsts[idx];

	    // Figure out what shift amount will be used by altivec if shifted by i in
	    // this splat size.
	    unsigned TypeShiftAmt = i & (SplatBitSize-1);

	    // vsplti + shl self.
	    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
	      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
	      static const unsigned IIDs[] = { // Intrinsic to use for each size.
	        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
	        Intrinsic::ppc_altivec_vslw
	      };
	      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
	      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
	    }

	    // vsplti + srl self.
	    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
	      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
	      static const unsigned IIDs[] = { // Intrinsic to use for each size.
	        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
	        Intrinsic::ppc_altivec_vsrw
	      };
	      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
	      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
	    }

	    // There is intentionally no "vsplti + sra self" case: an arithmetic
	    // right shift of a splat constant in [-16,15] stays within [-16,15],
	    // and such values were already returned by the single-instruction
	    // VSPLTI path above.

	    // vsplti + rol self.
	    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
	                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
	      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
	      static const unsigned IIDs[] = { // Intrinsic to use for each size.
	        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
	        Intrinsic::ppc_altivec_vrlw
	      };
	      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
	      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
	    }

	    // t = vsplti c, result = vsldoi t, t, 1
	    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
	      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
	      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
	      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
	    }
	    // t = vsplti c, result = vsldoi t, t, 2
	    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
	      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
	      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
	      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
	    }
	    // t = vsplti c, result = vsldoi t, t, 3
	    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
	      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
	      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
	      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
	    }
	  }

	  return SDValue();
	}

	/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
	/// the specified operations to build the shuffle.
	///
	/// PFEntry packs an operation number in bits 26-29 and two 13-bit operand
	/// IDs (LHS in bits 13-25, RHS in bits 0-12). For OP_COPY the LHS ID selects
	/// one of the two inputs directly; for every other operation both IDs index
	/// PerfectShuffleTable and are expanded recursively before being combined.
	static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
	                                      SDValue RHS, SelectionDAG &DAG,
	                                      const SDLoc &dl) {
	  unsigned OpNum = (PFEntry >> 26) & 0x0F;
	  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
	  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);

	  enum {
	    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
	    OP_VMRGHW,
	    OP_VMRGLW,
	    OP_VSPLTISW0,
	    OP_VSPLTISW1,
	    OP_VSPLTISW2,
	    OP_VSPLTISW3,
	    OP_VSLDOI4,
	    OP_VSLDOI8,
	    OP_VSLDOI12
	  };

	  if (OpNum == OP_COPY) {
	    // IDs are four base-9 digits: <0,1,2,3> names LHS, <4,5,6,7> names RHS.
	    if (LHSID == (1*9+2)*9+3) return LHS;
	    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
	    return RHS;
	  }

	  SDValue OpLHS, OpRHS;
	  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
	  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);

	  // Byte-level shuffle mask for the merge/splat operations; the vsldoi
	  // operations return directly from the switch instead.
	  int ShufIdxs[16];
	  switch (OpNum) {
	  default: llvm_unreachable("Unknown i32 permute!");
	  case OP_VMRGHW:
	    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
	    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
	    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
	    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
	    break;
	  case OP_VMRGLW:
	    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
	    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
	    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
	    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
	    break;
	  case OP_VSPLTISW0:
	    for (unsigned i = 0; i != 16; ++i)
	      ShufIdxs[i] = (i&3)+0;
	    break;
	  case OP_VSPLTISW1:
	    for (unsigned i = 0; i != 16; ++i)
	      ShufIdxs[i] = (i&3)+4;
	    break;
	  case OP_VSPLTISW2:
	    for (unsigned i = 0; i != 16; ++i)
	      ShufIdxs[i] = (i&3)+8;
	    break;
	  case OP_VSPLTISW3:
	    for (unsigned i = 0; i != 16; ++i)
	      ShufIdxs[i] = (i&3)+12;
	    break;
	  case OP_VSLDOI4:
	    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
	  case OP_VSLDOI8:
	    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
	  case OP_VSLDOI12:
	    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
	  }
	  EVT VT = OpLHS.getValueType();
	  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
	  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
	  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
	  return DAG.getNode(ISD::BITCAST, dl, VT, T);
	}

	/// lowerToVINSERTB - If this VECTOR_SHUFFLE moves exactly one byte from one
	/// operand into an otherwise in-order copy of the other, build the matching
	/// VECINSERT node (preceded by a VECSHL when the source byte first has to be
	/// rotated into place) for the ISA 3.0 VINSERTB instruction. Otherwise return
	/// a default-constructed SDValue.
	SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
	                                           SelectionDAG &DAG) const {
	  const unsigned BytesInVector = 16;
	  const bool IsLE = Subtarget.isLittleEndian();
	  SDLoc dl(N);
	  SDValue V1 = N->getOperand(0);
	  SDValue V2 = N->getOperand(1);
	  const bool V2IsUndef = V2.isUndef();

	  // Shifts required to get the byte we want at element 7, indexed by the
	  // low four bits of the source element number.
	  unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
	                                   0, 15, 14, 13, 12, 11, 10, 9};
	  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
	                                1, 2, 3, 4, 5, 6, 7, 8};

	  ArrayRef<int> Mask = N->getMask();

	  // A candidate mask is the identity with exactly one position replaced.
	  // Inserting an element from V2 into V1 looks like one of:
	  //   X,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
	  //   0,  X,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
	  //   ...
	  //   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  X
	  // Inserting from V1 into V2 is the same with the in-order elements taken
	  // from the range [16,31].

	  // When both shuffle inputs are the same vector the mask only refers to the
	  // first operand and the second is undef; in that case the only source
	  // element considered is this one.
	  const unsigned VINSERTBSrcElem = IsLE ? 8 : 7;

	  unsigned ShiftElts = 0, InsertAtByte = 0;
	  bool Swap = false;
	  bool FoundCandidate = false;

	  // Try each mask position as the single inserted byte.
	  for (unsigned i = 0; i < BytesInVector; ++i) {
	    const unsigned CurrentElement = Mask[i];
	    if (V2IsUndef && CurrentElement != VINSERTBSrcElem)
	      continue;

	    // If the inserted byte comes from V1 [0,15], the remaining bytes are
	    // expected to come from V2 [16,31], and vice versa. With an undef second
	    // operand everything is taken from the first operand.
	    const int MaskOffset =
	      (!V2IsUndef && CurrentElement < BytesInVector) ? BytesInVector : 0;

	    // Every other mask position must be in its original order.
	    bool OtherElementsInOrder = true;
	    for (unsigned j = 0; j < BytesInVector && OtherElementsInOrder; ++j)
	      if (j != i && Mask[j] != static_cast<int>(j) + MaskOffset)
	        OtherElementsInOrder = false;
	    if (!OtherElementsInOrder)
	      continue;

	    // Record how far the source has to be shifted to bring the wanted byte
	    // to element 7 and whether the operands trade places. With an undef
	    // second operand neither a shift nor a swap is used, which is what
	    // ShiftElts and Swap already hold.
	    if (!V2IsUndef) {
	      // Only the low 4 bits matter for the shift because the operands are
	      // swapped when CurrentElement is >= 2^4.
	      const unsigned ShiftIdx = CurrentElement & 0xF;
	      ShiftElts = IsLE ? LittleEndianShifts[ShiftIdx]
	                       : BigEndianShifts[ShiftIdx];
	      Swap = CurrentElement < BytesInVector;
	    }
	    InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
	    FoundCandidate = true;
	    break;
	  }

	  if (!FoundCandidate)
	    return SDValue();

	  // Candidate found: emit VECINSERT, fed by a VECSHL of the source when a
	  // shift is required.
	  if (Swap)
	    std::swap(V1, V2);
	  if (V2.isUndef())
	    V2 = V1;

	  SDValue Src = V2;
	  if (ShiftElts)
	    Src = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
	                      DAG.getConstant(ShiftElts, dl, MVT::i32));
	  return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Src,
	                     DAG.getConstant(InsertAtByte, dl, MVT::i32));
	}

	/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
	/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
	/// SDValue.
	///
	/// The half-word view of the shuffle mask is packed into a 32-bit word (one
	/// nibble per element) and compared, with one element position masked out,
	/// against the in-order patterns 0..7 and 8..15. On a match the result is a
	/// VECINSERT of the (optionally VECSHL-rotated) source into the other vector.
	SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
	                                           SelectionDAG &DAG) const {
	  const unsigned NumHalfWords = 8;
	  const unsigned BytesInVector = NumHalfWords * 2;
	  // Check that the shuffle is on half-words.
	  if (!isNByteElemShuffleMask(N, 2, 1))
	    return SDValue();

	  bool IsLE = Subtarget.isLittleEndian();
	  SDLoc dl(N);
	  SDValue V1 = N->getOperand(0);
	  SDValue V2 = N->getOperand(1);
	  unsigned ShiftElts = 0, InsertAtByte = 0;
	  bool Swap = false;

	  // Shifts required to get the half-word we want at element 3.
	  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
	  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};

	  uint32_t Mask = 0;
	  uint32_t OriginalOrderLow = 0x1234567;
	  uint32_t OriginalOrderHigh = 0x89ABCDEF;
	  // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
	  // 32-bit space, only need 4-bit nibbles per element.
	  for (unsigned i = 0; i < NumHalfWords; ++i) {
	    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
	    Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
	  }

	  // For each mask element, find out if we're just inserting something
	  // from V2 into V1 or vice versa. Possible permutations inserting an element
	  // from V2 into V1:
	  //   X, 1, 2, 3, 4, 5, 6, 7
	  //   0, X, 2, 3, 4, 5, 6, 7
	  //   0, 1, X, 3, 4, 5, 6, 7
	  //   0, 1, 2, X, 4, 5, 6, 7
	  //   0, 1, 2, 3, X, 5, 6, 7
	  //   0, 1, 2, 3, 4, X, 6, 7
	  //   0, 1, 2, 3, 4, 5, X, 7
	  //   0, 1, 2, 3, 4, 5, 6, X
	  // Inserting from V1 into V2 will be similar, except mask range will be
	  // [8,15].

	  bool FoundCandidate = false;
	  // Go through the mask of half-words to find an element that's being moved
	  // from one vector to the other.
	  for (unsigned i = 0; i < NumHalfWords; ++i) {
	    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
	    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
	    // Unsigned literal: for MaskShift == 28 the nibble reaches bit 31, which
	    // must not be shifted into the sign bit of a plain int.
	    uint32_t MaskOtherElts = ~(0xFu << MaskShift);
	    uint32_t TargetOrder = 0x0;

	    // If both vector operands for the shuffle are the same vector, the mask
	    // will contain only elements from the first one and the second one will
	    // be undef.
	    if (V2.isUndef()) {
	      ShiftElts = 0;
	      unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
	      TargetOrder = OriginalOrderLow;
	      Swap = false;
	      // Skip if not the correct element or mask of other elements don't equal
	      // to our expected order.
	      if (MaskOneElt == VINSERTHSrcElem &&
	          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
	        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
	        FoundCandidate = true;
	        break;
	      }
	    } else { // If both operands are defined.
	      // Target order is [8,15] if the current mask is between [0,7].
	      TargetOrder =
	          (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
	      // Skip if mask of other elements don't equal our expected order.
	      if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
	        // We only need the last 3 bits for the number of shifts.
	        ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
	                         : BigEndianShifts[MaskOneElt & 0x7];
	        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
	        Swap = MaskOneElt < NumHalfWords;
	        FoundCandidate = true;
	        break;
	      }
	    }
	  }

	  if (!FoundCandidate)
	    return SDValue();

	  // Candidate found, construct the proper SDAG sequence with VINSERTH,
	  // optionally with VECSHL if shift is required.
	  if (Swap)
	    std::swap(V1, V2);
	  if (V2.isUndef())
	    V2 = V1;
	  SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
	  SDValue InsertSrc = V2;
	  if (ShiftElts) {
	    // Double ShiftElts because we're left shifting on v16i8 type.
	    InsertSrc = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
	                            DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
	  }
	  SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, InsertSrc);
	  SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
	                            DAG.getConstant(InsertAtByte, dl, MVT::i32));
	  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
	}

	/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
	/// is a shuffle we can handle in a single instruction, return it. Otherwise,
	/// return the code it can be lowered into. Worst case, it can always be
	/// lowered into a vperm.
	SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	EVT VT = Op.getValueType();
	bool isLittleEndian = Subtarget.isLittleEndian();

	unsigned ShiftElts, InsertAtByte;
	bool Swap = false;
	if (Subtarget.hasP9Vector() &&
	PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
	isLittleEndian)) {
	if (Swap)
	std::swap(V1, V2);
	SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
	SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
	if (ShiftElts) {
	SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
	DAG.getConstant(ShiftElts, dl, MVT::i32));
	SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
	DAG.getConstant(InsertAtByte, dl, MVT::i32));
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
	}
	SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
	DAG.getConstant(InsertAtByte, dl, MVT::i32));
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
	}

	if (Subtarget.hasP9Altivec()) {
	SDValue NewISDNode;
	if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
	return NewISDNode;

	if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
	return NewISDNode;
	}

	if (Subtarget.hasVSX() &&
	PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
	if (Swap)
	std::swap(V1, V2);
	SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
	SDValue Conv2 =
	DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);

	SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
	DAG.getConstant(ShiftElts, dl, MVT::i32));
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
	}

	if (Subtarget.hasVSX() &&
	PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
	if (Swap)
	std::swap(V1, V2);
	SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
	SDValue Conv2 =
	DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);

	SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
	DAG.getConstant(ShiftElts, dl, MVT::i32));
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
	}

	if (Subtarget.hasP9Vector()) {
	if (PPC::isXXBRHShuffleMask(SVOp)) {
	SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
	SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv);
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
	} else if (PPC::isXXBRWShuffleMask(SVOp)) {
	SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
	SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv);
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
	} else if (PPC::isXXBRDShuffleMask(SVOp)) {
	SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
	SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv);
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
	} else if (PPC::isXXBRQShuffleMask(SVOp)) {
	SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
	SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv);
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
	}
	}

	if (Subtarget.hasVSX()) {
	if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
	int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);

	// If the source for the shuffle is a scalar_to_vector that came from a
	// 32-bit load, it will have used LXVWSX so we don't need to splat again.
	if (Subtarget.hasP9Vector() &&
	((isLittleEndian && SplatIdx == 3) \|\|
	(!isLittleEndian && SplatIdx == 0))) {
	SDValue Src = V1.getOperand(0);
	if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	Src.getOperand(0).getOpcode() == ISD::LOAD &&
	Src.getOperand(0).hasOneUse())
	return V1;
	}
	SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
	SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
	DAG.getConstant(SplatIdx, dl, MVT::i32));
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
	}

	// Left shifts of 8 bytes are actually swaps. Convert accordingly.
	if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
	SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
	SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
	return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
	}
	}

	if (Subtarget.hasQPX()) {
	if (VT.getVectorNumElements() != 4)
	return SDValue();

	if (V2.isUndef()) V2 = V1;

	int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
	if (AlignIdx != -1) {
	return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
	DAG.getConstant(AlignIdx, dl, MVT::i32));
	} else if (SVOp->isSplat()) {
	int SplatIdx = SVOp->getSplatIndex();
	if (SplatIdx >= 4) {
	std::swap(V1, V2);
	SplatIdx -= 4;
	}

	return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
	DAG.getConstant(SplatIdx, dl, MVT::i32));
	}

	// Lower this into a qvgpci/qvfperm pair.

	// Compute the qvgpci literal
	unsigned idx = 0;
	for (unsigned i = 0; i < 4; ++i) {
	int m = SVOp->getMaskElt(i);
	unsigned mm = m >= 0 ? (unsigned) m : i;
	idx \|= mm << (3-i)*3;
	}

	SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
	DAG.getConstant(idx, dl, MVT::i32));
	return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
	}

	// Cases that are handled by instructions that take permute immediates
	// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
	// selected by the instruction selector.
	if (V2.isUndef()) {
	if (PPC::isSplatShuffleMask(SVOp, 1) \|\|
	PPC::isSplatShuffleMask(SVOp, 2) \|\|
	PPC::isSplatShuffleMask(SVOp, 4) \|\|
	PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) \|\|
	PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) \|\|
	PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 \|\|
	PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) \|\|
	PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) \|\|
	PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) \|\|
	PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) \|\|
	PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) \|\|
	PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) \|\|
	(Subtarget.hasP8Altivec() && (
	PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) \|\|
	PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) \|\|
	PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
	return Op;
	}
	}

	// Altivec has a variety of "shuffle immediates" that take two vector inputs
	// and produce a fixed permutation. If any of these match, do not lower to
	// VPERM.
	unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
	if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) \|\|
	PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) \|\|
	PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 \|\|
	PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) \|\|
	PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) \|\|
	PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) \|\|
	PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) \|\|
	PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) \|\|
	PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) \|\|
	(Subtarget.hasP8Altivec() && (
	PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) \|\|
	PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) \|\|
	PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
	return Op;

	// Check to see if this is a shuffle of 4-byte values. If so, we can use our
	// perfect shuffle table to emit an optimal matching sequence.
	ArrayRef<int> PermMask = SVOp->getMask();

	unsigned PFIndexes[4];
	bool isFourElementShuffle = true;
	for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
	unsigned EltNo = 8; // Start out undef.
	for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
	if (PermMask[i*4+j] < 0)
	continue; // Undef, ignore it.

	unsigned ByteSource = PermMask[i*4+j];
	if ((ByteSource & 3) != j) {
	isFourElementShuffle = false;
	break;
	}

	if (EltNo == 8) {
	EltNo = ByteSource/4;
	} else if (EltNo != ByteSource/4) {
	isFourElementShuffle = false;
	break;
	}
	}
	PFIndexes[i] = EltNo;
	}

	// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
	// perfect shuffle vector to determine if it is cost effective to do this as
	// discrete instructions, or whether we should use a vperm.
	// For now, we skip this for little endian until such time as we have a
	// little-endian perfect shuffle table.
	if (isFourElementShuffle && !isLittleEndian) {
	// Compute the index in the perfect shuffle table.
	unsigned PFTableIndex =
	PFIndexes[0]999+PFIndexes[1]99+PFIndexes[2]9+PFIndexes[3];

	unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	unsigned Cost = (PFEntry >> 30);

	// Determining when to avoid vperm is tricky. Many things affect the cost
	// of vperm, particularly how many times the perm mask needs to be computed.
	// For example, if the perm mask can be hoisted out of a loop or is already
	// used (perhaps because there are multiple permutes with the same shuffle
	// mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
	// the loop requires an extra register.
	//
	// As a compromise, we only emit discrete instructions if the shuffle can be
	// generated in 3 or fewer operations. When we have loop information
	// available, if this block is within a loop, we should avoid using vperm
	// for 3-operation perms and use a constant pool load instead.
	if (Cost < 3)
	return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	}

	// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
	// vector that will get spilled to the constant pool.
	if (V2.isUndef()) V2 = V1;

	// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
	// that it is in input element units, not in bytes. Convert now.

	// For little endian, the order of the input vectors is reversed, and
	// the permutation mask is complemented with respect to 31. This is
	// necessary to produce proper semantics with the big-endian-biased vperm
	// instruction.
	EVT EltVT = V1.getValueType().getVectorElementType();
	unsigned BytesPerElement = EltVT.getSizeInBits()/8;

	SmallVector<SDValue, 16> ResultMask;
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];

	for (unsigned j = 0; j != BytesPerElement; ++j)
	if (isLittleEndian)
	ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
	dl, MVT::i32));
	else
	ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
	MVT::i32));
	}

	SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
	if (isLittleEndian)
	return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
	V2, V1, VPermMask);
	else
	return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
	V1, V2, VPermMask);
	}

	/// getVectorCompareInfo - Classify an intrinsic node as a vector comparison.
	///
	/// Returns false, leaving \p CompareOpc at -1 and \p isDot false, when the
	/// intrinsic is not a vector comparison or the subtarget lacks the feature it
	/// needs. Otherwise returns true with \p CompareOpc set to the comparison's
	/// opcode number and \p isDot set for the predicate ("_p") forms.
	static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
	                                 bool &isDot, const PPCSubtarget &Subtarget) {
	  const unsigned IntrinsicID =
	      cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
	  CompareOpc = -1;
	  isDot = false;

	  // Stores the result only when the gating feature is present; the outputs
	  // keep their "not a comparison" defaults otherwise.
	  auto Record = [&CompareOpc, &isDot](bool Supported, int Opc,
	                                      bool Dot) -> bool {
	    if (!Supported)
	      return false;
	    CompareOpc = Opc;
	    isDot = Dot;
	    return true;
	  };

	  const bool HasP8 = Subtarget.hasP8Altivec();
	  const bool HasP9 = Subtarget.hasP9Altivec();
	  const bool HasVSX = Subtarget.hasVSX();

	  switch (IntrinsicID) {
	  default:
	    return false;

	  // Comparison predicates.
	  case Intrinsic::ppc_altivec_vcmpbfp_p:  return Record(true, 966, true);
	  case Intrinsic::ppc_altivec_vcmpeqfp_p: return Record(true, 198, true);
	  case Intrinsic::ppc_altivec_vcmpequb_p: return Record(true, 6, true);
	  case Intrinsic::ppc_altivec_vcmpequh_p: return Record(true, 70, true);
	  case Intrinsic::ppc_altivec_vcmpequw_p: return Record(true, 134, true);
	  case Intrinsic::ppc_altivec_vcmpequd_p: return Record(HasP8, 199, true);
	  case Intrinsic::ppc_altivec_vcmpneb_p:  return Record(HasP9, 7, true);
	  case Intrinsic::ppc_altivec_vcmpneh_p:  return Record(HasP9, 71, true);
	  case Intrinsic::ppc_altivec_vcmpnew_p:  return Record(HasP9, 135, true);
	  case Intrinsic::ppc_altivec_vcmpnezb_p: return Record(HasP9, 263, true);
	  case Intrinsic::ppc_altivec_vcmpnezh_p: return Record(HasP9, 327, true);
	  case Intrinsic::ppc_altivec_vcmpnezw_p: return Record(HasP9, 391, true);
	  case Intrinsic::ppc_altivec_vcmpgefp_p: return Record(true, 454, true);
	  case Intrinsic::ppc_altivec_vcmpgtfp_p: return Record(true, 710, true);
	  case Intrinsic::ppc_altivec_vcmpgtsb_p: return Record(true, 774, true);
	  case Intrinsic::ppc_altivec_vcmpgtsh_p: return Record(true, 838, true);
	  case Intrinsic::ppc_altivec_vcmpgtsw_p: return Record(true, 902, true);
	  case Intrinsic::ppc_altivec_vcmpgtsd_p: return Record(HasP8, 967, true);
	  case Intrinsic::ppc_altivec_vcmpgtub_p: return Record(true, 518, true);
	  case Intrinsic::ppc_altivec_vcmpgtuh_p: return Record(true, 582, true);
	  case Intrinsic::ppc_altivec_vcmpgtuw_p: return Record(true, 646, true);
	  case Intrinsic::ppc_altivec_vcmpgtud_p: return Record(HasP8, 711, true);

	  // VSX predicate comparisons use the same infrastructure
	  case Intrinsic::ppc_vsx_xvcmpeqdp_p: return Record(HasVSX, 99, true);
	  case Intrinsic::ppc_vsx_xvcmpgedp_p: return Record(HasVSX, 115, true);
	  case Intrinsic::ppc_vsx_xvcmpgtdp_p: return Record(HasVSX, 107, true);
	  case Intrinsic::ppc_vsx_xvcmpeqsp_p: return Record(HasVSX, 67, true);
	  case Intrinsic::ppc_vsx_xvcmpgesp_p: return Record(HasVSX, 83, true);
	  case Intrinsic::ppc_vsx_xvcmpgtsp_p: return Record(HasVSX, 75, true);

	  // Normal Comparisons.
	  case Intrinsic::ppc_altivec_vcmpbfp:  return Record(true, 966, false);
	  case Intrinsic::ppc_altivec_vcmpeqfp: return Record(true, 198, false);
	  case Intrinsic::ppc_altivec_vcmpequb: return Record(true, 6, false);
	  case Intrinsic::ppc_altivec_vcmpequh: return Record(true, 70, false);
	  case Intrinsic::ppc_altivec_vcmpequw: return Record(true, 134, false);
	  case Intrinsic::ppc_altivec_vcmpequd: return Record(HasP8, 199, false);
	  case Intrinsic::ppc_altivec_vcmpneb:  return Record(HasP9, 7, false);
	  case Intrinsic::ppc_altivec_vcmpneh:  return Record(HasP9, 71, false);
	  case Intrinsic::ppc_altivec_vcmpnew:  return Record(HasP9, 135, false);
	  case Intrinsic::ppc_altivec_vcmpnezb: return Record(HasP9, 263, false);
	  case Intrinsic::ppc_altivec_vcmpnezh: return Record(HasP9, 327, false);
	  case Intrinsic::ppc_altivec_vcmpnezw: return Record(HasP9, 391, false);
	  case Intrinsic::ppc_altivec_vcmpgefp: return Record(true, 454, false);
	  case Intrinsic::ppc_altivec_vcmpgtfp: return Record(true, 710, false);
	  case Intrinsic::ppc_altivec_vcmpgtsb: return Record(true, 774, false);
	  case Intrinsic::ppc_altivec_vcmpgtsh: return Record(true, 838, false);
	  case Intrinsic::ppc_altivec_vcmpgtsw: return Record(true, 902, false);
	  case Intrinsic::ppc_altivec_vcmpgtsd: return Record(HasP8, 967, false);
	  case Intrinsic::ppc_altivec_vcmpgtub: return Record(true, 518, false);
	  case Intrinsic::ppc_altivec_vcmpgtuh: return Record(true, 582, false);
	  case Intrinsic::ppc_altivec_vcmpgtuw: return Record(true, 646, false);
	  case Intrinsic::ppc_altivec_vcmpgtud: return Record(HasP8, 711, false);
	  }
	}

	/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
	/// lower, do it, otherwise return null.
	///
	/// Three cases are handled: llvm.thread_pointer (a register read), the
	/// vmaxs{b,h,w} "max(a, 0-a)" idiom on P9 (turned into ISD::ABS), and the
	/// vector comparisons recognized by getVectorCompareInfo (VCMP, or VCMPo
	/// followed by extraction of one CR6 bit for the predicate forms).
	SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  unsigned IntrinsicID =
	    cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  SDLoc dl(Op);

	  if (IntrinsicID == Intrinsic::thread_pointer) {
	    // Reads the thread pointer register, used for __builtin_thread_pointer.
	    if (Subtarget.isPPC64())
	      return DAG.getRegister(PPC::X13, MVT::i64);
	    return DAG.getRegister(PPC::R2, MVT::i32);
	  }

	  // We are looking for absolute values here.
	  // The idea is to try to fit one of two patterns:
	  //  max (a, (0-a))  OR  max ((0-a), a)
	  if (Subtarget.hasP9Vector() &&
	      (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
	       IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
	       IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
	    SDValue V1 = Op.getOperand(1);
	    SDValue V2 = Op.getOperand(2);
	    if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
	        (V1.getSimpleValueType() == MVT::v4i32 ||
	         V1.getSimpleValueType() == MVT::v8i16 ||
	         V1.getSimpleValueType() == MVT::v16i8)) {
	      // True when Neg is the node (sub <all-zeros build_vector>, Val).
	      auto IsZeroMinus = [](SDValue Neg, SDValue Val) -> bool {
	        return Neg.getOpcode() == ISD::SUB &&
	               ISD::isBuildVectorAllZeros(Neg.getOperand(0).getNode()) &&
	               Neg.getOperand(1) == Val;
	      };

	      // Generate the abs instruction with the non-negated operand.
	      if (IsZeroMinus(V1, V2))
	        return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);

	      if (IsZeroMinus(V2, V1))
	        return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
	    }
	  }

	  // If this is a lowered altivec predicate compare, CompareOpc is set to the
	  // opcode number of the comparison.
	  int CompareOpc;
	  bool isDot;
	  if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
	    return SDValue();    // Don't custom lower most intrinsics.

	  // If this is a non-dot comparison, make the VCMP node and we are done.
	  if (!isDot) {
	    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
	                              Op.getOperand(1), Op.getOperand(2),
	                              DAG.getConstant(CompareOpc, dl, MVT::i32));
	    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
	  }

	  // Create the PPCISD altivec 'dot' comparison node. For the predicate forms
	  // operand 1 is the CR6 selector (decoded below), so the compared values are
	  // operands 2 and 3.
	  SDValue Ops[] = {
	    Op.getOperand(2),  // LHS
	    Op.getOperand(3),  // RHS
	    DAG.getConstant(CompareOpc, dl, MVT::i32)
	  };
	  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
	  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

	  // Now that we have the comparison, emit a copy from the CR to a GPR.
	  // This is flagged to the above dot comparison.
	  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
	                              DAG.getRegister(PPC::CR6, MVT::i32),
	                              CompNode.getValue(1));

	  // Unpack the result based on how the target uses it.
	  unsigned BitNo;   // Bit # of CR6.
	  bool InvertBit;   // Invert result?
	  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
	  default:  // Can't happen, don't crash on invalid number though.
	  case 0:   // Return the value of the EQ bit of CR6.
	    BitNo = 0; InvertBit = false;
	    break;
	  case 1:   // Return the inverted value of the EQ bit of CR6.
	    BitNo = 0; InvertBit = true;
	    break;
	  case 2:   // Return the value of the LT bit of CR6.
	    BitNo = 2; InvertBit = false;
	    break;
	  case 3:   // Return the inverted value of the LT bit of CR6.
	    BitNo = 2; InvertBit = true;
	    break;
	  }

	  // Shift the bit into the low position.
	  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
	                      DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
	  // Isolate the bit.
	  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
	                      DAG.getConstant(1, dl, MVT::i32));

	  // If we are supposed to, toggle the bit.
	  if (InvertBit)
	    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
	                        DAG.getConstant(1, dl, MVT::i32));
	  return Flags;
	}

	/// Custom-lower void intrinsics. Only llvm.ppc.cfence is handled; it becomes
	/// a CFENCE8 machine node taking the any-extended argument and the incoming
	/// chain. Every other intrinsic yields a null SDValue.
	SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
	  // the beginning of the argument list.
	  const int IDIndex = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
	  SDLoc DL(Op);

	  const auto IntrinsicID =
	      cast<ConstantSDNode>(Op.getOperand(IDIndex))->getZExtValue();
	  if (IntrinsicID != Intrinsic::ppc_cfence)
	    return SDValue();

	  assert(IDIndex == 1 && "llvm.ppc.cfence must carry a chain argument.");
	  assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
	  SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
	                                 Op.getOperand(IDIndex + 1));
	  SDNode *Fence = DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, Extended,
	                                     Op.getOperand(0));
	  return SDValue(Fence, 0);
	}

	/// Custom lowering hook for SREM/UREM. Returns a null SDValue when a
	/// same-signedness division of the same two operands exists among the users
	/// of the divisor; otherwise returns \p Op unchanged.
	SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
	  const unsigned RemOpc = Op.getOpcode();
	  // Check for a DIV with the same operands as this REM.
	  for (SDNode *User : Op.getOperand(1)->uses()) {
	    const unsigned UserOpc = User->getOpcode();
	    const bool IsMatchingDiv =
	        (RemOpc == ISD::SREM && UserOpc == ISD::SDIV) ||
	        (RemOpc == ISD::UREM && UserOpc == ISD::UDIV);
	    if (!IsMatchingDiv)
	      continue;
	    if (User->getOperand(0) == Op.getOperand(0) &&
	        User->getOperand(1) == Op.getOperand(1))
	      return SDValue();
	  }
	  return Op;
	}

	// Lower scalar BSWAP64 to xxbrd: place the scalar in both lanes of a v2i64,
	// reverse each doubleword, then extract one lane back out.
	SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Scalar = Op.getOperand(0);
	  // MTVSRDD
	  SDValue Pair =
	      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Scalar, Scalar);
	  // XXBRD
	  SDValue Reversed = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Pair);
	  // MFVSRD: element 1 on little endian, element 0 otherwise.
	  const int VectorIndex = Subtarget.isLittleEndian() ? 1 : 0;
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Reversed,
	                     DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
	}

	+// ATOMIC_CMP_SWAP on i8/i16 compares against an atomically loaded value, and
	+// atomic loads zero-extend, so the compare operand must be zero-extended too.
	+// Accesses of 32 bits or more, and compare operands whose high bits are
	+// already known zero, are returned unchanged.
	+SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
	+                                                SelectionDAG &DAG) const {
	+  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
	+         "Expecting an atomic compare-and-swap here.");
	+  SDLoc dl(Op);
	+  auto *CmpSwap = cast<AtomicSDNode>(Op.getNode());
	+  EVT MemVT = CmpSwap->getMemoryVT();
	+  const unsigned MemBits = MemVT.getSizeInBits();
	+  if (MemBits >= 32)
	+    return Op;
	+
	+  SDValue Expected = Op.getOperand(2);
	+  // Nothing to do when the bits above the memory width are known zero.
	+  if (DAG.MaskedValueIsZero(Expected,
	+                            APInt::getHighBitsSet(32, 32 - MemBits)))
	+    return Op;
	+
	+  // Keep only the low MemBits bits of the compare operand.
	+  unsigned LowMask = (1 << MemBits) - 1;
	+  SDValue Masked = DAG.getNode(ISD::AND, dl, MVT::i32, Expected,
	+                               DAG.getConstant(LowMask, dl, MVT::i32));
	+
	+  // Rebuild the operand list with the masked compare value in slot 2.
	+  SmallVector<SDValue, 4> NewOps(CmpSwap->op_begin(), CmpSwap->op_end());
	+  NewOps[2] = Masked;
	+
	+  MachineMemOperand *MMO = CmpSwap->getMemOperand();
	+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	+  const unsigned Opcode = (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8
	+                                             : PPCISD::ATOMIC_CMP_SWAP_16;
	+  return DAG.getMemIntrinsicNode(Opcode, dl, ResultTys, NewOps, MemVT, MMO);
	+}
	+
	/// Custom-lower SIGN_EXTEND_INREG. Only v2i64 results are handled; any other
	/// result type yields a null SDValue.
	SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  if (Op.getValueType() != MVT::v2i64)
	    return SDValue();

	  // For v2i64 (VSX), the v2i32 case can be pattern matched directly (using
	  // fp <-> int instructions); smaller source types are first extended up to
	  // v2i32 and then extended again.
	  EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	  if (ExtVT == MVT::v2i32)
	    return Op;

	  // Step 1: sign-extend in place within 32-bit lanes, viewing the input as
	  // v4i32.
	  EVT NarrowVT =
	      EVT::getVectorVT(*DAG.getContext(), ExtVT.getVectorElementType(), 4);
	  SDValue AsWords = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
	  SDValue WordExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
	                                AsWords, DAG.getValueType(NarrowVT));

	  // Step 2: back to v2i64 and extend from v2i32.
	  SDValue AsDWords = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, WordExt);
	  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, AsDWords,
	                     DAG.getValueType(MVT::v2i32));
	}

	/// Lower SCALAR_TO_VECTOR through memory: store the scalar at the start of a
	/// fresh 16-byte, 16-byte-aligned stack slot and load the slot back as the
	/// result vector type.
	SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  SDLoc dl(Op);

	  // Create a stack slot that is 16-byte aligned.
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  const int SlotIdx = FrameInfo.CreateStackObject(16, 16, false);
	  SDValue SlotAddr =
	      DAG.getFrameIndex(SlotIdx, getPointerTy(DAG.getDataLayout()));

	  // Store the input value into Value#0 of the stack slot.
	  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
	                               SlotAddr, MachinePointerInfo());
	  // Load it out.
	  return DAG.getLoad(Op.getValueType(), dl, Chain, SlotAddr,
	                     MachinePointerInfo());
	}

	/// Custom-lower INSERT_VECTOR_ELT. Variable indices yield a null SDValue. For
	/// v8i16 and v16i8 with a constant index the element is moved into a vector
	/// with MTVSRZ and inserted with VECINSERT at the computed byte offset. Other
	/// types are returned unchanged.
	SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
	         "Should only be called for ISD::INSERT_VECTOR_ELT");

	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	  // We have legal lowering for constant indices but not for variable ones.
	  if (!C)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  SDLoc dl(Op);
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
	  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
	    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
	    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
	    unsigned InsertAtElement = C->getZExtValue();
	    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
	    // On little endian the byte offset is mirrored within the 16-byte
	    // vector.
	    if (Subtarget.isLittleEndian()) {
	      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
	    }
	    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
	                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
	  }
	  return Op;
	}

	/// Lower EXTRACT_VECTOR_ELT from a v4i1 vector. The booleans are converted to
	/// 0/1 integers, stored to a stack slot, and the requested element is loaded
	/// back as an i32 (truncated to i1 when CR bits are in use).
	SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDNode *N = Op.getNode();

	  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
	         "Unknown extract_vector_elt type");

	  SDValue Value = N->getOperand(0);

	  // The first part of this is like the store lowering except that we don't
	  // need to track the chain.

	  // The values are now known to be -1 (false) or 1 (true). To convert this
	  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
	  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
	  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

	  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
	  // understand how to form the extending load.
	  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

	  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

	  // Now convert to an integer and store.
	  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
	    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
	    Value);

	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  int FrameIdx = MFI.CreateStackObject(16, 16, false);
	  MachinePointerInfo PtrInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

	  SDValue StoreChain = DAG.getEntryNode();
	  SDValue Ops[] = {StoreChain,
	                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
	                   Value, FIdx};
	  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);

	  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
	    dl, VTs, Ops, MVT::v4i32, PtrInfo);

	  // Extract the value requested: each stored element occupies 4 bytes.
	  unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	  SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
	  Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

	  SDValue IntVal =
	      DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));

	  if (!Subtarget.useCRBits())
	    return IntVal;

	  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
	}

	/// Custom lowering for QPX vector loads (v4f64, v4f32 and v4i1).
	///
	/// For v4f64/v4f32, a load whose alignment is at least the store size of its
	/// memory type is returned unchanged. Otherwise it is split into four scalar
	/// loads (extending loads when the in-memory scalar type differs from the
	/// result scalar type) whose values are recombined with a BUILD_VECTOR.
	///
	/// For v4i1, each element is read as an i8 with an EXTLOAD to i32 and the
	/// four results are handed to the BUILD_VECTOR logic.
	///
	/// \param Op  the LOAD node to lower.
	/// \param DAG the selection DAG being built.
	/// \returns \p Op itself when no change is needed, otherwise a MERGE_VALUES
	/// node carrying the loaded vector and a TokenFactor of the element load
	/// chains (plus result 1 of the first element load for indexed loads).
	SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
	  SDValue LoadChain = LN->getChain();
	  SDValue BasePtr = LN->getBasePtr();

	  if (Op.getValueType() == MVT::v4f64 ||
	      Op.getValueType() == MVT::v4f32) {
	    EVT MemVT = LN->getMemoryVT();
	    unsigned Alignment = LN->getAlignment();

	    // If this load is properly aligned, then it is legal.
	    if (Alignment >= MemVT.getStoreSize())
	      return Op;

	    EVT ScalarVT = Op.getValueType().getScalarType(),
	        ScalarMemVT = MemVT.getScalarType();
	    // Distance in bytes between consecutive elements in memory.
	    unsigned Stride = ScalarMemVT.getStoreSize();

	    SDValue Vals[4], LoadChains[4];
	    for (unsigned Idx = 0; Idx < 4; ++Idx) {
	      SDValue Load;
	      if (ScalarVT != ScalarMemVT)
	        Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
	                              BasePtr,
	                              LN->getPointerInfo().getWithOffset(Idx * Stride),
	                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
	                              LN->getMemOperand()->getFlags(), LN->getAAInfo());
	      else
	        Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
	                           LN->getPointerInfo().getWithOffset(Idx * Stride),
	                           MinAlign(Alignment, Idx * Stride),
	                           LN->getMemOperand()->getFlags(), LN->getAAInfo());

	      // Only the first element load carries the original indexing.
	      if (Idx == 0 && LN->isIndexed()) {
	        assert(LN->getAddressingMode() == ISD::PRE_INC &&
	               "Unknown addressing mode on vector load");
	        Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
	                                  LN->getAddressingMode());
	      }

	      Vals[Idx] = Load;
	      // NOTE(review): for the indexed element load, result 1 is presumably the
	      // updated base pointer rather than the chain -- confirm this is the
	      // intended result to feed into the TokenFactor below.
	      LoadChains[Idx] = Load.getValue(1);

	      // Advance to the next element.
	      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
	                            DAG.getConstant(Stride, dl,
	                                            BasePtr.getValueType()));
	    }

	    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
	    SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);

	    if (LN->isIndexed()) {
	      SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
	      return DAG.getMergeValues(RetOps, dl);
	    }

	    SDValue RetOps[] = { Value, TF };
	    return DAG.getMergeValues(RetOps, dl);
	  }

	  assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
	  assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");

	  // To lower v4i1 from a byte array, we load the byte elements of the
	  // vector and then reuse the BUILD_VECTOR logic.

	  SDValue VectElmts[4], VectElmtChains[4];
	  for (unsigned i = 0; i < 4; ++i) {
	    // Address of byte i: BasePtr + i.
	    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
	    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);

	    VectElmts[i] = DAG.getExtLoad(
	        ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
	        LN->getPointerInfo().getWithOffset(i), MVT::i8,
	        /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
	    VectElmtChains[i] = VectElmts[i].getValue(1);
	  }

	  LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
	  SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);

	  SDValue RVals[] = { Value, LoadChain };
	  return DAG.getMergeValues(RVals, dl);
	}

	/// Custom lowering for QPX vector stores (v4f64, v4f32 and v4i1).
	///
	/// For v4f64/v4f32, a store whose alignment is at least the store size of its
	/// memory type is returned unchanged. Otherwise each element is extracted and
	/// written with its own scalar store (a truncating store when the in-memory
	/// scalar type differs from the value's scalar type).
	///
	/// For v4i1, the vector is converted to floating point, mapped onto 0/1,
	/// converted to integers, spilled to a 16-byte stack slot, reloaded as four
	/// i32 words and finally written to the destination as four i8 truncating
	/// stores.
	///
	/// \param Op  the STORE node to lower.
	/// \param DAG the selection DAG being built.
	/// \returns \p Op itself when no change is needed, otherwise a TokenFactor of
	/// the element stores (wrapped in MERGE_VALUES together with result 1 of the
	/// first element store for indexed stores).
	SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
	  SDValue StoreChain = SN->getChain();
	  SDValue BasePtr = SN->getBasePtr();
	  SDValue Value = SN->getValue();

	  if (Value.getValueType() == MVT::v4f64 ||
	      Value.getValueType() == MVT::v4f32) {
	    EVT MemVT = SN->getMemoryVT();
	    unsigned Alignment = SN->getAlignment();

	    // If this store is properly aligned, then it is legal.
	    if (Alignment >= MemVT.getStoreSize())
	      return Op;

	    EVT ScalarVT = Value.getValueType().getScalarType(),
	        ScalarMemVT = MemVT.getScalarType();
	    // Distance in bytes between consecutive elements in memory.
	    unsigned Stride = ScalarMemVT.getStoreSize();

	    SDValue Stores[4];
	    for (unsigned Idx = 0; Idx < 4; ++Idx) {
	      SDValue Ex = DAG.getNode(
	          ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
	          DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
	      SDValue Store;
	      if (ScalarVT != ScalarMemVT)
	        Store =
	            DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
	                              SN->getPointerInfo().getWithOffset(Idx * Stride),
	                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
	                              SN->getMemOperand()->getFlags(), SN->getAAInfo());
	      else
	        Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
	                             SN->getPointerInfo().getWithOffset(Idx * Stride),
	                             MinAlign(Alignment, Idx * Stride),
	                             SN->getMemOperand()->getFlags(), SN->getAAInfo());

	      // Only the first element store carries the original indexing.
	      if (Idx == 0 && SN->isIndexed()) {
	        assert(SN->getAddressingMode() == ISD::PRE_INC &&
	               "Unknown addressing mode on vector store");
	        Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
	                                    SN->getAddressingMode());
	      }

	      // Advance to the next element.
	      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
	                            DAG.getConstant(Stride, dl,
	                                            BasePtr.getValueType()));
	      Stores[Idx] = Store;
	    }

	    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

	    if (SN->isIndexed()) {
	      SDValue RetOps[] = { TF, Stores[0].getValue(1) };
	      return DAG.getMergeValues(RetOps, dl);
	    }

	    return TF;
	  }

	  assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
	  assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");

	  // The values are now known to be -1 (false) or 1 (true). To convert this
	  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
	  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
	  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

	  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
	  // understand how to form the extending load.
	  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

	  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

	  // Now convert to an integer and store.
	  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
	    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
	    Value);

	  // Temporary 16-byte, 16-byte-aligned stack slot for the integer words.
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  int FrameIdx = MFI.CreateStackObject(16, 16, false);
	  MachinePointerInfo PtrInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

	  SDValue Ops[] = {StoreChain,
	                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
	                   Value, FIdx};
	  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);

	  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
	    dl, VTs, Ops, MVT::v4i32, PtrInfo);

	  // Move data into the byte array.
	  SDValue Loads[4], LoadChains[4];
	  for (unsigned i = 0; i < 4; ++i) {
	    // Word i lives at byte offset 4*i within the stack slot.
	    unsigned Offset = 4*i;
	    SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
	    Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

	    Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
	                           PtrInfo.getWithOffset(Offset));
	    LoadChains[i] = Loads[i].getValue(1);
	  }

	  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);

	  SDValue Stores[4];
	  for (unsigned i = 0; i < 4; ++i) {
	    // Destination byte i: BasePtr + i.
	    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
	    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);

	    Stores[i] = DAG.getTruncStore(
	        StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
	        MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
	        SN->getAAInfo());
	  }

	  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

	  return StoreChain;
	}

	/// Custom lowering for vector integer multiplies using Altivec intrinsics.
	///
	/// Supported result types:
	///  * v4i32: built from a vmulouh of the operands plus a vmsumuhm of the
	///    left operand with the halfword-rotated right operand, the latter
	///    shifted left by 16 bits.
	///  * v8i16: a single vmladduhm with a zero addend.
	///  * v16i8: vmuleub and vmuloub products merged with a byte shuffle whose
	///    mask depends on endianness.
	///
	/// Any other type reaches llvm_unreachable.
	SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  if (Op.getValueType() == MVT::v4i32) {
	    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

	    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
	    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.

	    SDValue RHSSwap =   // = vrlw RHS, 16
	      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);

	    // Shrinkify inputs to v8i16.
	    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
	    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
	    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);

	    // Low parts multiplied together, generating 32-bit results (we ignore the
	    // top parts).
	    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
	                                      LHS, RHS, DAG, dl, MVT::v4i32);

	    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
	                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
	    // Shift the high parts up 16 bits.
	    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
	                              Neg16, DAG, dl);
	    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
	  } else if (Op.getValueType() == MVT::v8i16) {
	    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

	    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);

	    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
	                            LHS, RHS, Zero, DAG, dl);
	  } else if (Op.getValueType() == MVT::v16i8) {
	    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
	    bool isLittleEndian = Subtarget.isLittleEndian();

	    // Multiply the even 8-bit parts, producing 16-bit sums.
	    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
	                                         LHS, RHS, DAG, dl, MVT::v8i16);
	    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);

	    // Multiply the odd 8-bit parts, producing 16-bit sums.
	    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
	                                        LHS, RHS, DAG, dl, MVT::v8i16);
	    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);

	    // Merge the results together.  Because vmuleub and vmuloub are
	    // instructions with a big-endian bias, we must reverse the
	    // element numbering and reverse the meaning of "odd" and "even"
	    // when generating little endian code.
	    //
	    // Each 16-bit product contributes one byte to the result: mask entries
	    // 0..15 select from the first shuffle operand, 16..31 from the second.
	    int Ops[16];
	    for (unsigned i = 0; i != 8; ++i) {
	      if (isLittleEndian) {
	        Ops[i*2  ] = 2*i;
	        Ops[i*2+1] = 2*i+16;
	      } else {
	        Ops[i*2  ] = 2*i+1;
	        Ops[i*2+1] = 2*i+1+16;
	      }
	    }
	    if (isLittleEndian)
	      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
	    else
	      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
	  } else {
	    llvm_unreachable("Unknown mul to lower!");
	  }
	}

	/// LowerOperation - Provide custom lowering hooks for some operations.
	///
	/// Dispatches on the opcode of \p Op to the matching Lower* helper and
	/// returns its result. Opcodes without a case reach llvm_unreachable.
	/// ISD::INTRINSIC_W_CHAIN is the one case that performs no lowering here: it
	/// returns an empty SDValue.
	SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	switch (Op.getOpcode()) {
	default: llvm_unreachable("Wasn't expecting to be able to lower this!");
	// Address-like nodes.
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::JumpTable: return LowerJumpTable(Op, DAG);
	case ISD::SETCC: return LowerSETCC(Op, DAG);
	case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
	case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
	// Variable-argument handling.
	case ISD::VASTART:
	return LowerVASTART(Op, DAG);

	case ISD::VAARG:
	return LowerVAARG(Op, DAG);

	case ISD::VACOPY:
	return LowerVACOPY(Op, DAG);

	// Stack manipulation.
	case ISD::STACKRESTORE:
	return LowerSTACKRESTORE(Op, DAG);

	case ISD::DYNAMIC_STACKALLOC:
	return LowerDYNAMIC_STACKALLOC(Op, DAG);

	case ISD::GET_DYNAMIC_AREA_OFFSET:
	return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);

	case ISD::EH_DWARF_CFA:
	return LowerEH_DWARF_CFA(Op, DAG);

	// setjmp/longjmp-style exception handling.
	case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
	case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);

	case ISD::LOAD: return LowerLOAD(Op, DAG);
	case ISD::STORE: return LowerSTORE(Op, DAG);
	case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
	case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
	// Both FP-to-integer conversions share one helper, as do both
	// integer-to-FP conversions.
	case ISD::FP_TO_UINT:
	case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
	SDLoc(Op));
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
	case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);

	// Lower 64-bit shifts.
	case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
	case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
	case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);

	// Vector-related lowering.
	case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
	case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
	case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::MUL: return LowerMUL(Op, DAG);

	// For counter-based loop handling.
	case ISD::INTRINSIC_W_CHAIN: return SDValue();

	// Frame & Return address.
	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);

	case ISD::INTRINSIC_VOID:
	return LowerINTRINSIC_VOID(Op, DAG);
	// Signed and unsigned remainder share one helper.
	case ISD::SREM:
	case ISD::UREM:
	return LowerREM(Op, DAG);
	case ISD::BSWAP:
	return LowerBSWAP(Op, DAG);
	+ case ISD::ATOMIC_CMP_SWAP:
	+ return LowerATOMIC_CMP_SWAP(Op, DAG);
	}
	}

	/// Custom type legalization: build replacement values for the results of
	/// \p N and append them to \p Results.
	///
	/// Handled opcodes are READCYCLECOUNTER, INTRINSIC_W_CHAIN (only the
	/// ppc_is_decremented_ctr_nonzero intrinsic), VAARG, FP_ROUND_INREG and
	/// FP_TO_SINT/FP_TO_UINT. Several paths deliberately leave \p Results empty:
	/// other chained intrinsics, VAARG outside 32-bit SVR4 or for types other
	/// than i64, and FP-to-integer conversions from ppcf128. Any other opcode
	/// reaches llvm_unreachable.
	///
	/// \param N       the node whose results need replacing.
	/// \param Results output list receiving the replacement values.
	/// \param DAG     the selection DAG being built.
	void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
	                                           SmallVectorImpl<SDValue>&Results,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(N);
	  switch (N->getOpcode()) {
	  default:
	    llvm_unreachable("Do not know how to custom type legalize this operation!");
	  case ISD::READCYCLECOUNTER: {
	    // Read the time base as two i32 halves plus a chain.
	    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
	    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));

	    Results.push_back(RTB);
	    Results.push_back(RTB.getValue(1));
	    Results.push_back(RTB.getValue(2));
	    break;
	  }
	  case ISD::INTRINSIC_W_CHAIN: {
	    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
	        Intrinsic::ppc_is_decremented_ctr_nonzero)
	      break;

	    assert(N->getValueType(0) == MVT::i1 &&
	           "Unexpected result type for CTR decrement intrinsic");
	    // Recreate the intrinsic node with the setcc result type for i1 as its
	    // value type.
	    EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                 N->getValueType(0));
	    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
	    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
	                                 N->getOperand(1));

	    Results.push_back(NewInt);
	    Results.push_back(NewInt.getValue(1));
	    break;
	  }
	  case ISD::VAARG: {
	    // Only 32-bit SVR4 is handled here.
	    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
	      return;

	    EVT VT = N->getValueType(0);

	    if (VT == MVT::i64) {
	      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);

	      Results.push_back(NewNode);
	      Results.push_back(NewNode.getValue(1));
	    }
	    return;
	  }
	  case ISD::FP_ROUND_INREG: {
	    assert(N->getValueType(0) == MVT::ppcf128);
	    assert(N->getOperand(0).getValueType() == MVT::ppcf128);
	    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
	                             MVT::f64, N->getOperand(0),
	                             DAG.getIntPtrConstant(0, dl));
	    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
	                             MVT::f64, N->getOperand(0),
	                             DAG.getIntPtrConstant(1, dl));

	    // Add the two halves of the long double in round-to-zero mode.
	    SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);

	    // We know the low half is about to be thrown away, so just use something
	    // convenient.
	    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
	                                  FPreg, FPreg));
	    return;
	  }
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:
	    // LowerFP_TO_INT() can only handle f32 and f64.
	    if (N->getOperand(0).getValueType() == MVT::ppcf128)
	      return;
	    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
	    return;
	  }
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Code
	//===----------------------------------------------------------------------===//

	/// Create a call, with no arguments, to the intrinsic \p Id at the builder's
	/// current insertion point and return the new instruction. The declaration
	/// is obtained through Intrinsic::getDeclaration for the module that owns
	/// the function containing the insertion block.
	static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
	  Function *Caller = Builder.GetInsertBlock()->getParent();
	  Module *Mod = Caller->getParent();
	  Function *Decl = Intrinsic::getDeclaration(Mod, Id);
	  return Builder.CreateCall(Decl, {});
	}

	// The mappings for emitLeading/TrailingFence are taken from
	// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
	//
	// Leading fence: a ppc_sync call for sequentially consistent operations, a
	// ppc_lwsync call for any other ordering that is release or stronger, and
	// no fence (nullptr) otherwise.
	Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
	                                                 Instruction *Inst,
	                                                 AtomicOrdering Ord) const {
	  const bool IsSeqCst = Ord == AtomicOrdering::SequentiallyConsistent;

	  // Orderings weaker than release need no leading fence.
	  if (!IsSeqCst && !isReleaseOrStronger(Ord))
	    return nullptr;

	  return callIntrinsic(Builder, IsSeqCst ? Intrinsic::ppc_sync
	                                         : Intrinsic::ppc_lwsync);
	}

	/// Emit the fence that must follow the atomic instruction \p Inst with
	/// ordering \p Ord, or return nullptr when none is needed.
	///
	/// A fence is emitted only when \p Inst has an atomic load and \p Ord is
	/// acquire or stronger: a ppc_cfence call on the loaded value for plain
	/// loads on PPC64, and a ppc_lwsync call in every other case.
	Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
	                                                  Instruction *Inst,
	                                                  AtomicOrdering Ord) const {
	  if (!Inst->hasAtomicLoad() || !isAcquireOrStronger(Ord))
	    return nullptr;

	  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
	  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
	  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
	  const bool UseCFence = isa<LoadInst>(Inst) && Subtarget.isPPC64();
	  if (!UseCFence) {
	    // FIXME: Can use isync for rmw operation.
	    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
	  }

	  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
	  Function *CFence =
	      Intrinsic::getDeclaration(Mod, Intrinsic::ppc_cfence, {Inst->getType()});
	  return Builder.CreateCall(CFence, {Inst});
	}

	/// Expand an atomic read-modify-write pseudo instruction \p MI into a
	/// load-reserve / store-conditional retry loop.
	///
	/// \param MI         the pseudo instruction; operand 0 is the destination
	///                   register, operands 1 and 2 are the two address
	///                   registers and operand 3 is the increment/value register.
	/// \param BB         the block containing \p MI. Everything after \p MI is
	///                   moved into a newly created exit block.
	/// \param AtomicSize access width in bytes: 1, 2, 4 or 8.
	/// \param BinOpcode  opcode combining the increment with the loaded value;
	///                   0 means ATOMIC_SWAP (the increment is stored directly).
	/// \param CmpOpcode  when non-zero, a compare of the increment against the
	///                   loaded value is emitted and an extra block (loop2MBB)
	///                   holds the store.
	/// \param CmpPred    branch predicate on CR0 that leaves the loop without
	///                   storing.
	/// \returns the exit block.
	///
	/// This function does not erase \p MI itself (presumably the caller does --
	/// verify against the call site).
	MachineBasicBlock *
	PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
	unsigned AtomicSize,
	unsigned BinOpcode,
	unsigned CmpOpcode,
	unsigned CmpPred) const {
	// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	// Pick the load-reserve / store-conditional pair matching the access width.
	auto LoadMnemonic = PPC::LDARX;
	auto StoreMnemonic = PPC::STDCX;
	switch (AtomicSize) {
	default:
	llvm_unreachable("Unexpected size of atomic entity");
	case 1:
	LoadMnemonic = PPC::LBARX;
	StoreMnemonic = PPC::STBCX;
	// NOTE(review): the condition checks for partword atomics support; the
	// message text reads as if it described the opposite situation.
	assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
	break;
	case 2:
	LoadMnemonic = PPC::LHARX;
	StoreMnemonic = PPC::STHCX;
	assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
	break;
	case 4:
	LoadMnemonic = PPC::LWARX;
	StoreMnemonic = PPC::STWCX;
	break;
	case 8:
	LoadMnemonic = PPC::LDARX;
	StoreMnemonic = PPC::STDCX;
	break;
	}

	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineFunction *F = BB->getParent();
	MachineFunction::iterator It = ++BB->getIterator();

	unsigned dest = MI.getOperand(0).getReg();
	unsigned ptrA = MI.getOperand(1).getReg();
	unsigned ptrB = MI.getOperand(2).getReg();
	unsigned incr = MI.getOperand(3).getReg();
	DebugLoc dl = MI.getDebugLoc();

	// Create the loop block(s) and the exit block, insert them right after BB,
	// and move the remainder of BB (plus its successor edges) into exitMBB.
	MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *loop2MBB =
	CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
	MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
	F->insert(It, loopMBB);
	if (CmpOpcode)
	F->insert(It, loop2MBB);
	F->insert(It, exitMBB);
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	MachineRegisterInfo &RegInfo = F->getRegInfo();
	// Register that gets stored back: the increment itself for a swap, otherwise
	// a fresh virtual register holding the result of BinOpcode.
	unsigned TmpReg = (!BinOpcode) ? incr :
	RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
	: &PPC::GPRCRegClass);

	// thisMBB:
	// ...
	// fallthrough --> loopMBB
	BB->addSuccessor(loopMBB);

	// loopMBB:
	// l[wd]arx dest, ptr
	// add r0, dest, incr
	// st[wd]cx. r0, ptr
	// bne- loopMBB
	// fallthrough --> exitMBB

	// For max/min...
	// loopMBB:
	// l[wd]arx dest, ptr
	// cmpl?[wd] incr, dest
	// bgt exitMBB
	// loop2MBB:
	// st[wd]cx. dest, ptr
	// bne- loopMBB
	// fallthrough --> exitMBB

	BB = loopMBB;
	BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
	.addReg(ptrA).addReg(ptrB);
	if (BinOpcode)
	BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
	if (CmpOpcode) {
	// Signed comparisons of byte or halfword values must be sign-extended.
	if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
	unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
	BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
	ExtReg).addReg(dest);
	BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
	.addReg(incr).addReg(ExtReg);
	} else
	BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
	.addReg(incr).addReg(dest);

	// Leave the loop without storing when the predicate holds; otherwise
	// continue in loop2MBB, which receives the store emitted below.
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
	BB->addSuccessor(loop2MBB);
	BB->addSuccessor(exitMBB);
	BB = loop2MBB;
	}
	// Store-conditional, branching back to loopMBB on PRED_NE of CR0.
	BuildMI(BB, dl, TII->get(StoreMnemonic))
	.addReg(TmpReg).addReg(ptrA).addReg(ptrB);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
	BB->addSuccessor(loopMBB);
	BB->addSuccessor(exitMBB);

	// exitMBB:
	// ...
	BB = exitMBB;
	return BB;
	}

	/// Expand an 8-bit or 16-bit atomic read-modify-write pseudo instruction.
	///
	/// When the subtarget has partword atomics this simply forwards to
	/// EmitAtomicBinary with a size of 1 or 2. Otherwise the operation is
	/// emulated with a word-sized lwarx/stwcx. loop on the containing aligned
	/// word, using shift and mask registers to isolate the addressed byte or
	/// halfword.
	///
	/// \param MI        the pseudo instruction; operand 0 is the destination
	///                  register, operands 1 and 2 are the two address registers
	///                  and operand 3 is the increment/value register.
	/// \param BB        the block containing \p MI. Everything after \p MI is
	///                  moved into a newly created exit block.
	/// \param is8bit    true for a byte operation, false for a halfword one.
	/// \param BinOpcode opcode combining the increment with the loaded value;
	///                  0 means ATOMIC_SWAP.
	/// \param CmpOpcode when non-zero, a compare is emitted and an extra block
	///                  (loop2MBB) holds the store.
	/// \param CmpPred   branch predicate on CR0 that leaves the loop without
	///                  storing.
	/// \returns the exit block, which starts with the shift that extracts the
	/// old value into the destination register.
	MachineBasicBlock *
	PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI,
	MachineBasicBlock *BB,
	bool is8bit, // operation
	unsigned BinOpcode,
	unsigned CmpOpcode,
	unsigned CmpPred) const {
	// If we support part-word atomic mnemonics, just use them
	if (Subtarget.hasPartwordAtomics())
	return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode,
	CmpOpcode, CmpPred);

	// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	// In 64 bit mode we have to use 64 bits for addresses, even though the
	// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
	// registers without caring whether they're 32 or 64, but here we're
	// doing actual arithmetic on the addresses.
	bool is64bit = Subtarget.isPPC64();
	bool isLittleEndian = Subtarget.isLittleEndian();
	unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;

	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineFunction *F = BB->getParent();
	MachineFunction::iterator It = ++BB->getIterator();

	unsigned dest = MI.getOperand(0).getReg();
	unsigned ptrA = MI.getOperand(1).getReg();
	unsigned ptrB = MI.getOperand(2).getReg();
	unsigned incr = MI.getOperand(3).getReg();
	DebugLoc dl = MI.getDebugLoc();

	// Create the loop block(s) and the exit block, insert them right after BB,
	// and move the remainder of BB (plus its successor edges) into exitMBB.
	MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *loop2MBB =
	CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
	MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
	F->insert(It, loopMBB);
	if (CmpOpcode)
	F->insert(It, loop2MBB);
	F->insert(It, exitMBB);
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	// Scratch registers for the address, shift amount, masks and intermediate
	// values; all use the pointer-width register class.
	MachineRegisterInfo &RegInfo = F->getRegInfo();
	const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
	: &PPC::GPRCRegClass;
	unsigned PtrReg = RegInfo.createVirtualRegister(RC);
	unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
	// On little-endian the raw shift is used directly; on big-endian a separate
	// register receives the xori-adjusted shift computed below.
	unsigned ShiftReg =
	isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC);
	unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
	unsigned MaskReg = RegInfo.createVirtualRegister(RC);
	unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
	unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
	unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
	unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
	unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
	unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
	unsigned Ptr1Reg;
	// For a swap the shifted increment is merged in directly; otherwise a fresh
	// register holds the result of BinOpcode.
	unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);

	// thisMBB:
	// ...
	// fallthrough --> loopMBB
	BB->addSuccessor(loopMBB);

	// The 4-byte load must be aligned, while a char or short may be
	// anywhere in the word. Hence all this nasty bookkeeping code.
	// add ptr1, ptrA, ptrB [copy if ptrA==0]
	// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
	// xori shift, shift1, 24 [16]
	// rlwinm ptr, ptr1, 0, 0, 29
	// slw incr2, incr, shift
	// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
	// slw mask, mask2, shift
	// loopMBB:
	// lwarx tmpDest, ptr
	// add tmp, tmpDest, incr2
	// andc tmp2, tmpDest, mask
	// and tmp3, tmp, mask
	// or tmp4, tmp3, tmp2
	// stwcx. tmp4, ptr
	// bne- loopMBB
	// fallthrough --> exitMBB
	// srw dest, tmpDest, shift
	if (ptrA != ZeroReg) {
	Ptr1Reg = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
	.addReg(ptrA).addReg(ptrB);
	} else {
	Ptr1Reg = ptrB;
	}
	// shift1 = (ptr1 * 8) restricted to the byte/halfword position bits.
	BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
	.addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
	if (!isLittleEndian)
	BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
	.addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
	// ptr = ptr1 with its two low bits cleared (word-aligned address).
	if (is64bit)
	BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
	.addReg(Ptr1Reg).addImm(0).addImm(61);
	else
	BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
	.addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
	BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
	.addReg(incr).addReg(ShiftReg);
	// mask2 = 0xFF for bytes, 0xFFFF for halfwords; mask = mask2 << shift.
	if (is8bit)
	BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
	else {
	BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
	BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535);
	}
	BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
	.addReg(Mask2Reg).addReg(ShiftReg);

	BB = loopMBB;
	BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
	.addReg(ZeroReg).addReg(PtrReg);
	if (BinOpcode)
	BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
	.addReg(Incr2Reg).addReg(TmpDestReg);
	// tmp2 = loaded word with the target lane cleared; tmp3 = new lane value.
	BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
	.addReg(TmpDestReg).addReg(MaskReg);
	BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
	.addReg(TmpReg).addReg(MaskReg);
	if (CmpOpcode) {
	// For unsigned comparisons, we can directly compare the shifted values.
	// For signed comparisons we shift and sign extend.
	unsigned SReg = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg)
	.addReg(TmpDestReg).addReg(MaskReg);
	unsigned ValueReg = SReg;
	unsigned CmpReg = Incr2Reg;
	if (CmpOpcode == PPC::CMPW) {
	ValueReg = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
	.addReg(SReg).addReg(ShiftReg);
	unsigned ValueSReg = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
	.addReg(ValueReg);
	ValueReg = ValueSReg;
	// The signed path compares against the unshifted increment.
	CmpReg = incr;
	}
	BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
	.addReg(CmpReg).addReg(ValueReg);
	// Leave the loop without storing when the predicate holds; otherwise
	// continue in loop2MBB, which receives the merge and store emitted below.
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
	BB->addSuccessor(loop2MBB);
	BB->addSuccessor(exitMBB);
	BB = loop2MBB;
	}
	// Merge the new lane into the untouched bits and attempt the store,
	// branching back to loopMBB on PRED_NE of CR0.
	BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
	.addReg(Tmp3Reg).addReg(Tmp2Reg);
	BuildMI(BB, dl, TII->get(PPC::STWCX))
	.addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
	BB->addSuccessor(loopMBB);
	BB->addSuccessor(exitMBB);

	// exitMBB:
	// ...
	BB = exitMBB;
	// Shift the loaded word right so the old lane value lands in dest.
	BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
	.addReg(ShiftReg);
	return BB;
	}

	/// Expand the SjLj setjmp pseudo instruction \p MI.
	///
	/// Operand 0 of \p MI is the i32 result register and operand 1 is the
	/// register holding the jump buffer address. The block is split into three:
	/// the original block (which stores the TOC and base pointer into the
	/// buffer and yields 1 on the restore path), mainMBB (which stores the link
	/// register into the buffer and yields 0) and sinkMBB (which merges the two
	/// values with a PHI and receives the rest of the original block).
	///
	/// \p MI is erased.
	/// \returns sinkMBB.
	llvm::MachineBasicBlock *
	PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  unsigned DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");
	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  SjLjSetup mainMBB
	  //  bl mainMBB
	  //  v_restore = 1
	  //  b sinkMBB
	  //
	  // mainMBB:
	  //  buf[LabelOffset] = LR
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // Note that the structure of the jmp_buf used here is not compatible
	  // with that used by libc, and is not designed to be. Specifically, it
	  // stores only those 'reserved' registers that LLVM does not otherwise
	  // understand how to spill. Also, by convention, by the time this
	  // intrinsic is called, Clang has already stored the frame address in the
	  // first slot of the buffer and stack address in the third. Following the
	  // X86 target code, we'll store the jump address in the second slot. We also
	  // need to save the TOC pointer (R2) to handle jumps between shared
	  // libraries, and that will be stored in the fourth slot. The thread
	  // identifier (R13) is not affected.

	  // thisMBB:
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
	  const int64_t BPOffset    = 4 * PVT.getStoreSize();

	  // Prepare IP either in reg.
	  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
	  unsigned BufReg = MI.getOperand(1).getReg();

	  // Save the TOC pointer (X2) on 64-bit SVR4.
	  if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
	    setUsesTOCBasePtr(*MBB->getParent());
	    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
	            .addReg(PPC::X2)
	            .addImm(TOCOffset)
	            .addReg(BufReg);
	    MIB.setMemRefs(MMOBegin, MMOEnd);
	  }

	  // Naked functions never have a base pointer, and so we use r1. For all
	  // other functions, this decision must be delayed until during PEI.
	  unsigned BaseReg;
	  if (MF->getFunction().hasFnAttribute(Attribute::Naked))
	    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
	  else
	    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;

	  MIB = BuildMI(*thisMBB, MI, DL,
	                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
	            .addReg(BaseReg)
	            .addImm(BPOffset)
	            .addReg(BufReg);
	  MIB.setMemRefs(MMOBegin, MMOEnd);

	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
	  MIB.addRegMask(TRI->getNoPreservedMask());

	  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);

	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
	          .addMBB(mainMBB);
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);

	  thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
	  thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());

	  // mainMBB:
	  //  mainDstReg = 0
	  MIB =
	      BuildMI(mainMBB, DL,
	              TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);

	  // Store IP. The opcode is selected the same way as for the base pointer
	  // store above.
	  MIB = BuildMI(mainMBB, DL,
	                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
	            .addReg(LabelReg)
	            .addImm(LabelOffset)
	            .addReg(BufReg);

	  MIB.setMemRefs(MMOBegin, MMOEnd);

	  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(PPC::PHI), DstReg)
	    .addReg(mainDstReg).addMBB(mainMBB)
	    .addReg(restoreDstReg).addMBB(thisMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	/// Expand the SjLj longjmp pseudo instruction \p MI.
	///
	/// Operand 0 of \p MI is the register holding the jump buffer address. The
	/// frame pointer, jump target, stack pointer and base pointer are reloaded
	/// from buffer slots 0, 1, 2 and 4 (slot 3 holds the TOC pointer, reloaded
	/// only for 64-bit SVR4), then control is transferred through CTR.
	///
	/// \p MI is erased.
	/// \returns \p MBB; no new blocks are created.
	MachineBasicBlock *
	PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const bool Is64BitPtr = (PVT == MVT::i64);

	  const TargetRegisterClass *RC =
	    Is64BitPtr ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
	  unsigned Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  unsigned FP  = Is64BitPtr ? PPC::X31 : PPC::R31;
	  unsigned SP  = Is64BitPtr ? PPC::X1 : PPC::R1;
	  unsigned BP =
	      Is64BitPtr
	          ? PPC::X30
	          : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
	                                                              : PPC::R30);

	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset    = 2 * PVT.getStoreSize();
	  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
	  const int64_t BPOffset    = 4 * PVT.getStoreSize();

	  unsigned BufReg = MI.getOperand(0).getReg();

	  // Pointer-sized load opcode shared by every reload below.
	  const unsigned LoadOpc = Is64BitPtr ? PPC::LD : PPC::LWZ;

	  // Emit "DestReg = load Offset(BufReg)" before MI, tagged with MI's memory
	  // operands.
	  auto ReloadFromBuf = [&](unsigned DestReg, int64_t Offset) {
	    MachineInstrBuilder MIB =
	        BuildMI(*MBB, MI, DL, TII->get(LoadOpc), DestReg)
	            .addImm(Offset)
	            .addReg(BufReg);
	    MIB.setMemRefs(MMOBegin, MMOEnd);
	  };

	  // Reload FP (the jumped-to function may not have had a
	  // frame pointer, and if so, then its r31 will be restored
	  // as necessary).
	  ReloadFromBuf(FP, 0);

	  // Reload IP
	  ReloadFromBuf(Tmp, LabelOffset);

	  // Reload SP
	  ReloadFromBuf(SP, SPOffset);

	  // Reload BP
	  ReloadFromBuf(BP, BPOffset);

	  // Reload TOC
	  if (Is64BitPtr && Subtarget.isSVR4ABI()) {
	    setUsesTOCBasePtr(*MBB->getParent());
	    ReloadFromBuf(PPC::X2, TOCOffset);
	  }

	  // Jump
	  BuildMI(*MBB, MI, DL,
	          TII->get(Is64BitPtr ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
	  BuildMI(*MBB, MI, DL, TII->get(Is64BitPtr ? PPC::BCTR8 : PPC::BCTR));

	  MI.eraseFromParent();
	  return MBB;
	}

	MachineBasicBlock *
	PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *BB) const {
	if (MI.getOpcode() == TargetOpcode::STACKMAP \|\|
	MI.getOpcode() == TargetOpcode::PATCHPOINT) {
	if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() &&
	MI.getOpcode() == TargetOpcode::PATCHPOINT) {
	// Call lowering should have added an r2 operand to indicate a dependence
	// on the TOC base pointer value. It can't however, because there is no
	// way to mark the dependence as implicit there, and so the stackmap code
	// will confuse it with a regular operand. Instead, add the dependence
	// here.
	setUsesTOCBasePtr(*BB->getParent());
	MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
	}

	return emitPatchPoint(MI, BB);
	}

	if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 \|\|
	MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
	return emitEHSjLjSetJmp(MI, BB);
	} else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 \|\|
	MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
	return emitEHSjLjLongJmp(MI, BB);
	}

	const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	// To "insert" these instructions we actually have to insert their
	// control-flow patterns.
	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineFunction::iterator It = ++BB->getIterator();

	MachineFunction *F = BB->getParent();

	if (MI.getOpcode() == PPC::SELECT_CC_I4 \|\|
	MI.getOpcode() == PPC::SELECT_CC_I8 \|\|
	MI.getOpcode() == PPC::SELECT_I4 \|\| MI.getOpcode() == PPC::SELECT_I8) {
	SmallVector<MachineOperand, 2> Cond;
	if (MI.getOpcode() == PPC::SELECT_CC_I4 \|\|
	MI.getOpcode() == PPC::SELECT_CC_I8)
	Cond.push_back(MI.getOperand(4));
	else
	Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
	Cond.push_back(MI.getOperand(1));

	DebugLoc dl = MI.getDebugLoc();
	TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
	MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
	} else if (MI.getOpcode() == PPC::SELECT_CC_I4 \|\|
	MI.getOpcode() == PPC::SELECT_CC_I8 \|\|
	MI.getOpcode() == PPC::SELECT_CC_F4 \|\|
	MI.getOpcode() == PPC::SELECT_CC_F8 \|\|
	MI.getOpcode() == PPC::SELECT_CC_QFRC \|\|
	MI.getOpcode() == PPC::SELECT_CC_QSRC \|\|
	MI.getOpcode() == PPC::SELECT_CC_QBRC \|\|
	MI.getOpcode() == PPC::SELECT_CC_VRRC \|\|
	MI.getOpcode() == PPC::SELECT_CC_VSFRC \|\|
	MI.getOpcode() == PPC::SELECT_CC_VSSRC \|\|
	MI.getOpcode() == PPC::SELECT_CC_VSRC \|\|
	MI.getOpcode() == PPC::SELECT_I4 \|\|
	MI.getOpcode() == PPC::SELECT_I8 \|\|
	MI.getOpcode() == PPC::SELECT_F4 \|\|
	MI.getOpcode() == PPC::SELECT_F8 \|\|
	MI.getOpcode() == PPC::SELECT_QFRC \|\|
	MI.getOpcode() == PPC::SELECT_QSRC \|\|
	MI.getOpcode() == PPC::SELECT_QBRC \|\|
	MI.getOpcode() == PPC::SELECT_VRRC \|\|
	MI.getOpcode() == PPC::SELECT_VSFRC \|\|
	MI.getOpcode() == PPC::SELECT_VSSRC \|\|
	MI.getOpcode() == PPC::SELECT_VSRC) {
	// The incoming instruction knows the destination vreg to set, the
	// condition code register to branch on, the true/false values to
	// select between, and a branch opcode to use.

	// thisMBB:
	// ...
	// TrueVal = ...
	// cmpTY ccX, r1, r2
	// bCC copy1MBB
	// fallthrough --> copy0MBB
	MachineBasicBlock *thisMBB = BB;
	MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
	DebugLoc dl = MI.getDebugLoc();
	F->insert(It, copy0MBB);
	F->insert(It, sinkMBB);

	// Transfer the remainder of BB and its successor edges to sinkMBB.
	sinkMBB->splice(sinkMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

	// Next, add the true and fallthrough blocks as its successors.
	BB->addSuccessor(copy0MBB);
	BB->addSuccessor(sinkMBB);

	if (MI.getOpcode() == PPC::SELECT_I4 \|\| MI.getOpcode() == PPC::SELECT_I8 \|\|
	MI.getOpcode() == PPC::SELECT_F4 \|\| MI.getOpcode() == PPC::SELECT_F8 \|\|
	MI.getOpcode() == PPC::SELECT_QFRC \|\|
	MI.getOpcode() == PPC::SELECT_QSRC \|\|
	MI.getOpcode() == PPC::SELECT_QBRC \|\|
	MI.getOpcode() == PPC::SELECT_VRRC \|\|
	MI.getOpcode() == PPC::SELECT_VSFRC \|\|
	MI.getOpcode() == PPC::SELECT_VSSRC \|\|
	MI.getOpcode() == PPC::SELECT_VSRC) {
	BuildMI(BB, dl, TII->get(PPC::BC))
	.addReg(MI.getOperand(1).getReg())
	.addMBB(sinkMBB);
	} else {
	unsigned SelectPred = MI.getOperand(4).getImm();
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(SelectPred)
	.addReg(MI.getOperand(1).getReg())
	.addMBB(sinkMBB);
	}

	// copy0MBB:
	// %FalseValue = ...
	// # fallthrough to sinkMBB
	BB = copy0MBB;

	// Update machine-CFG edges
	BB->addSuccessor(sinkMBB);

	// sinkMBB:
	// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
	// ...
	BB = sinkMBB;
	BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
	.addReg(MI.getOperand(3).getReg())
	.addMBB(copy0MBB)
	.addReg(MI.getOperand(2).getReg())
	.addMBB(thisMBB);
	} else if (MI.getOpcode() == PPC::ReadTB) {
	// To read the 64-bit time-base register on a 32-bit target, we read the
	// two halves. Should the counter have wrapped while it was being read, we
	// need to try again.
	// ...
	// readLoop:
	// mfspr Rx,TBU # load from TBU
	// mfspr Ry,TB # load from TB
	// mfspr Rz,TBU # load from TBU
	// cmpw crX,Rx,Rz # check if 'old'='new'
	// bne readLoop # branch if they're not equal
	// ...

	MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
	DebugLoc dl = MI.getDebugLoc();
	F->insert(It, readMBB);
	F->insert(It, sinkMBB);

	// Transfer the remainder of BB and its successor edges to sinkMBB.
	sinkMBB->splice(sinkMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

	BB->addSuccessor(readMBB);
	BB = readMBB;

	MachineRegisterInfo &RegInfo = F->getRegInfo();
	unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
	unsigned LoReg = MI.getOperand(0).getReg();
	unsigned HiReg = MI.getOperand(1).getReg();

	BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
	BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
	BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);

	unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);

	BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
	.addReg(HiReg).addReg(ReadAgainReg);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB);

	BB->addSuccessor(readMBB);
	BB->addSuccessor(sinkMBB);
	} else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
	BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
	BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
	BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
	BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
	BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
	BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
	BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
	BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
	BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
	BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
	BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
	BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
	BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
	BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
	BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
	BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
	BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
	BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);

	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
	BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
	else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
	BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);

	else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
	BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
	else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
	BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
	else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
	BB = EmitAtomicBinary(MI, BB, 4, 0);
	else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
	BB = EmitAtomicBinary(MI, BB, 8, 0);
	else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 \|\|
	MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 \|\|
	(Subtarget.hasPartwordAtomics() &&
	MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) \|\|
	(Subtarget.hasPartwordAtomics() &&
	MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
	bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;

	auto LoadMnemonic = PPC::LDARX;
	auto StoreMnemonic = PPC::STDCX;
	switch (MI.getOpcode()) {
	default:
	llvm_unreachable("Compare and swap of unknown size");
	case PPC::ATOMIC_CMP_SWAP_I8:
	LoadMnemonic = PPC::LBARX;
	StoreMnemonic = PPC::STBCX;
	assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
	break;
	case PPC::ATOMIC_CMP_SWAP_I16:
	LoadMnemonic = PPC::LHARX;
	StoreMnemonic = PPC::STHCX;
	assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
	break;
	case PPC::ATOMIC_CMP_SWAP_I32:
	LoadMnemonic = PPC::LWARX;
	StoreMnemonic = PPC::STWCX;
	break;
	case PPC::ATOMIC_CMP_SWAP_I64:
	LoadMnemonic = PPC::LDARX;
	StoreMnemonic = PPC::STDCX;
	break;
	}
	unsigned dest = MI.getOperand(0).getReg();
	unsigned ptrA = MI.getOperand(1).getReg();
	unsigned ptrB = MI.getOperand(2).getReg();
	unsigned oldval = MI.getOperand(3).getReg();
	unsigned newval = MI.getOperand(4).getReg();
	DebugLoc dl = MI.getDebugLoc();

	MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
	F->insert(It, loop1MBB);
	F->insert(It, loop2MBB);
	F->insert(It, midMBB);
	F->insert(It, exitMBB);
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	// thisMBB:
	// ...
	// fallthrough --> loopMBB
	BB->addSuccessor(loop1MBB);

	// loop1MBB:
	// l[bhwd]arx dest, ptr
	// cmp[wd] dest, oldval
	// bne- midMBB
	// loop2MBB:
	// st[bhwd]cx. newval, ptr
	// bne- loopMBB
	// b exitBB
	// midMBB:
	// st[bhwd]cx. dest, ptr
	// exitBB:
	BB = loop1MBB;
	BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
	.addReg(ptrA).addReg(ptrB);
	BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
	.addReg(oldval).addReg(dest);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
	BB->addSuccessor(loop2MBB);
	BB->addSuccessor(midMBB);

	BB = loop2MBB;
	BuildMI(BB, dl, TII->get(StoreMnemonic))
	.addReg(newval).addReg(ptrA).addReg(ptrB);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
	BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
	BB->addSuccessor(loop1MBB);
	BB->addSuccessor(exitMBB);

	BB = midMBB;
	BuildMI(BB, dl, TII->get(StoreMnemonic))
	.addReg(dest).addReg(ptrA).addReg(ptrB);
	BB->addSuccessor(exitMBB);

	// exitMBB:
	// ...
	BB = exitMBB;
	} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 \|\|
	MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
	// We must use 64-bit registers for addresses when targeting 64-bit,
	// since we're actually doing arithmetic on them. Other registers
	// can be 32-bit.
	bool is64bit = Subtarget.isPPC64();
	bool isLittleEndian = Subtarget.isLittleEndian();
	bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;

	unsigned dest = MI.getOperand(0).getReg();
	unsigned ptrA = MI.getOperand(1).getReg();
	unsigned ptrB = MI.getOperand(2).getReg();
	unsigned oldval = MI.getOperand(3).getReg();
	unsigned newval = MI.getOperand(4).getReg();
	DebugLoc dl = MI.getDebugLoc();

	MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
	F->insert(It, loop1MBB);
	F->insert(It, loop2MBB);
	F->insert(It, midMBB);
	F->insert(It, exitMBB);
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	MachineRegisterInfo &RegInfo = F->getRegInfo();
	const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
	: &PPC::GPRCRegClass;
	unsigned PtrReg = RegInfo.createVirtualRegister(RC);
	unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
	unsigned ShiftReg =
	isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC);
	unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
	unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
	unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
	unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
	unsigned MaskReg = RegInfo.createVirtualRegister(RC);
	unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
	unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
	unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
	unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
	unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
	unsigned Ptr1Reg;
	unsigned TmpReg = RegInfo.createVirtualRegister(RC);
	unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
	// thisMBB:
	// ...
	// fallthrough --> loopMBB
	BB->addSuccessor(loop1MBB);

	// The 4-byte load must be aligned, while a char or short may be
	// anywhere in the word. Hence all this nasty bookkeeping code.
	// add ptr1, ptrA, ptrB [copy if ptrA==0]
	// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
	// xori shift, shift1, 24 [16]
	// rlwinm ptr, ptr1, 0, 0, 29
	// slw newval2, newval, shift
	// slw oldval2, oldval,shift
	// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
	// slw mask, mask2, shift
	// and newval3, newval2, mask
	// and oldval3, oldval2, mask
	// loop1MBB:
	// lwarx tmpDest, ptr
	// and tmp, tmpDest, mask
	// cmpw tmp, oldval3
	// bne- midMBB
	// loop2MBB:
	// andc tmp2, tmpDest, mask
	// or tmp4, tmp2, newval3
	// stwcx. tmp4, ptr
	// bne- loop1MBB
	// b exitBB
	// midMBB:
	// stwcx. tmpDest, ptr
	// exitBB:
	// srw dest, tmpDest, shift
	if (ptrA != ZeroReg) {
	Ptr1Reg = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
	.addReg(ptrA).addReg(ptrB);
	} else {
	Ptr1Reg = ptrB;
	}
	BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
	.addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
	if (!isLittleEndian)
	BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
	.addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
	if (is64bit)
	BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
	.addReg(Ptr1Reg).addImm(0).addImm(61);
	else
	BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
	.addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
	BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
	.addReg(newval).addReg(ShiftReg);
	BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
	.addReg(oldval).addReg(ShiftReg);
	if (is8bit)
	BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
	else {
	BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
	BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
	.addReg(Mask3Reg).addImm(65535);
	}
	BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
	.addReg(Mask2Reg).addReg(ShiftReg);
	BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
	.addReg(NewVal2Reg).addReg(MaskReg);
	BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
	.addReg(OldVal2Reg).addReg(MaskReg);

	BB = loop1MBB;
	BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
	.addReg(ZeroReg).addReg(PtrReg);
	BuildMI(BB, dl, TII->get(PPC::AND),TmpReg)
	.addReg(TmpDestReg).addReg(MaskReg);
	BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
	.addReg(TmpReg).addReg(OldVal3Reg);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
	BB->addSuccessor(loop2MBB);
	BB->addSuccessor(midMBB);

	BB = loop2MBB;
	BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg)
	.addReg(TmpDestReg).addReg(MaskReg);
	BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg)
	.addReg(Tmp2Reg).addReg(NewVal3Reg);
	BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
	.addReg(ZeroReg).addReg(PtrReg);
	BuildMI(BB, dl, TII->get(PPC::BCC))
	.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
	BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
	BB->addSuccessor(loop1MBB);
	BB->addSuccessor(exitMBB);

	BB = midMBB;
	BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
	.addReg(ZeroReg).addReg(PtrReg);
	BB->addSuccessor(exitMBB);

	// exitMBB:
	// ...
	BB = exitMBB;
	BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg)
	.addReg(ShiftReg);
	} else if (MI.getOpcode() == PPC::FADDrtz) {
	// This pseudo performs an FADD with rounding mode temporarily forced
	// to round-to-zero. We emit this via custom inserter since the FPSCR
	// is not modeled at the SelectionDAG level.
	unsigned Dest = MI.getOperand(0).getReg();
	unsigned Src1 = MI.getOperand(1).getReg();
	unsigned Src2 = MI.getOperand(2).getReg();
	DebugLoc dl = MI.getDebugLoc();

	MachineRegisterInfo &RegInfo = F->getRegInfo();
	unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);

	// Save FPSCR value.
	BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);

	// Set rounding mode to round-to-zero.
	BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
	BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);

	// Perform addition.
	BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);

	// Restore FPSCR value.
	BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
	} else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT \|\|
	MI.getOpcode() == PPC::ANDIo_1_GT_BIT \|\|
	MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 \|\|
	MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) {
	unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 \|\|
	MI.getOpcode() == PPC::ANDIo_1_GT_BIT8)
	? PPC::ANDIo8
	: PPC::ANDIo;
	bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT \|\|
	MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8);

	MachineRegisterInfo &RegInfo = F->getRegInfo();
	unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
	&PPC::GPRCRegClass :
	&PPC::G8RCRegClass);

	DebugLoc dl = MI.getDebugLoc();
	BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
	.addReg(MI.getOperand(1).getReg())
	.addImm(1);
	BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
	MI.getOperand(0).getReg())
	.addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
	} else if (MI.getOpcode() == PPC::TCHECK_RET) {
	DebugLoc Dl = MI.getDebugLoc();
	MachineRegisterInfo &RegInfo = F->getRegInfo();
	unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
	BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
	return BB;
	} else {
	llvm_unreachable("Unexpected instr type to insert");
	}

	MI.eraseFromParent(); // The pseudo instruction is gone now.
	return BB;
	}

	//===----------------------------------------------------------------------===//
	// Target Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Pick the number of refinement iterations to apply to a hardware
	/// reciprocal or reciprocal-square-root estimate of type \p VT.
	///
	/// \param VT        Value type of the estimate; only its scalar type matters.
	/// \param Subtarget Queried for hasRecipPrec().
	/// \returns 1 (with hasRecipPrec()) or 3 (without), plus one extra step when
	///          the scalar type is f64.
	static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
	  // Each iteration roughly doubles the number of correct digits because
	  // convergence is quadratic. FRE and FRSQRTE are architected to a minimum
	  // relative accuracy of 2^-5, or 2^-14 when hasRecipPrec() holds. IEEE
	  // float carries 23 digits and double carries 52, hence the extra step.
	  const int BaseSteps = Subtarget.hasRecipPrec() ? 1 : 3;
	  const bool IsDouble = VT.getScalarType() == MVT::f64;
	  return IsDouble ? BaseSteps + 1 : BaseSteps;
	}

	/// Produce a reciprocal-square-root estimate node for \p Operand when the
	/// subtarget has a matching estimate facility for its type.
	///
	/// \param Operand         Value to estimate; its type selects the check.
	/// \param DAG             DAG in which the PPCISD::FRSQRTE node is created.
	/// \param Enabled         Not consulted by this implementation.
	/// \param RefinementSteps In/out: if it arrives as
	///                        ReciprocalEstimate::Unspecified it is replaced by
	///                        getEstimateRefinementSteps(); otherwise untouched.
	/// \param UseOneConstNR   Out: set to true when an estimate is returned.
	/// \param Reciprocal      Not consulted by this implementation.
	/// \returns A PPCISD::FRSQRTE node, or an empty SDValue when the type /
	///          subtarget combination is not supported.
	SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
	                                           int Enabled, int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  EVT VT = Operand.getValueType();
	  const bool HasEstimate =
	      (VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
	      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
	      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
	      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
	      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
	      (VT == MVT::v4f64 && Subtarget.hasQPX());
	  if (!HasEstimate)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

	  UseOneConstNR = true;
	  return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
	}

	/// Produce a reciprocal estimate node for \p Operand when the subtarget has
	/// a matching estimate facility for its type.
	///
	/// \param Operand         Value to estimate; its type selects the check.
	/// \param DAG             DAG in which the PPCISD::FRE node is created.
	/// \param Enabled         Not consulted by this implementation.
	/// \param RefinementSteps In/out: if it arrives as
	///                        ReciprocalEstimate::Unspecified it is replaced by
	///                        getEstimateRefinementSteps(); otherwise untouched.
	/// \returns A PPCISD::FRE node, or an empty SDValue when the type /
	///          subtarget combination is not supported.
	SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  EVT VT = Operand.getValueType();
	  const bool HasEstimate =
	      (VT == MVT::f32 && Subtarget.hasFRES()) ||
	      (VT == MVT::f64 && Subtarget.hasFRE()) ||
	      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
	      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
	      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
	      (VT == MVT::v4f64 && Subtarget.hasQPX());
	  if (!HasEstimate)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
	  return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
	}

	/// Report how many FDIVs sharing one divisor are needed before they are
	/// rewritten as FMULs by the reciprocal.
	///
	/// \returns 2 for the 440, A2, E500mc and E5500 directives, 3 otherwise.
	unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
	  // Note: This functionality is used only when unsafe-fp-math is enabled, and
	  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
	  // enabled for division), this functionality is redundant with the default
	  // combiner logic (once the division -> reciprocal/multiply transformation
	  // has taken place). As a result, this matters more for older cores than for
	  // newer ones.

	  // Embedded cores with only one FP pipeline profit from the rewrite at two
	  // or more FDIVs; generic OOO cores need three or more.
	  const unsigned Directive = Subtarget.getDarwinDirective();
	  const bool UsesLowThreshold =
	      Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
	      Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500;
	  return UsesLowThreshold ? 2 : 3;
	}

	// Peel every "base + constant" layer off Loc. isConsecutiveLSLoc has to cope
	// with adds that have not been collapsed yet, so a whole chain of them is
	// walked here rather than just the outermost one.
	//
	// On return Base names the innermost base reached and Offset has grown by
	// the sum of the constants seen. When Loc is not a base-plus-constant node
	// at all, neither Base nor Offset is modified.
	static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
	                                      int64_t& Offset, SelectionDAG &DAG) {
	  while (DAG.isBaseWithConstantOffset(Loc)) {
	    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();

	    // Step down to the base operand; it may itself be base + constant.
	    Loc = Loc.getOperand(0);
	    Base = Loc;
	  }
	}

	/// Check whether address \p Loc lies exactly \p Dist elements of \p Bytes
	/// bytes away from the base pointer of \p Base.
	///
	/// Three address shapes are tried in order: a pair of frame indices (both
	/// objects must be exactly \p Bytes in size), a shared base plus constant
	/// offsets, and a shared global value plus offsets.
	///
	/// \param Loc   Address being tested.
	/// \param VT    Memory type accessed at \p Loc; its byte size must equal
	///              \p Bytes or the function returns false immediately.
	/// \param Base  Load/store whose base pointer is the reference address.
	/// \param Bytes Size of one element in bytes.
	/// \param Dist  Signed distance, in elements, from \p Base to \p Loc.
	/// \param DAG   DAG used for frame info and address decomposition.
	/// \returns true only when Loc == BaseLoc + Dist * Bytes can be shown.
	static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
	                               unsigned Bytes, int Dist,
	                               SelectionDAG &DAG) {
	  if (VT.getSizeInBits() / 8 != Bytes)
	    return false;

	  // Expected byte displacement, computed in signed 64-bit arithmetic.
	  // Writing `Dist * Bytes` directly converts Dist to unsigned, so a negative
	  // distance would wrap to a large positive value before being widened and
	  // added to the 64-bit offsets below.
	  const int64_t Delta =
	      static_cast<int64_t>(Dist) * static_cast<int64_t>(Bytes);

	  SDValue BaseLoc = Base->getBasePtr();
	  if (Loc.getOpcode() == ISD::FrameIndex) {
	    if (BaseLoc.getOpcode() != ISD::FrameIndex)
	      return false;
	    const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
	    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
	    int FS = MFI.getObjectSize(FI);
	    int BFS = MFI.getObjectSize(BFI);
	    // Both stack objects must be exactly one element in size.
	    if (FS != BFS || FS != (int)Bytes) return false;
	    return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Delta);
	  }

	  // Same underlying base once all constant adds are peeled off?
	  SDValue Base1 = Loc, Base2 = BaseLoc;
	  int64_t Offset1 = 0, Offset2 = 0;
	  getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
	  getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
	  if (Base1 == Base2 && Offset1 == (Offset2 + Delta))
	    return true;

	  // Otherwise, try both addresses as "global + offset".
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const GlobalValue *GV1 = nullptr;
	  const GlobalValue *GV2 = nullptr;
	  Offset1 = 0;
	  Offset2 = 0;
	  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
	  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
	  if (isGA1 && isGA2 && GV1 == GV2)
	    return Offset1 == (Offset2 + Delta);
	  return false;
	}

	// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
	// not enforce equality of the chain operands.
	static bool isConsecutiveLS(SDNode N, LSBaseSDNode Base,
	unsigned Bytes, int Dist,
	SelectionDAG &DAG) {
	if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
	EVT VT = LS->getMemoryVT();
	SDValue Loc = LS->getBasePtr();
	return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
	}

	if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
	EVT VT;
	switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	default: return false;
	case Intrinsic::ppc_qpx_qvlfd:
	case Intrinsic::ppc_qpx_qvlfda:
	VT = MVT::v4f64;
	break;
	case Intrinsic::ppc_qpx_qvlfs:
	case Intrinsic::ppc_qpx_qvlfsa:
	VT = MVT::v4f32;
	break;
	case Intrinsic::ppc_qpx_qvlfcd:
	case Intrinsic::ppc_qpx_qvlfcda:
	VT = MVT::v2f64;
	break;
	case Intrinsic::ppc_qpx_qvlfcs:
	case Intrinsic::ppc_qpx_qvlfcsa:
	VT = MVT::v2f32;
	break;
	case Intrinsic::ppc_qpx_qvlfiwa:
	case Intrinsic::ppc_qpx_qvlfiwz:
	case Intrinsic::ppc_altivec_lvx:
	case Intrinsic::ppc_altivec_lvxl:
	case Intrinsic::ppc_vsx_lxvw4x:
	case Intrinsic::ppc_vsx_lxvw4x_be:
	VT = MVT::v4i32;
	break;
	case Intrinsic::ppc_vsx_lxvd2x:
	case Intrinsic::ppc_vsx_lxvd2x_be:
	VT = MVT::v2f64;
	break;
	case Intrinsic::ppc_altivec_lvebx:
	VT = MVT::i8;
	break;
	case Intrinsic::ppc_altivec_lvehx:
	VT = MVT::i16;
	break;
	case Intrinsic::ppc_altivec_lvewx:
	VT = MVT::i32;
	break;
	}

	return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
	}

	if (N->getOpcode() == ISD::INTRINSIC_VOID) {
	EVT VT;
	switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	default: return false;
	case Intrinsic::ppc_qpx_qvstfd:
	case Intrinsic::ppc_qpx_qvstfda:
	VT = MVT::v4f64;
	break;
	case Intrinsic::ppc_qpx_qvstfs:
	case Intrinsic::ppc_qpx_qvstfsa:
	VT = MVT::v4f32;
	break;
	case Intrinsic::ppc_qpx_qvstfcd:
	case Intrinsic::ppc_qpx_qvstfcda:
	VT = MVT::v2f64;
	break;
	case Intrinsic::ppc_qpx_qvstfcs:
	case Intrinsic::ppc_qpx_qvstfcsa:
	VT = MVT::v2f32;
	break;
	case Intrinsic::ppc_qpx_qvstfiw:
	case Intrinsic::ppc_qpx_qvstfiwa:
	case Intrinsic::ppc_altivec_stvx:
	case Intrinsic::ppc_altivec_stvxl:
	case Intrinsic::ppc_vsx_stxvw4x:
	VT = MVT::v4i32;
	break;
	case Intrinsic::ppc_vsx_stxvd2x:
	VT = MVT::v2f64;
	break;
	case Intrinsic::ppc_vsx_stxvw4x_be:
	VT = MVT::v4i32;
	break;
	case Intrinsic::ppc_vsx_stxvd2x_be:
	VT = MVT::v2f64;
	break;
	case Intrinsic::ppc_altivec_stvebx:
	VT = MVT::i8;
	break;
	case Intrinsic::ppc_altivec_stvehx:
	VT = MVT::i16;
	break;
	case Intrinsic::ppc_altivec_stvewx:
	VT = MVT::i32;
	break;
	}

	return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
	}

	return false;
	}

	// Return true if there is a nearby consecutive load to the one provided
	// (regardless of alignment). We search up and down the chain, looking through
	// token factors and other loads (but nothing else). As a result, a true result
	// indicates that it is safe to create a new consecutive load adjacent to the
	// load provided.
	//
	// "Consecutive" here means isConsecutiveLS() reports a memory node located
	// exactly one element (of LD's memory type store size) after LD.
	static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
	  SDValue Chain = LD->getChain();
	  EVT VT = LD->getMemoryVT();

	  SmallSet<SDNode *, 16> LoadRoots;
	  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
	  SmallSet<SDNode *, 16> Visited;

	  // First, search up the chain, branching to follow all token-factor operands.
	  // If we find a consecutive load, then we're done, otherwise, record all
	  // nodes just above the top-level loads and token factors.
	  while (!Queue.empty()) {
	    SDNode *ChainNext = Queue.pop_back_val();
	    if (!Visited.insert(ChainNext).second)
	      continue;

	    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
	      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
	        return true;

	      if (!Visited.count(ChainLD->getChain().getNode()))
	        Queue.push_back(ChainLD->getChain().getNode());
	    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
	      for (const SDUse &O : ChainNext->ops())
	        if (!Visited.count(O.getNode()))
	          Queue.push_back(O.getNode());
	    } else
	      LoadRoots.insert(ChainNext);
	  }

	  // Second, search down the chain, starting from the top-level nodes recorded
	  // in the first phase. These top-level nodes are the nodes just above all
	  // loads and token factors. Starting with their uses, recursively look through
	  // all loads (just the chain uses) and token factors to find a consecutive
	  // load.
	  Visited.clear();
	  Queue.clear();

	  for (SDNode *Root : LoadRoots) {
	    Queue.push_back(Root);

	    while (!Queue.empty()) {
	      SDNode *LoadRoot = Queue.pop_back_val();
	      if (!Visited.insert(LoadRoot).second)
	        continue;

	      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
	        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
	          return true;

	      // Follow only users that are token factors, or memory nodes whose
	      // chain operand is this node.
	      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
	           UE = LoadRoot->use_end(); UI != UE; ++UI)
	        if (((isa<MemSDNode>(*UI) &&
	              cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
	             UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
	          Queue.push_back(*UI);
	    }
	  }

	  return false;
	}

	/// Build the subtraction-based replacement for a SETCC node once the caller
	/// has proved the rewrite is valid, so that the comparison result lives in a
	/// GPR rather than in CR. Pure code generation; the flags steer the shape of
	/// the emitted nodes.
	///
	/// \param N          the SETCC node being replaced.
	/// \param Size       bit width used for the extension operand and for the
	///                   shift amount (Size - 1).
	/// \param Complement when true, the extracted bit is XOR'ed with 1.
	/// \param Swap       when true, the operands are exchanged before subtracting.
	/// \param DL         debug location attached to every created node.
	/// \param DAG        the DAG in which the nodes are created.
	/// \returns an i1 TRUNCATE of the computed bit.
	static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
	                                     bool Swap, SDLoc &DL, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

	  // Zero extend both compared values to i64. Originally, they must be of a
	  // strictly smaller size.
	  // NOTE(review): ZERO_EXTEND is created here with an additional constant
	  // operand carrying Size; this is preserved as-is - confirm it is intended.
	  auto ExtendOperand = [&](unsigned OpNo) -> SDValue {
	    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(OpNo),
	                       DAG.getConstant(Size, DL, MVT::i32));
	  };
	  SDValue Ext0 = ExtendOperand(0);
	  SDValue Ext1 = ExtendOperand(1);

	  // The condition code decides which value is the minuend.
	  SDValue Minuend = Swap ? Ext1 : Ext0;
	  SDValue Subtrahend = Swap ? Ext0 : Ext1;

	  // Subtract the extended integers.
	  SDValue Difference =
	      DAG.getNode(ISD::SUB, DL, MVT::i64, Minuend, Subtrahend);

	  // Shift the sign bit down to bit 0, zeroing everything else; bit 0 now
	  // carries the outcome of the original comparison.
	  SDValue Result = DAG.getNode(ISD::SRL, DL, MVT::i64, Difference,
	                               DAG.getConstant(Size - 1, DL, MVT::i32));

	  // Flip the bit when the condition code calls for the complement.
	  if (Complement)
	    Result = DAG.getNode(ISD::XOR, DL, MVT::i64, Result,
	                         DAG.getConstant(1, DL, MVT::i64));

	  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Result);
	}

	/// Replace an unsigned-comparison SETCC by a subtraction sequence (see
	/// generateEquivalentSub) when every user of the SETCC is a ZERO_EXTEND and
	/// the compared values are narrower than the largest legal integer type.
	///
	/// \param N   the SETCC node to convert.
	/// \param DCI combiner state; supplies the DAG and the legalization phase.
	/// \returns the replacement value, or an empty SDValue when the conversion
	///          does not apply.
	SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  // The analysis below hinges on the width of the compared integers, so it
	  // is only attempted once all types are legal.
	  if (!DCI.isAfterLegalizeVectorOps())
	    return SDValue();

	  // Every user of the SETCC must zero-extend its value; bail out on the
	  // first user that does anything else.
	  SDNode::use_iterator UseIt = N->use_begin();
	  const SDNode::use_iterator UseEnd = N->use_end();
	  while (UseIt != UseEnd) {
	    if (UseIt->getOpcode() != ISD::ZERO_EXTEND)
	      return SDValue();
	    ++UseIt;
	  }

	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  auto OpSize = N->getOperand(0).getValueSizeInBits();

	  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();

	  // Operands must be strictly narrower than the widest legal integer.
	  if (OpSize >= Size)
	    return SDValue();

	  // Map each supported unsigned predicate onto the helper's flags.
	  bool Complement = false;
	  bool Swap = false;
	  switch (CC) {
	  default:
	    return SDValue();
	  case ISD::SETULT:
	    break;
	  case ISD::SETULE:
	    Complement = true;
	    Swap = true;
	    break;
	  case ISD::SETUGT:
	    Swap = true;
	    break;
	  case ISD::SETUGE:
	    Complement = true;
	    break;
	  }

	  return generateEquivalentSub(N, Size, Complement, Swap, DL, DAG);
	}

	SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	SDLoc dl(N);

	assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
	// If we're tracking CR bits, we need to be careful that we don't have:
	// trunc(binary-ops(zext(x), zext(y)))
	// or
	// trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
	// such that we're unnecessarily moving things into GPRs when it would be
	// better to keep them in CR bits.

	// Note that trunc here can be an actual i1 trunc, or can be the effective
	// truncation that comes from a setcc or select_cc.
	if (N->getOpcode() == ISD::TRUNCATE &&
	N->getValueType(0) != MVT::i1)
	return SDValue();

	if (N->getOperand(0).getValueType() != MVT::i32 &&
	N->getOperand(0).getValueType() != MVT::i64)
	return SDValue();

	if (N->getOpcode() == ISD::SETCC \|\|
	N->getOpcode() == ISD::SELECT_CC) {
	// If we're looking at a comparison, then we need to make sure that the
	// high bits (all except for the first) don't matter the result.
	ISD::CondCode CC =
	cast<CondCodeSDNode>(N->getOperand(
	N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
	unsigned OpBits = N->getOperand(0).getValueSizeInBits();

	if (ISD::isSignedIntSetCC(CC)) {
	if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits \|\|
	DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
	return SDValue();
	} else if (ISD::isUnsignedIntSetCC(CC)) {
	if (!DAG.MaskedValueIsZero(N->getOperand(0),
	APInt::getHighBitsSet(OpBits, OpBits-1)) \|\|
	!DAG.MaskedValueIsZero(N->getOperand(1),
	APInt::getHighBitsSet(OpBits, OpBits-1)))
	return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
	: SDValue());
	} else {
	// This is neither a signed nor an unsigned comparison, just make sure
	// that the high bits are equal.
	KnownBits Op1Known, Op2Known;
	DAG.computeKnownBits(N->getOperand(0), Op1Known);
	DAG.computeKnownBits(N->getOperand(1), Op2Known);

	// We don't really care about what is known about the first bit (if
	// anything), so clear it in all masks prior to comparing them.
	Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
	Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);

	if (Op1Known.Zero != Op2Known.Zero \|\| Op1Known.One != Op2Known.One)
	return SDValue();
	}
	}

	// We now know that the higher-order bits are irrelevant, we just need to
	// make sure that all of the intermediate operations are bit operations, and
	// all inputs are extensions.
	if (N->getOperand(0).getOpcode() != ISD::AND &&
	N->getOperand(0).getOpcode() != ISD::OR &&
	N->getOperand(0).getOpcode() != ISD::XOR &&
	N->getOperand(0).getOpcode() != ISD::SELECT &&
	N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
	N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
	N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
	N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
	N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
	return SDValue();

	if ((N->getOpcode() == ISD::SETCC \|\| N->getOpcode() == ISD::SELECT_CC) &&
	N->getOperand(1).getOpcode() != ISD::AND &&
	N->getOperand(1).getOpcode() != ISD::OR &&
	N->getOperand(1).getOpcode() != ISD::XOR &&
	N->getOperand(1).getOpcode() != ISD::SELECT &&
	N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
	N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
	N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
	N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
	N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
	return SDValue();

	SmallVector<SDValue, 4> Inputs;
	SmallVector<SDValue, 8> BinOps, PromOps;
	SmallPtrSet<SDNode *, 16> Visited;

	for (unsigned i = 0; i < 2; ++i) {
	if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND \|\|
	N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND \|\|
	N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
	N->getOperand(i).getOperand(0).getValueType() == MVT::i1) \|\|
	isa<ConstantSDNode>(N->getOperand(i)))
	Inputs.push_back(N->getOperand(i));
	else
	BinOps.push_back(N->getOperand(i));

	if (N->getOpcode() == ISD::TRUNCATE)
	break;
	}

	// Visit all inputs, collect all binary operations (and, or, xor and
	// select) that are all fed by extensions.
	while (!BinOps.empty()) {
	SDValue BinOp = BinOps.back();
	BinOps.pop_back();

	if (!Visited.insert(BinOp.getNode()).second)
	continue;

	PromOps.push_back(BinOp);

	for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
	// The condition of the select is not promoted.
	if (BinOp.getOpcode() == ISD::SELECT && i == 0)
	continue;
	if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
	continue;

	if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND \|\|
	BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND \|\|
	BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
	BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) \|\|
	isa<ConstantSDNode>(BinOp.getOperand(i))) {
	Inputs.push_back(BinOp.getOperand(i));
	} else if (BinOp.getOperand(i).getOpcode() == ISD::AND \|\|
	BinOp.getOperand(i).getOpcode() == ISD::OR \|\|
	BinOp.getOperand(i).getOpcode() == ISD::XOR \|\|
	BinOp.getOperand(i).getOpcode() == ISD::SELECT \|\|
	BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC \|\|
	BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE \|\|
	BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND \|\|
	BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND \|\|
	BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
	BinOps.push_back(BinOp.getOperand(i));
	} else {
	// We have an input that is not an extension or another binary
	// operation; we'll abort this transformation.
	return SDValue();
	}
	}
	}

	// Make sure that this is a self-contained cluster of operations (which
	// is not quite the same thing as saying that everything has only one
	// use).
	for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
	if (isa<ConstantSDNode>(Inputs[i]))
	continue;

	for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
	UE = Inputs[i].getNode()->use_end();
	UI != UE; ++UI) {
	SDNode User = UI;
	if (User != N && !Visited.count(User))
	return SDValue();

	// Make sure that we're not going to promote the non-output-value
	// operand(s) or SELECT or SELECT_CC.
	// FIXME: Although we could sometimes handle this, and it does occur in
	// practice that one of the condition inputs to the select is also one of
	// the outputs, we currently can't deal with this.
	if (User->getOpcode() == ISD::SELECT) {
	if (User->getOperand(0) == Inputs[i])
	return SDValue();
	} else if (User->getOpcode() == ISD::SELECT_CC) {
	if (User->getOperand(0) == Inputs[i] \|\|
	User->getOperand(1) == Inputs[i])
	return SDValue();
	}
	}
	}

	for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
	for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
	UE = PromOps[i].getNode()->use_end();
	UI != UE; ++UI) {
	SDNode User = UI;
	if (User != N && !Visited.count(User))
	return SDValue();

	// Make sure that we're not going to promote the non-output-value
	// operand(s) or SELECT or SELECT_CC.
	// FIXME: Although we could sometimes handle this, and it does occur in
	// practice that one of the condition inputs to the select is also one of
	// the outputs, we currently can't deal with this.
	if (User->getOpcode() == ISD::SELECT) {
	if (User->getOperand(0) == PromOps[i])
	return SDValue();
	} else if (User->getOpcode() == ISD::SELECT_CC) {
	if (User->getOperand(0) == PromOps[i] \|\|
	User->getOperand(1) == PromOps[i])
	return SDValue();
	}
	}
	}

	// Replace all inputs with the extension operand.
	for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
	// Constants may have users outside the cluster of to-be-promoted nodes,
	// and so we need to replace those as we do the promotions.
	if (isa<ConstantSDNode>(Inputs[i]))
	continue;
	else
	DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
	}

	std::list<HandleSDNode> PromOpHandles;
	for (auto &PromOp : PromOps)
	PromOpHandles.emplace_back(PromOp);

	// Replace all operations (these are all the same, but have a different
	// (i1) return type). DAG.getNode will validate that the types of
	// a binary operator match, so go through the list in reverse so that
	// we've likely promoted both operands first. Any intermediate truncations or
	// extensions disappear.
	while (!PromOpHandles.empty()) {
	SDValue PromOp = PromOpHandles.back().getValue();
	PromOpHandles.pop_back();

	if (PromOp.getOpcode() == ISD::TRUNCATE \|\|
	PromOp.getOpcode() == ISD::SIGN_EXTEND \|\|
	PromOp.getOpcode() == ISD::ZERO_EXTEND \|\|
	PromOp.getOpcode() == ISD::ANY_EXTEND) {
	if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
	PromOp.getOperand(0).getValueType() != MVT::i1) {
	// The operand is not yet ready (see comment below).
	PromOpHandles.emplace_front(PromOp);
	continue;
	}

	SDValue RepValue = PromOp.getOperand(0);
	if (isa<ConstantSDNode>(RepValue))
	RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);

	DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
	continue;
	}

	unsigned C;
	switch (PromOp.getOpcode()) {
	default: C = 0; break;
	case ISD::SELECT: C = 1; break;
	case ISD::SELECT_CC: C = 2; break;
	}

	if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
	PromOp.getOperand(C).getValueType() != MVT::i1) \|\|
	(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
	PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
	// The to-be-promoted operands of this node have not yet been
	// promoted (this should be rare because we're going through the
	// list backward, but if one of the operands has several users in
	// this cluster of to-be-promoted nodes, it is possible).
	PromOpHandles.emplace_front(PromOp);
	continue;
	}

	SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
	PromOp.getNode()->op_end());

	// If there are any constant inputs, make sure they're replaced now.
	for (unsigned i = 0; i < 2; ++i)
	if (isa<ConstantSDNode>(Ops[C+i]))
	Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);

	DAG.ReplaceAllUsesOfValueWith(PromOp,
	DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
	}

	// Now we're left with the initial truncation itself.
	if (N->getOpcode() == ISD::TRUNCATE)
	return N->getOperand(0);

	// Otherwise, this is a comparison. The operands to be compared have just
	// changed type (to i1), but everything else is the same.
	return SDValue(N, 0);
	}

	/// Combine for extension nodes producing i32/i64 whose operand is a cluster of
	/// and/or/xor/select/select_cc nodes fed only by truncations or constants.
	/// When the cluster is self-contained, it is re-created in the result type of
	/// \p N so the values need not pass through the narrower type.
	///
	/// Applies when the operand type is i1 and CR bits are used, or when it is i32
	/// on a PPC64 subtarget.
	///
	/// \param N   the extension node being combined.
	/// \param DCI combiner state; supplies the DAG.
	/// \returns the promoted operand of \p N (masked or shifted when the inputs
	///          are not already known to be suitably extended), or an empty
	///          SDValue when the combine does not apply.
	SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);

	  // If we're tracking CR bits, we need to be careful that we don't have:
	  //   zext(binary-ops(trunc(x), trunc(y)))
	  // or
	  //   zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
	  // such that we're unnecessarily moving things into CR bits that can more
	  // efficiently stay in GPRs. Note that if we're not certain that the high
	  // bits are set as required by the final extension, we still may need to do
	  // some masking to get the proper behavior.

	  // This same functionality is important on PPC64 when dealing with
	  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
	  // the return values of functions. Because it is so similar, it is handled
	  // here as well.

	  if (N->getValueType(0) != MVT::i32 &&
	      N->getValueType(0) != MVT::i64)
	    return SDValue();

	  if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
	        (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
	    return SDValue();

	  if (N->getOperand(0).getOpcode() != ISD::AND &&
	      N->getOperand(0).getOpcode() != ISD::OR &&
	      N->getOperand(0).getOpcode() != ISD::XOR &&
	      N->getOperand(0).getOpcode() != ISD::SELECT &&
	      N->getOperand(0).getOpcode() != ISD::SELECT_CC)
	    return SDValue();

	  // Inputs:  leaf values (truncations, or constants).
	  // BinOps:  worklist of interior nodes still to be explored.
	  // PromOps: interior nodes that will be re-created in the promoted type.
	  SmallVector<SDValue, 4> Inputs;
	  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
	  SmallPtrSet<SDNode *, 16> Visited;

	  // Visit all inputs, collect all binary operations (and, or, xor and
	  // select) that are all fed by truncations.
	  while (!BinOps.empty()) {
	    SDValue BinOp = BinOps.back();
	    BinOps.pop_back();

	    if (!Visited.insert(BinOp.getNode()).second)
	      continue;

	    PromOps.push_back(BinOp);

	    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
	      // The condition of the select is not promoted.
	      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
	        continue;
	      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
	        continue;

	      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
	          isa<ConstantSDNode>(BinOp.getOperand(i))) {
	        Inputs.push_back(BinOp.getOperand(i));
	      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
	                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
	                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
	                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
	                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
	        BinOps.push_back(BinOp.getOperand(i));
	      } else {
	        // We have an input that is not a truncation or another binary
	        // operation; we'll abort this transformation.
	        return SDValue();
	      }
	    }
	  }

	  // The operands of a select that must be truncated when the select is
	  // promoted because the operand is actually part of the to-be-promoted set.
	  // Index 0 tracks operand 0, index 1 tracks operand 1; the mapped EVT is the
	  // operand's original type.
	  DenseMap<SDNode *, EVT> SelectTruncOp[2];

	  // Make sure that this is a self-contained cluster of operations (which
	  // is not quite the same thing as saying that everything has only one
	  // use).
	  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
	    if (isa<ConstantSDNode>(Inputs[i]))
	      continue;

	    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
	                              UE = Inputs[i].getNode()->use_end();
	         UI != UE; ++UI) {
	      SDNode *User = *UI;
	      if (User != N && !Visited.count(User))
	        return SDValue();

	      // If we're going to promote the non-output-value operand(s) of SELECT or
	      // SELECT_CC, record them for truncation.
	      if (User->getOpcode() == ISD::SELECT) {
	        if (User->getOperand(0) == Inputs[i])
	          SelectTruncOp[0].insert(std::make_pair(User,
	                                    User->getOperand(0).getValueType()));
	      } else if (User->getOpcode() == ISD::SELECT_CC) {
	        if (User->getOperand(0) == Inputs[i])
	          SelectTruncOp[0].insert(std::make_pair(User,
	                                    User->getOperand(0).getValueType()));
	        if (User->getOperand(1) == Inputs[i])
	          SelectTruncOp[1].insert(std::make_pair(User,
	                                    User->getOperand(1).getValueType()));
	      }
	    }
	  }

	  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
	    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
	                              UE = PromOps[i].getNode()->use_end();
	         UI != UE; ++UI) {
	      SDNode *User = *UI;
	      if (User != N && !Visited.count(User))
	        return SDValue();

	      // If we're going to promote the non-output-value operand(s) of SELECT or
	      // SELECT_CC, record them for truncation.
	      if (User->getOpcode() == ISD::SELECT) {
	        if (User->getOperand(0) == PromOps[i])
	          SelectTruncOp[0].insert(std::make_pair(User,
	                                    User->getOperand(0).getValueType()));
	      } else if (User->getOpcode() == ISD::SELECT_CC) {
	        if (User->getOperand(0) == PromOps[i])
	          SelectTruncOp[0].insert(std::make_pair(User,
	                                    User->getOperand(0).getValueType()));
	        if (User->getOperand(1) == PromOps[i])
	          SelectTruncOp[1].insert(std::make_pair(User,
	                                    User->getOperand(1).getValueType()));
	      }
	    }
	  }

	  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
	  bool ReallyNeedsExt = false;
	  if (N->getOpcode() != ISD::ANY_EXTEND) {
	    // If all of the inputs are not already sign/zero extended, then
	    // we'll still need to do that at the end.
	    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
	      if (isa<ConstantSDNode>(Inputs[i]))
	        continue;

	      unsigned OpBits =
	        Inputs[i].getOperand(0).getValueSizeInBits();
	      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");

	      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
	           !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
	                                  APInt::getHighBitsSet(OpBits,
	                                                        OpBits-PromBits))) ||
	          (N->getOpcode() == ISD::SIGN_EXTEND &&
	           DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
	             (OpBits-(PromBits-1)))) {
	        ReallyNeedsExt = true;
	        break;
	      }
	    }
	  }

	  // Replace all inputs, either with the truncation operand, or a
	  // truncation or extension to the final output type.
	  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
	    // Constant inputs need to be replaced with the to-be-promoted nodes that
	    // use them because they might have users outside of the cluster of
	    // promoted nodes.
	    if (isa<ConstantSDNode>(Inputs[i]))
	      continue;

	    SDValue InSrc = Inputs[i].getOperand(0);
	    if (Inputs[i].getValueType() == N->getValueType(0))
	      DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
	    else if (N->getOpcode() == ISD::SIGN_EXTEND)
	      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
	        DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
	    else if (N->getOpcode() == ISD::ZERO_EXTEND)
	      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
	        DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
	    else
	      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
	        DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
	  }

	  // Hold the to-be-promoted values through handles while uses are being
	  // replaced below.
	  std::list<HandleSDNode> PromOpHandles;
	  for (auto &PromOp : PromOps)
	    PromOpHandles.emplace_back(PromOp);

	  // Replace all operations (these are all the same, but have a different
	  // (promoted) return type). DAG.getNode will validate that the types of
	  // a binary operator match, so go through the list in reverse so that
	  // we've likely promoted both operands first.
	  while (!PromOpHandles.empty()) {
	    SDValue PromOp = PromOpHandles.back().getValue();
	    PromOpHandles.pop_back();

	    // C is the index of the first value operand to be promoted: SELECT has
	    // one leading condition operand, SELECT_CC has two compared operands.
	    unsigned C;
	    switch (PromOp.getOpcode()) {
	    default:             C = 0; break;
	    case ISD::SELECT:    C = 1; break;
	    case ISD::SELECT_CC: C = 2; break;
	    }

	    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
	         PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
	        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
	         PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
	      // The to-be-promoted operands of this node have not yet been
	      // promoted (this should be rare because we're going through the
	      // list backward, but if one of the operands has several users in
	      // this cluster of to-be-promoted nodes, it is possible).
	      PromOpHandles.emplace_front(PromOp);
	      continue;
	    }

	    // For SELECT and SELECT_CC nodes, we do a similar check for any
	    // to-be-promoted comparison inputs.
	    if (PromOp.getOpcode() == ISD::SELECT ||
	        PromOp.getOpcode() == ISD::SELECT_CC) {
	      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
	           PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
	          (SelectTruncOp[1].count(PromOp.getNode()) &&
	           PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
	        PromOpHandles.emplace_front(PromOp);
	        continue;
	      }
	    }

	    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
	                                PromOp.getNode()->op_end());

	    // If this node has constant inputs, then they'll need to be promoted here.
	    for (unsigned i = 0; i < 2; ++i) {
	      if (!isa<ConstantSDNode>(Ops[C+i]))
	        continue;
	      if (Ops[C+i].getValueType() == N->getValueType(0))
	        continue;

	      if (N->getOpcode() == ISD::SIGN_EXTEND)
	        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
	      else if (N->getOpcode() == ISD::ZERO_EXTEND)
	        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
	      else
	        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
	    }

	    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
	    // truncate them again to the original value type.
	    if (PromOp.getOpcode() == ISD::SELECT ||
	        PromOp.getOpcode() == ISD::SELECT_CC) {
	      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
	      if (SI0 != SelectTruncOp[0].end())
	        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
	      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
	      if (SI1 != SelectTruncOp[1].end())
	        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
	    }

	    DAG.ReplaceAllUsesOfValueWith(PromOp,
	      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
	  }

	  // Now we're left with the initial extension itself.
	  if (!ReallyNeedsExt)
	    return N->getOperand(0);

	  // To zero extend, just mask off everything except for the first bit (in the
	  // i1 case).
	  if (N->getOpcode() == ISD::ZERO_EXTEND)
	    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
	                       DAG.getConstant(APInt::getLowBitsSet(
	                                         N->getValueSizeInBits(0), PromBits),
	                                       dl, N->getValueType(0)));

	  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
	         "Invalid extension type");
	  // Sign extend by shifting the promoted bits to the top and arithmetically
	  // shifting them back down.
	  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
	  SDValue ShiftCst =
	      DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
	  return DAG.getNode(
	      ISD::SRA, dl, N->getValueType(0),
	      DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
	      ShiftCst);
	}

	/// \brief Reduces the number of fp-to-int conversion when building a vector.
	///
	/// If this vector is built out of floating to integer conversions,
	/// transform it to a vector built out of floating point values followed by a
	/// single floating to integer conversion of the vector.
	/// Namely  (build_vector (fptosi $A), (fptosi $B), ...)
	/// becomes (fptosi (build_vector ($A, $B, ...)))
	///
	/// \param N   the BUILD_VECTOR node; operand 0 must be a PPCISD::MFVSR.
	/// \param DCI combiner state; supplies the DAG.
	/// \returns the vector conversion node, or an empty SDValue when the operands
	///          are not all MFVSR nodes of the same conversion kind, or when the
	///          build vector is a splat.
	SDValue PPCTargetLowering::
	combineElementTruncationToVectorTruncation(SDNode *N,
	                                           DAGCombinerInfo &DCI) const {
	  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
	         "Should be called with a BUILD_VECTOR node");

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);

	  SDValue FirstInput = N->getOperand(0);
	  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
	         "The input operand must be an fp-to-int conversion.");

	  // This combine happens after legalization so the fp_to_[su]i nodes are
	  // already converted to PPCISD nodes.
	  unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
	  if (FirstConversion == PPCISD::FCTIDZ ||
	      FirstConversion == PPCISD::FCTIDUZ ||
	      FirstConversion == PPCISD::FCTIWZ ||
	      FirstConversion == PPCISD::FCTIWUZ) {
	    bool IsSplat = true;
	    bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
	      FirstConversion == PPCISD::FCTIWUZ;
	    EVT SrcVT = FirstInput.getOperand(0).getValueType();
	    SmallVector<SDValue, 4> Ops;
	    EVT TargetVT = N->getValueType(0);
	    // Every operand must be an MFVSR of the same conversion opcode as the
	    // first one; also detect whether all operands are the same value.
	    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
	      if (N->getOperand(i).getOpcode() != PPCISD::MFVSR)
	        return SDValue();
	      unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode();
	      if (NextConversion != FirstConversion)
	        return SDValue();
	      if (N->getOperand(i) != FirstInput)
	        IsSplat = false;
	    }

	    // If this is a splat, we leave it as-is since there will be only a single
	    // fp-to-int conversion followed by a splat of the integer. This is better
	    // for 32-bit and smaller ints and neutral for 64-bit ints.
	    if (IsSplat)
	      return SDValue();

	    // Now that we know we have the right type of node, get its operands
	    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
	      SDValue In = N->getOperand(i).getOperand(0);
	      // For 32-bit values, we need to add an FP_ROUND node.
	      // NOTE(review): rounding the conversion source to f32 may change the
	      // converted value if that source is a genuine double - confirm callers
	      // only reach this with values that are exactly representable as f32.
	      if (Is32Bit) {
	        if (In.isUndef())
	          Ops.push_back(DAG.getUNDEF(SrcVT));
	        else {
	          SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
	                                      MVT::f32, In.getOperand(0),
	                                      DAG.getIntPtrConstant(1, dl));
	          Ops.push_back(Trunc);
	        }
	      } else
	        Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
	    }

	    // The "Z" conversions are signed, the "UZ" conversions unsigned.
	    unsigned Opcode;
	    if (FirstConversion == PPCISD::FCTIDZ ||
	        FirstConversion == PPCISD::FCTIWZ)
	      Opcode = ISD::FP_TO_SINT;
	    else
	      Opcode = ISD::FP_TO_UINT;

	    EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
	    SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
	    return DAG.getNode(Opcode, dl, TargetVT, BV);
	  }
	  return SDValue();
	}

	/// \brief Reduce the number of loads when building a vector.
	///
	/// Building a vector out of multiple loads can be converted to a load
	/// of the vector type if the loads are consecutive. If the loads are
	/// consecutive but in descending order, a shuffle is added at the end
	/// to reorder the vector.
	///
	/// \param N   the BUILD_VECTOR node whose operands are loads, or FP_ROUNDs of
	///            extending loads.
	/// \param DAG the DAG in which replacement nodes are created.
	/// \returns a single vector load (optionally followed by a reversing shuffle),
	///          or an empty SDValue when the operands do not form a consecutive
	///          ascending or descending run of loads.
	static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
	         "Should be called with a BUILD_VECTOR node");

	  SDLoc dl(N);
	  bool InputsAreConsecutiveLoads = true;
	  bool InputsAreReverseConsecutive = true;
	  unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8;
	  SDValue FirstInput = N->getOperand(0);
	  bool IsRoundOfExtLoad = false;

	  if (FirstInput.getOpcode() == ISD::FP_ROUND &&
	      FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
	    // The opcode was just checked, so cast<> (which asserts) is the right
	    // tool; dyn_cast<> would hand back a pointer that is never null-checked.
	    LoadSDNode *LD = cast<LoadSDNode>(FirstInput.getOperand(0));
	    IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
	  }
	  // Not a build vector of (possibly fp_rounded) loads.
	  if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD)
	    return SDValue();

	  for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
	    // If any inputs are fp_round(extload), they all must be.
	    if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
	      return SDValue();

	    SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
	      N->getOperand(i);
	    if (NextInput.getOpcode() != ISD::LOAD)
	      return SDValue();

	    // Both values are known to be loads at this point: the previous operand
	    // was validated either before the loop or by the previous iteration.
	    SDValue PreviousInput =
	      IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
	    LoadSDNode *LD1 = cast<LoadSDNode>(PreviousInput);
	    LoadSDNode *LD2 = cast<LoadSDNode>(NextInput);

	    // If any inputs are fp_round(extload), they all must be.
	    if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
	      return SDValue();

	    if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
	      InputsAreConsecutiveLoads = false;
	    if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
	      InputsAreReverseConsecutive = false;

	    // Exit early if the loads are neither consecutive nor reverse consecutive.
	    if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
	      return SDValue();
	  }

	  assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
	         "The loads cannot be both consecutive and reverse consecutive.");

	  SDValue FirstLoadOp =
	    IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
	  SDValue LastLoadOp =
	    IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
	                       N->getOperand(N->getNumOperands()-1);

	  // Both were verified to be loads above; cast<> asserts that invariant.
	  LoadSDNode *LD1 = cast<LoadSDNode>(FirstLoadOp);
	  LoadSDNode *LDL = cast<LoadSDNode>(LastLoadOp);
	  if (InputsAreConsecutiveLoads) {
	    // Ascending order: one vector load starting at the first element.
	    return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
	                       LD1->getBasePtr(), LD1->getPointerInfo(),
	                       LD1->getAlignment());
	  }
	  if (InputsAreReverseConsecutive) {
	    // Descending order: load starting at the last element, then reverse the
	    // element order with a shuffle.
	    SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
	                               LDL->getBasePtr(), LDL->getPointerInfo(),
	                               LDL->getAlignment());
	    SmallVector<int, 16> Ops;
	    for (int i = N->getNumOperands() - 1; i >= 0; i--)
	      Ops.push_back(i);

	    return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
	                                DAG.getUNDEF(N->getValueType(0)), Ops);
	  }
	  return SDValue();
	}

	// Emit the vector_shuffle that moves the extracted elements of Input into
	// the positions given by the CorrectElems encoding, then wrap the shuffled
	// vector in a PPCISD::SExtVElems node of N's result type.
	//
	// Elems and CorrectElems hold one byte per operand of N, consumed from the
	// least significant byte upwards. Within each byte the low nibble is used on
	// little-endian targets and the high nibble on big-endian targets.
	static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
	                                      SDValue Input, uint64_t Elems,
	                                      uint64_t CorrectElems) {
	  SDLoc dl(N);

	  EVT InputVT = Input.getValueType();
	  // Start with an all-undef (-1) mask and fill in only the lanes we need.
	  SmallVector<int, 16> Mask(InputVT.getVectorNumElements(), -1);

	  // Select which nibble of each encoded byte applies to this endianness.
	  const unsigned NibbleShift = DAG.getDataLayout().isLittleEndian() ? 0 : 4;

	  // For every operand, route the source lane (from Elems) to the lane the
	  // instruction expects (from CorrectElems), then advance to the next byte.
	  for (unsigned OpIdx = 0, NumOps = N->getNumOperands(); OpIdx != NumOps;
	       ++OpIdx) {
	    uint64_t DstLane = (CorrectElems >> NibbleShift) & 0xF;
	    uint64_t SrcLane = (Elems >> NibbleShift) & 0xF;
	    Mask[DstLane] = SrcLane;
	    CorrectElems >>= 8;
	    Elems >>= 8;
	  }

	  SDValue Shuffled =
	      DAG.getVectorShuffle(InputVT, dl, Input, DAG.getUNDEF(InputVT), Mask);

	  return DAG.getNode(PPCISD::SExtVElems, dl, N->getValueType(0), Shuffled);
	}

	// Match a build_vector whose operands are all sign-extended
	// extract_vector_elt nodes reading constant indices of one common input
	// vector. When the extracted indices are not the ones the vector
	// sign-extend instructions read, emit a vector_shuffle that fixes them up
	// and a PPCISD::SExtVElems node (see addShuffleForVecExtend). Returns an
	// empty SDValue when the pattern does not match or no shuffle is needed.
	static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
	  // Indices read by the vector sign-extend instructions for each
	  // source->destination element width, encoded one byte per result element.
	  // The low nibble of each byte is the little-endian index and the high
	  // nibble is the big-endian index.
	  // Example: 0x3074B8FC (byte->word)
	  //   LE indices: 0x0, 0x4, 0x8, 0xC
	  //   BE indices: 0x3, 0x7, 0xB, 0xF
	  // Example: 0x000070F8 (byte->doubleword)
	  //   LE indices: 0x0, 0x8
	  //   BE indices: 0x7, 0xF
	  static const uint64_t TargetElems[] = {
	      0x3074B8FC, // b->w
	      0x000070F8, // b->d
	      0x10325476, // h->w
	      0x00003074, // h->d
	      0x00001032, // w->d
	  };

	  const bool IsLE = DAG.getDataLayout().isLittleEndian();

	  // Accumulated encoding of the indices actually extracted (same layout as
	  // TargetElems, first operand in the most significant used byte).
	  uint64_t Elems = 0;
	  // The single vector all operands must be extracted from.
	  SDValue Input;

	  // Returns true if Op is (sign_extend (extract_vector_elt Input, C)) and
	  // records C in Elems. The first matching operand fixes Input.
	  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
	    if (!Op || Op.getOpcode() != ISD::SIGN_EXTEND)
	      return false;

	    SDValue Extract = Op.getOperand(0);
	    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return false;

	    auto *IdxNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
	    if (!IdxNode)
	      return false;

	    SDValue SrcVec = Extract.getOperand(0);
	    if (!Input)
	      Input = SrcVec;
	    else if (Input != SrcVec)
	      return false;

	    // Store the index in the nibble that corresponds to the endianness.
	    int Index = IdxNode->getZExtValue();
	    Elems = Elems << 8;
	    Elems |= IsLE ? Index : Index << 4;
	    return true;
	  };

	  // Every operand must be a sign-extended extract from the same vector.
	  for (unsigned OpNo = 0, NumOps = N->getNumOperands(); OpNo != NumOps;
	       ++OpNo)
	    if (!isSExtOfVecExtract(N->getOperand(OpNo)))
	      return SDValue();

	  // The sum of source and destination element widths identifies the
	  // conversion and therefore the TargetElems entry.
	  int InputSize = Input.getValueType().getScalarSizeInBits();
	  int OutputSize = N->getValueType(0).getScalarSizeInBits();
	  int TgtElemArrayIdx;
	  switch (InputSize + OutputSize) {
	  case 40: // 8 + 32: b->w
	    TgtElemArrayIdx = 0;
	    break;
	  case 72: // 8 + 64: b->d
	    TgtElemArrayIdx = 1;
	    break;
	  case 48: // 16 + 32: h->w
	    TgtElemArrayIdx = 2;
	    break;
	  case 80: // 16 + 64: h->d
	    TgtElemArrayIdx = 3;
	    break;
	  case 96: // 32 + 64: w->d
	    TgtElemArrayIdx = 4;
	    break;
	  default:
	    return SDValue();
	  }

	  // Keep only the nibbles relevant for this endianness.
	  uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
	  if (IsLE)
	    CorrectElems &= 0x0F0F0F0F0F0F0F0F;
	  else
	    CorrectElems &= 0xF0F0F0F0F0F0F0F0;

	  // If the vector extract indices are not correct, add the appropriate
	  // vector_shuffle. Regular lowering will catch cases where a shuffle is
	  // not needed.
	  if (Elems == CorrectElems)
	    return SDValue();

	  return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
	}

	// Target-specific DAG combines for BUILD_VECTOR nodes. Only active when
	// the subtarget has VSX. The following are tried in order:
	//   1. If the first operand is a PPCISD::MFVSR, try
	//      combineElementTruncationToVectorTruncation.
	//   2. combineBVOfConsecutiveLoads.
	//   3. On subtargets with P9 Altivec, combineBVOfVecSExt.
	//   4. A v2f64 build_vector of [su]int_to_fp of elements (0,1) or (2,3) of
	//      one source vector becomes PPCISD::[SU]INT_VEC_TO_FP on that vector.
	// Returns an empty SDValue when no combine applies.
	SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
	         "Should be called with a BUILD_VECTOR node");

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);

	  if (!Subtarget.hasVSX())
	    return SDValue();

	  // The target independent DAG combiner will leave a build_vector of
	  // float-to-int conversions intact. We can generate MUCH better code for
	  // a float-to-int conversion of a vector of floats.
	  SDValue FirstInput = N->getOperand(0);
	  if (FirstInput.getOpcode() == PPCISD::MFVSR) {
	    SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
	    if (Reduced)
	      return Reduced;
	  }

	  // If we're building a vector out of consecutive loads, just load that
	  // vector type.
	  SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
	  if (Reduced)
	    return Reduced;

	  // If we're building a vector out of extended elements from another vector
	  // we have P9 vector integer extend instructions.
	  if (Subtarget.hasP9Altivec()) {
	    Reduced = combineBVOfVecSExt(N, DAG);
	    if (Reduced)
	      return Reduced;
	  }

	  if (N->getValueType(0) != MVT::v2f64)
	    return SDValue();

	  // Looking for:
	  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
	  // Both operands must be the same kind of int-to-fp conversion.
	  SDValue SecondInput = N->getOperand(1);
	  if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
	      FirstInput.getOpcode() != ISD::UINT_TO_FP)
	    return SDValue();
	  if (SecondInput.getOpcode() != ISD::SINT_TO_FP &&
	      SecondInput.getOpcode() != ISD::UINT_TO_FP)
	    return SDValue();
	  if (FirstInput.getOpcode() != SecondInput.getOpcode())
	    return SDValue();

	  SDValue Ext1 = FirstInput.getOperand(0);
	  SDValue Ext2 = SecondInput.getOperand(0);
	  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
	  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
	  if (!Ext1Op || !Ext2Op)
	    return SDValue();

	  // Both extracts must produce i32 AND read from the same source vector.
	  // The replacement node below converts elements of a single vector
	  // (Ext1's source), so folding extracts from two different vectors, or
	  // extracts of a different width, would produce wrong results. All three
	  // conditions therefore have to reject independently.
	  if (Ext1.getValueType() != MVT::i32 ||
	      Ext2.getValueType() != MVT::i32 ||
	      Ext1.getOperand(0) != Ext2.getOperand(0))
	    return SDValue();

	  // Only the element pairs (0,1) and (2,3) are handled. The operand passed
	  // to the node depends on endianness.
	  int FirstElem = Ext1Op->getZExtValue();
	  int SecondElem = Ext2Op->getZExtValue();
	  int SubvecIdx;
	  if (FirstElem == 0 && SecondElem == 1)
	    SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
	  else if (FirstElem == 2 && SecondElem == 3)
	    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
	  else
	    return SDValue();

	  SDValue SrcVec = Ext1.getOperand(0);
	  auto NodeType = (SecondInput.getOpcode() == ISD::SINT_TO_FP) ?
	    PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
	  return DAG.getNode(NodeType, dl, MVT::v2f64,
	                     SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
	}

	// Combine for SINT_TO_FP / UINT_TO_FP nodes producing f32 or f64.
	// Two patterns are handled:
	//   * (P9 vector + P9 Altivec) the integer operand is an i8/i16 LOAD: the
	//     value is loaded with PPCISD::LXSIZX, sign-extended with
	//     PPCISD::VEXTS for signed conversions, and then converted.
	//   * the integer operand is itself an FP_TO_SINT / FP_TO_UINT: the
	//     round trip is done with FCTID[U]Z followed by FCFID[U][S].
	// Returns an empty SDValue when neither pattern applies.
	SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
	                                              DAGCombinerInfo &DCI) const {
	  assert((N->getOpcode() == ISD::SINT_TO_FP ||
	          N->getOpcode() == ISD::UINT_TO_FP) &&
	         "Need an int -> FP conversion node here");

	  if (useSoftFloat() || !Subtarget.has64BitSupport())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);
	  SDValue Op(N, 0);

	  const EVT ResVT = Op.getValueType();
	  const SDValue IntSrc = Op.getOperand(0);
	  const EVT IntVT = IntSrc.getValueType();
	  const bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;

	  // Don't handle ppc_fp128 here or i1 conversions.
	  if (ResVT != MVT::f32 && ResVT != MVT::f64)
	    return SDValue();
	  if (IntVT == MVT::i1)
	    return SDValue();

	  // Conversion fed directly by a byte or halfword load.
	  const bool SubWordLoad = IntSrc.getOpcode() == ISD::LOAD &&
	                           (IntVT == MVT::i8 || IntVT == MVT::i16);
	  if (SubWordLoad && Subtarget.hasP9Vector() && Subtarget.hasP9Altivec()) {
	    const bool DstDouble = ResVT == MVT::f64;

	    unsigned ConvOp;
	    if (IsSigned)
	      ConvOp = DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS;
	    else
	      ConvOp = DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS;

	    // Width of the loaded integer in bytes.
	    SDValue WidthConst =
	        DAG.getIntPtrConstant(IntVT == MVT::i8 ? 1 : 2, dl, false);
	    LoadSDNode *LDN = cast<LoadSDNode>(IntSrc.getNode());
	    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
	    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
	                                         DAG.getVTList(MVT::f64, MVT::Other),
	                                         Ops, MVT::i8, LDN->getMemOperand());

	    // For signed conversion, we need to sign-extend the value in the VSR.
	    SDValue ConvInput = Ld;
	    if (IsSigned) {
	      SDValue ExtOps[] = { Ld, WidthConst };
	      ConvInput = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
	    }
	    return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32,
	                       ConvInput);
	  }

	  // For i32 intermediate values, unfortunately, the conversion functions
	  // leave the upper 32 bits of the value undefined. Within the set of
	  // scalar instructions, we have no method for zero- or sign-extending the
	  // value. Thus, we cannot handle i32 intermediate values here.
	  if (IntVT == MVT::i32)
	    return SDValue();

	  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
	         "UINT_TO_FP is supported only with FPCVT");

	  // If we have FCFIDS, then use it when converting to single-precision.
	  // Otherwise, convert to double-precision and then round.
	  const bool DirectToSingle = Subtarget.hasFPCVT() && ResVT == MVT::f32;
	  unsigned FCFOp;
	  if (DirectToSingle)
	    FCFOp = IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS;
	  else
	    FCFOp = IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU;
	  MVT FCFTy = DirectToSingle ? MVT::f32 : MVT::f64;

	  // If we're converting from a float, to an int, and back to a float again,
	  // then we don't need the store/load pair at all.
	  const unsigned SrcOpc = IntSrc.getOpcode();
	  const bool IsFPToInt =
	      SrcOpc == ISD::FP_TO_SINT ||
	      (SrcOpc == ISD::FP_TO_UINT && Subtarget.hasFPCVT());
	  if (!IsFPToInt)
	    return SDValue();

	  SDValue Src = IntSrc.getOperand(0);
	  if (Src.getValueType() == MVT::f32) {
	    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
	    DCI.AddToWorklist(Src.getNode());
	  } else if (Src.getValueType() != MVT::f64) {
	    // Make sure that we don't pick up a ppc_fp128 source value.
	    return SDValue();
	  }

	  unsigned FCTOp = PPCISD::FCTIDUZ;
	  if (SrcOpc == ISD::FP_TO_SINT)
	    FCTOp = PPCISD::FCTIDZ;

	  SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
	  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);

	  // Without FPCVT the conversion was done in double precision; round the
	  // result down to the requested single-precision type.
	  if (ResVT == MVT::f32 && !Subtarget.hasFPCVT()) {
	    FP = DAG.getNode(ISD::FP_ROUND, dl,
	                     MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
	    DCI.AddToWorklist(FP.getNode());
	  }

	  return FP;
	}

	// expandVSXLoadForLE - Rewrite a VSX vector load (an ISD::LOAD or an
	// INTRINSIC_W_CHAIN coming from a builtin) as PPCISD::LXVD2X followed by
	// PPCISD::XXSWAPD. Returns an empty SDValue when the load is left alone.
	SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
	                                              DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);
	  SDValue Chain;
	  SDValue Base;
	  MachineMemOperand *MMO;

	  const unsigned Opc = N->getOpcode();
	  if (Opc == ISD::LOAD) {
	    LoadSDNode *LD = cast<LoadSDNode>(N);
	    Chain = LD->getChain();
	    Base = LD->getBasePtr();
	    MMO = LD->getMemOperand();
	    // If the MMO suggests this isn't a load of a full vector, leave
	    // things alone. For a built-in, we have to make the change for
	    // correctness, so if there is a size problem that will be a bug.
	    if (MMO->getSize() < 16)
	      return SDValue();
	  } else if (Opc == ISD::INTRINSIC_W_CHAIN) {
	    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
	    Chain = Intrin->getChain();
	    // Intrin->getBasePtr() doesn't get us what we want here; the address
	    // is operand 2 of the intrinsic node.
	    Base = Intrin->getOperand(2);
	    MMO = Intrin->getMemOperand();
	  } else {
	    llvm_unreachable("Unexpected opcode for little endian VSX load");
	  }

	  MVT VecTy = N->getValueType(0).getSimpleVT();

	  // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
	  // aligned and the type is a vector with elements up to 4 bytes.
	  const bool IsAligned16 = (MMO->getAlignment() % 16) == 0;
	  if (Subtarget.needsSwapsForVSXMemOps() && IsAligned16 &&
	      VecTy.getScalarSizeInBits() <= 32)
	    return SDValue();

	  // The load itself is always performed as v2f64.
	  SDValue LoadOps[] = { Chain, Base };
	  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
	                                         DAG.getVTList(MVT::v2f64, MVT::Other),
	                                         LoadOps, MVT::v2f64, MMO);
	  DCI.AddToWorklist(Load.getNode());

	  // Swap the two doublewords; the swap is chained after the load.
	  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
	                             DAG.getVTList(MVT::v2f64, MVT::Other),
	                             Load.getValue(1), Load);
	  DCI.AddToWorklist(Swap.getNode());

	  if (VecTy == MVT::v2f64)
	    return Swap;

	  // Add a bitcast if the resulting load type doesn't match v2f64.
	  SDValue Cast = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
	  DCI.AddToWorklist(Cast.getNode());
	  // Package {bitcast value, swap's chain} to match Load's shape.
	  return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
	                     Cast, Swap.getValue(1));
	}

	// expandVSXStoreForLE - Rewrite a VSX vector store (an ISD::STORE or an
	// INTRINSIC_VOID coming from a builtin) as PPCISD::XXSWAPD followed by
	// PPCISD::STXVD2X. Returns an empty SDValue when the store is left alone.
	SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
	                                               DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);
	  SDValue Chain;
	  SDValue Base;
	  unsigned SrcOpnd;
	  MachineMemOperand *MMO;

	  const unsigned Opc = N->getOpcode();
	  if (Opc == ISD::STORE) {
	    StoreSDNode *ST = cast<StoreSDNode>(N);
	    Chain = ST->getChain();
	    Base = ST->getBasePtr();
	    MMO = ST->getMemOperand();
	    SrcOpnd = 1;
	    // If the MMO suggests this isn't a store of a full vector, leave
	    // things alone. For a built-in, we have to make the change for
	    // correctness, so if there is a size problem that will be a bug.
	    if (MMO->getSize() < 16)
	      return SDValue();
	  } else if (Opc == ISD::INTRINSIC_VOID) {
	    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
	    Chain = Intrin->getChain();
	    // Intrin->getBasePtr() oddly does not get what we want; the address
	    // is operand 3 and the stored value operand 2 of the intrinsic node.
	    Base = Intrin->getOperand(3);
	    MMO = Intrin->getMemOperand();
	    SrcOpnd = 2;
	  } else {
	    llvm_unreachable("Unexpected opcode for little endian VSX store");
	  }

	  SDValue Src = N->getOperand(SrcOpnd);
	  MVT VecTy = Src.getValueType().getSimpleVT();

	  // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
	  // aligned and the type is a vector with elements up to 4 bytes.
	  const bool IsAligned16 = (MMO->getAlignment() % 16) == 0;
	  if (Subtarget.needsSwapsForVSXMemOps() && IsAligned16 &&
	      VecTy.getScalarSizeInBits() <= 32)
	    return SDValue();

	  // All stores are done as v2f64, with a bitcast where needed.
	  if (VecTy != MVT::v2f64) {
	    Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
	    DCI.AddToWorklist(Src.getNode());
	  }

	  // Swap the two doublewords before storing.
	  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
	                             DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
	  DCI.AddToWorklist(Swap.getNode());

	  // The store is chained after the swap.
	  SDValue StoreOps[] = { Swap.getValue(1), Swap, Base };
	  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
	                                          DAG.getVTList(MVT::Other),
	                                          StoreOps, VecTy, MMO);
	  DCI.AddToWorklist(Store.getNode());
	  return Store;
	}

	SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	SDLoc dl(N);
	switch (N->getOpcode()) {
	default: break;
	case ISD::SHL:
	return combineSHL(N, DCI);
	case ISD::SRA:
	return combineSRA(N, DCI);
	case ISD::SRL:
	return combineSRL(N, DCI);
	case PPCISD::SHL:
	if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
	return N->getOperand(0);
	break;
	case PPCISD::SRL:
	if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
	return N->getOperand(0);
	break;
	case PPCISD::SRA:
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
	if (C->isNullValue() \|\| // 0 >>s V -> 0.
	C->isAllOnesValue()) // -1 >>s V -> -1.
	return N->getOperand(0);
	}
	break;
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	return DAGCombineExtBoolTrunc(N, DCI);
	case ISD::TRUNCATE:
	case ISD::SETCC:
	case ISD::SELECT_CC:
	return DAGCombineTruncBoolExt(N, DCI);
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	return combineFPToIntToFP(N, DCI);
	case ISD::STORE: {
	EVT Op1VT = N->getOperand(1).getValueType();
	bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) \|\|
	(Subtarget.hasP9Vector() && (Op1VT == MVT::i8 \|\| Op1VT == MVT::i16));

	// Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
	if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
	N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
	ValidTypeForStoreFltAsInt &&
	N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
	SDValue Val = N->getOperand(1).getOperand(0);
	if (Val.getValueType() == MVT::f32) {
	Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
	DCI.AddToWorklist(Val.getNode());
	}
	Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
	DCI.AddToWorklist(Val.getNode());

	if (Op1VT == MVT::i32) {
	SDValue Ops[] = {
	N->getOperand(0), Val, N->getOperand(2),
	DAG.getValueType(N->getOperand(1).getValueType())
	};

	Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
	DAG.getVTList(MVT::Other), Ops,
	cast<StoreSDNode>(N)->getMemoryVT(),
	cast<StoreSDNode>(N)->getMemOperand());
	} else {
	unsigned WidthInBytes =
	N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
	SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);

	SDValue Ops[] = {
	N->getOperand(0), Val, N->getOperand(2), WidthConst,
	DAG.getValueType(N->getOperand(1).getValueType())
	};
	Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
	DAG.getVTList(MVT::Other), Ops,
	cast<StoreSDNode>(N)->getMemoryVT(),
	cast<StoreSDNode>(N)->getMemOperand());
	}

	DCI.AddToWorklist(Val.getNode());
	return Val;
	}

	// Turn STORE (BSWAP) -> sthbrx/stwbrx.
	if (cast<StoreSDNode>(N)->isUnindexed() &&
	N->getOperand(1).getOpcode() == ISD::BSWAP &&
	N->getOperand(1).getNode()->hasOneUse() &&
	(N->getOperand(1).getValueType() == MVT::i32 \|\|
	N->getOperand(1).getValueType() == MVT::i16 \|\|
	(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
	N->getOperand(1).getValueType() == MVT::i64))) {
	SDValue BSwapOp = N->getOperand(1).getOperand(0);
	// Do an any-extend to 32-bits if this is a half-word input.
	if (BSwapOp.getValueType() == MVT::i16)
	BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);

	// If the type of BSWAP operand is wider than stored memory width
	// it need to be shifted to the right side before STBRX.
	EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
	if (Op1VT.bitsGT(mVT)) {
	int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
	BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
	DAG.getConstant(Shift, dl, MVT::i32));
	// Need to truncate if this is a bswap of i64 stored as i32/i16.
	if (Op1VT == MVT::i64)
	BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
	}

	SDValue Ops[] = {
	N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
	};
	return
	DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
	Ops, cast<StoreSDNode>(N)->getMemoryVT(),
	cast<StoreSDNode>(N)->getMemOperand());
	}

	// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
	// So it can increase the chance of CSE constant construction.
	EVT VT = N->getOperand(1).getValueType();
	if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
	isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
	// Need to sign-extended to 64-bits to handle negative values.
	EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
	uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
	MemVT.getSizeInBits());
	SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);

	// DAG.getTruncStore() can't be used here because it doesn't accept
	// the general (base + offset) addressing mode.
	// So we use UpdateNodeOperands and setTruncatingStore instead.
	DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
	N->getOperand(3));
	cast<StoreSDNode>(N)->setTruncatingStore(true);
	return SDValue(N, 0);
	}

	// For little endian, VSX stores require generating xxswapd/lxvd2x.
	// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
	if (VT.isSimple()) {
	MVT StoreVT = VT.getSimpleVT();
	if (Subtarget.needsSwapsForVSXMemOps() &&
	(StoreVT == MVT::v2f64 \|\| StoreVT == MVT::v2i64 \|\|
	StoreVT == MVT::v4f32 \|\| StoreVT == MVT::v4i32))
	return expandVSXStoreForLE(N, DCI);
	}
	break;
	}
	case ISD::LOAD: {
	LoadSDNode *LD = cast<LoadSDNode>(N);
	EVT VT = LD->getValueType(0);

	// For little endian, VSX loads require generating lxvd2x/xxswapd.
	// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
	if (VT.isSimple()) {
	MVT LoadVT = VT.getSimpleVT();
	if (Subtarget.needsSwapsForVSXMemOps() &&
	(LoadVT == MVT::v2f64 \|\| LoadVT == MVT::v2i64 \|\|
	LoadVT == MVT::v4f32 \|\| LoadVT == MVT::v4i32))
	return expandVSXLoadForLE(N, DCI);
	}

	// We sometimes end up with a 64-bit integer load, from which we extract
	// two single-precision floating-point numbers. This happens with
	// std::complex<float>, and other similar structures, because of the way we
	// canonicalize structure copies. However, if we lack direct moves,
	// then the final bitcasts from the extracted integer values to the
	// floating-point numbers turn into store/load pairs. Even with direct moves,
	// just loading the two floating-point numbers is likely better.
	auto ReplaceTwoFloatLoad = [&]() {
	if (VT != MVT::i64)
	return false;

	if (LD->getExtensionType() != ISD::NON_EXTLOAD \|\|
	LD->isVolatile())
	return false;

	// We're looking for a sequence like this:
	// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
	// t16: i64 = srl t13, Constant:i32<32>
	// t17: i32 = truncate t16
	// t18: f32 = bitcast t17
	// t19: i32 = truncate t13
	// t20: f32 = bitcast t19

	if (!LD->hasNUsesOfValue(2, 0))
	return false;

	auto UI = LD->use_begin();
	while (UI.getUse().getResNo() != 0) ++UI;
	SDNode Trunc = UI++;
	while (UI.getUse().getResNo() != 0) ++UI;
	SDNode RightShift = UI;
	if (Trunc->getOpcode() != ISD::TRUNCATE)
	std::swap(Trunc, RightShift);

	if (Trunc->getOpcode() != ISD::TRUNCATE \|\|
	Trunc->getValueType(0) != MVT::i32 \|\|
	!Trunc->hasOneUse())
	return false;
	if (RightShift->getOpcode() != ISD::SRL \|\|
	!isa<ConstantSDNode>(RightShift->getOperand(1)) \|\|
	RightShift->getConstantOperandVal(1) != 32 \|\|
	!RightShift->hasOneUse())
	return false;

	SDNode Trunc2 = RightShift->use_begin();
	if (Trunc2->getOpcode() != ISD::TRUNCATE \|\|
	Trunc2->getValueType(0) != MVT::i32 \|\|
	!Trunc2->hasOneUse())
	return false;

	SDNode Bitcast = Trunc->use_begin();
	SDNode Bitcast2 = Trunc2->use_begin();

	if (Bitcast->getOpcode() != ISD::BITCAST \|\|
	Bitcast->getValueType(0) != MVT::f32)
	return false;
	if (Bitcast2->getOpcode() != ISD::BITCAST \|\|
	Bitcast2->getValueType(0) != MVT::f32)
	return false;

	if (Subtarget.isLittleEndian())
	std::swap(Bitcast, Bitcast2);

	// Bitcast has the second float (in memory-layout order) and Bitcast2
	// has the first one.

	SDValue BasePtr = LD->getBasePtr();
	if (LD->isIndexed()) {
	assert(LD->getAddressingMode() == ISD::PRE_INC &&
	"Non-pre-inc AM on PPC?");
	BasePtr =
	DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
	LD->getOffset());
	}

	auto MMOFlags =
	LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
	SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
	LD->getPointerInfo(), LD->getAlignment(),
	MMOFlags, LD->getAAInfo());
	SDValue AddPtr =
	DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
	BasePtr, DAG.getIntPtrConstant(4, dl));
	SDValue FloatLoad2 = DAG.getLoad(
	MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
	LD->getPointerInfo().getWithOffset(4),
	MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());

	if (LD->isIndexed()) {
	// Note that DAGCombine should re-form any pre-increment load(s) from
	// what is produced here if that makes sense.
	DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
	}

	DCI.CombineTo(Bitcast2, FloatLoad);
	DCI.CombineTo(Bitcast, FloatLoad2);

	DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
	SDValue(FloatLoad2.getNode(), 1));
	return true;
	};

	if (ReplaceTwoFloatLoad())
	return SDValue(N, 0);

	EVT MemVT = LD->getMemoryVT();
	Type Ty = MemVT.getTypeForEVT(DAG.getContext());
	unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
	Type STy = MemVT.getScalarType().getTypeForEVT(DAG.getContext());
	unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
	if (LD->isUnindexed() && VT.isVector() &&
	((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
	// P8 and later hardware should just use LOAD.
	!Subtarget.hasP8Vector() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\|
	VT == MVT::v4i32 \|\| VT == MVT::v4f32)) \|\|
	(Subtarget.hasQPX() && (VT == MVT::v4f64 \|\| VT == MVT::v4f32) &&
	LD->getAlignment() >= ScalarABIAlignment)) &&
	LD->getAlignment() < ABIAlignment) {
	// This is a type-legal unaligned Altivec or QPX load.
	SDValue Chain = LD->getChain();
	SDValue Ptr = LD->getBasePtr();
	bool isLittleEndian = Subtarget.isLittleEndian();

	// This implements the loading of unaligned vectors as described in
	// the venerable Apple Velocity Engine overview. Specifically:
	// https://developer.apple.com/hardwaredrivers/ve/alignment.html
	// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
	//
	// The general idea is to expand a sequence of one or more unaligned
	// loads into an alignment-based permutation-control instruction (lvsl
	// or lvsr), a series of regular vector loads (which always truncate
	// their input address to an aligned address), and a series of
	// permutations. The results of these permutations are the requested
	// loaded values. The trick is that the last "extra" load is not taken
	// from the address you might suspect (sizeof(vector) bytes after the
	// last requested load), but rather sizeof(vector) - 1 bytes after the
	// last requested vector. The point of this is to avoid a page fault if
	// the base address happened to be aligned. This works because if the
	// base address is aligned, then adding less than a full vector length
	// will cause the last vector in the sequence to be (re)loaded.
	// Otherwise, the next vector will be fetched as you might suspect was
	// necessary.

	// We might be able to reuse the permutation generation from
	// a different base address offset from this one by an aligned amount.
	// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
	// optimization later.
	Intrinsic::ID Intr, IntrLD, IntrPerm;
	MVT PermCntlTy, PermTy, LDTy;
	if (Subtarget.hasAltivec()) {
	Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
	Intrinsic::ppc_altivec_lvsl;
	IntrLD = Intrinsic::ppc_altivec_lvx;
	IntrPerm = Intrinsic::ppc_altivec_vperm;
	PermCntlTy = MVT::v16i8;
	PermTy = MVT::v4i32;
	LDTy = MVT::v4i32;
	} else {
	Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
	Intrinsic::ppc_qpx_qvlpcls;
	IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
	Intrinsic::ppc_qpx_qvlfs;
	IntrPerm = Intrinsic::ppc_qpx_qvfperm;
	PermCntlTy = MVT::v4f64;
	PermTy = MVT::v4f64;
	LDTy = MemVT.getSimpleVT();
	}

	SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);

	// Create the new MMO for the new base load. It is like the original MMO,
	// but represents an area in memory almost twice the vector size centered
	// on the original address. If the address is unaligned, we might start
	// reading up to (sizeof(vector)-1) bytes below the address of the
	// original unaligned load.
	MachineFunction &MF = DAG.getMachineFunction();
	MachineMemOperand *BaseMMO =
	MF.getMachineMemOperand(LD->getMemOperand(),
	-(long)MemVT.getStoreSize()+1,
	2*MemVT.getStoreSize()-1);

	// Create the new base load.
	SDValue LDXIntID =
	DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
	SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
	SDValue BaseLoad =
	DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
	DAG.getVTList(PermTy, MVT::Other),
	BaseLoadOps, LDTy, BaseMMO);

	// Note that the value of IncOffset (which is provided to the next
	// load's pointer info offset value, and thus used to calculate the
	// alignment), and the value of IncValue (which is actually used to
	// increment the pointer value) are different! This is because we
	// require the next load to appear to be aligned, even though it
	// is actually offset from the base pointer by a lesser amount.
	int IncOffset = VT.getSizeInBits() / 8;
	int IncValue = IncOffset;

	// Walk (both up and down) the chain looking for another load at the real
	// (aligned) offset (the alignment of the other load does not matter in
	// this case). If found, then do not use the offset reduction trick, as
	// that will prevent the loads from being later combined (as they would
	// otherwise be duplicates).
	if (!findConsecutiveLoad(LD, DAG))
	--IncValue;

	SDValue Increment =
	DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);

	MachineMemOperand *ExtraMMO =
	MF.getMachineMemOperand(LD->getMemOperand(),
	1, 2*MemVT.getStoreSize()-1);
	SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
	SDValue ExtraLoad =
	DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
	DAG.getVTList(PermTy, MVT::Other),
	ExtraLoadOps, LDTy, ExtraMMO);

	SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	BaseLoad.getValue(1), ExtraLoad.getValue(1));

	// Because vperm has a big-endian bias, we must reverse the order
	// of the input vectors and complement the permute control vector
	// when generating little endian code. We have already handled the
	// latter by using lvsr instead of lvsl, so just reverse BaseLoad
	// and ExtraLoad here.
	SDValue Perm;
	if (isLittleEndian)
	Perm = BuildIntrinsicOp(IntrPerm,
	ExtraLoad, BaseLoad, PermCntl, DAG, dl);
	else
	Perm = BuildIntrinsicOp(IntrPerm,
	BaseLoad, ExtraLoad, PermCntl, DAG, dl);

	if (VT != PermTy)
	Perm = Subtarget.hasAltivec() ?
	DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
	DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
	DAG.getTargetConstant(1, dl, MVT::i64));
	// second argument is 1 because this rounding
	// is always exact.

	// The output of the permutation is our loaded result, the TokenFactor is
	// our new chain.
	DCI.CombineTo(N, Perm, TF);
	return SDValue(N, 0);
	}
	}
	break;
	case ISD::INTRINSIC_WO_CHAIN: {
	bool isLittleEndian = Subtarget.isLittleEndian();
	unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
	: Intrinsic::ppc_altivec_lvsl);
	if ((IID == Intr \|\|
	IID == Intrinsic::ppc_qpx_qvlpcld \|\|
	IID == Intrinsic::ppc_qpx_qvlpcls) &&
	N->getOperand(1)->getOpcode() == ISD::ADD) {
	SDValue Add = N->getOperand(1);

	int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
	5 /* 32 byte alignment / : 4 / 16 byte alignment */;

	if (DAG.MaskedValueIsZero(Add->getOperand(1),
	APInt::getAllOnesValue(Bits /* alignment */)
	.zext(Add.getScalarValueSizeInBits()))) {
	SDNode *BasePtr = Add->getOperand(0).getNode();
	for (SDNode::use_iterator UI = BasePtr->use_begin(),
	UE = BasePtr->use_end();
	UI != UE; ++UI) {
	if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
	cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
	// We've found another LVSL/LVSR, and this address is an aligned
	// multiple of that one. The results will be the same, so use the
	// one we've just found instead.

	return SDValue(*UI, 0);
	}
	}
	}

	if (isa<ConstantSDNode>(Add->getOperand(1))) {
	SDNode *BasePtr = Add->getOperand(0).getNode();
	for (SDNode::use_iterator UI = BasePtr->use_begin(),
	UE = BasePtr->use_end(); UI != UE; ++UI) {
	if (UI->getOpcode() == ISD::ADD &&
	isa<ConstantSDNode>(UI->getOperand(1)) &&
	(cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
	cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
	(1ULL << Bits) == 0) {
	SDNode OtherAdd = UI;
	for (SDNode::use_iterator VI = OtherAdd->use_begin(),
	VE = OtherAdd->use_end(); VI != VE; ++VI) {
	if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
	cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
	return SDValue(*VI, 0);
	}
	}
	}
	}
	}
	}
	}

	break;
	case ISD::INTRINSIC_W_CHAIN:
	// For little endian, VSX loads require generating lxvd2x/xxswapd.
	// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
	if (Subtarget.needsSwapsForVSXMemOps()) {
	switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	default:
	break;
	case Intrinsic::ppc_vsx_lxvw4x:
	case Intrinsic::ppc_vsx_lxvd2x:
	return expandVSXLoadForLE(N, DCI);
	}
	}
	break;
	case ISD::INTRINSIC_VOID:
	// For little endian, VSX stores require generating xxswapd/stxvd2x.
	// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
	if (Subtarget.needsSwapsForVSXMemOps()) {
	switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	default:
	break;
	case Intrinsic::ppc_vsx_stxvw4x:
	case Intrinsic::ppc_vsx_stxvd2x:
	return expandVSXStoreForLE(N, DCI);
	}
	}
	break;
	case ISD::BSWAP:
	// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
	if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
	N->getOperand(0).hasOneUse() &&
	(N->getValueType(0) == MVT::i32 \|\| N->getValueType(0) == MVT::i16 \|\|
	(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
	N->getValueType(0) == MVT::i64))) {
	SDValue Load = N->getOperand(0);
	LoadSDNode *LD = cast<LoadSDNode>(Load);
	// Create the byte-swapping load.
	SDValue Ops[] = {
	LD->getChain(), // Chain
	LD->getBasePtr(), // Ptr
	DAG.getValueType(N->getValueType(0)) // VT
	};
	SDValue BSLoad =
	DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
	DAG.getVTList(N->getValueType(0) == MVT::i64 ?
	MVT::i64 : MVT::i32, MVT::Other),
	Ops, LD->getMemoryVT(), LD->getMemOperand());

	// If this is an i16 load, insert the truncate.
	SDValue ResVal = BSLoad;
	if (N->getValueType(0) == MVT::i16)
	ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);

	// First, combine the bswap away. This makes the value produced by the
	// load dead.
	DCI.CombineTo(N, ResVal);

	// Next, combine the load away, we give it a bogus result value but a real
	// chain result. The result value is dead because the bswap is dead.
	DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

	// Return N so it doesn't get rechecked!
	return SDValue(N, 0);
	}
	break;
	case PPCISD::VCMP:
	// If a VCMPo node already exists with exactly the same operands as this
	// node, use its result instead of this node (VCMPo computes both a CR6 and
	// a normal output).
	//
	if (!N->getOperand(0).hasOneUse() &&
	!N->getOperand(1).hasOneUse() &&
	!N->getOperand(2).hasOneUse()) {

	// Scan all of the users of the LHS, looking for VCMPo's that match.
	SDNode *VCMPoNode = nullptr;

	SDNode *LHSN = N->getOperand(0).getNode();
	for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
	UI != E; ++UI)
	if (UI->getOpcode() == PPCISD::VCMPo &&
	UI->getOperand(1) == N->getOperand(1) &&
	UI->getOperand(2) == N->getOperand(2) &&
	UI->getOperand(0) == N->getOperand(0)) {
	VCMPoNode = *UI;
	break;
	}

	// If there is no VCMPo node, or if the flag value has a single use, don't
	// transform this.
	if (!VCMPoNode \|\| VCMPoNode->hasNUsesOfValue(0, 1))
	break;

	// Look at the (necessarily single) use of the flag value. If it has a
	// chain, this transformation is more complex. Note that multiple things
	// could use the value result, which we should ignore.
	SDNode *FlagUser = nullptr;
	for (SDNode::use_iterator UI = VCMPoNode->use_begin();
	FlagUser == nullptr; ++UI) {
	assert(UI != VCMPoNode->use_end() && "Didn't find user!");
	SDNode User = UI;
	for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
	if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
	FlagUser = User;
	break;
	}
	}
	}

	// If the user is a MFOCRF instruction, we know this is safe.
	// Otherwise we give up for right now.
	if (FlagUser->getOpcode() == PPCISD::MFOCRF)
	return SDValue(VCMPoNode, 0);
	}
	break;
	case ISD::BRCOND: {
	SDValue Cond = N->getOperand(1);
	SDValue Target = N->getOperand(2);

	if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
	cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
	Intrinsic::ppc_is_decremented_ctr_nonzero) {

	// We now need to make the intrinsic dead (it cannot be instruction
	// selected).
	DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
	assert(Cond.getNode()->hasOneUse() &&
	"Counter decrement has more than one use");

	return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
	N->getOperand(0), Target);
	}
	}
	break;
	case ISD::BR_CC: {
	// If this is a branch on an altivec predicate comparison, lower this so
	// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
	// lowering is done pre-legalize, because the legalizer lowers the predicate
	// compare down to code that is difficult to reassemble.
	ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
	SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);

	// Sometimes the promoted value of the intrinsic is ANDed by some non-zero
	// value. If so, pass-through the AND to get to the intrinsic.
	if (LHS.getOpcode() == ISD::AND &&
	LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
	cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
	Intrinsic::ppc_is_decremented_ctr_nonzero &&
	isa<ConstantSDNode>(LHS.getOperand(1)) &&
	!isNullConstant(LHS.getOperand(1)))
	LHS = LHS.getOperand(0);

	if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
	cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
	Intrinsic::ppc_is_decremented_ctr_nonzero &&
	isa<ConstantSDNode>(RHS)) {
	assert((CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
	"Counter decrement comparison is not EQ or NE");

	unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
	bool isBDNZ = (CC == ISD::SETEQ && Val) \|\|
	(CC == ISD::SETNE && !Val);

	// We now need to make the intrinsic dead (it cannot be instruction
	// selected).
	DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
	assert(LHS.getNode()->hasOneUse() &&
	"Counter decrement has more than one use");

	return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
	N->getOperand(0), N->getOperand(4));
	}

	int CompareOpc;
	bool isDot;

	if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
	isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
	getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
	assert(isDot && "Can't compare against a vector result!");

	// If this is a comparison against something other than 0/1, then we know
	// that the condition is never/always true.
	unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
	if (Val != 0 && Val != 1) {
	if (CC == ISD::SETEQ) // Cond never true, remove branch.
	return N->getOperand(0);
	// Always !=, turn it into an unconditional branch.
	return DAG.getNode(ISD::BR, dl, MVT::Other,
	N->getOperand(0), N->getOperand(4));
	}

	bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);

	// Create the PPCISD altivec 'dot' comparison node.
	SDValue Ops[] = {
	LHS.getOperand(2), // LHS of compare
	LHS.getOperand(3), // RHS of compare
	DAG.getConstant(CompareOpc, dl, MVT::i32)
	};
	EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
	SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

	// Unpack the result based on how the target uses it.
	PPC::Predicate CompOpc;
	switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
	default: // Can't happen, don't crash on invalid number though.
	case 0: // Branch on the value of the EQ bit of CR6.
	CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
	break;
	case 1: // Branch on the inverted value of the EQ bit of CR6.
	CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
	break;
	case 2: // Branch on the value of the LT bit of CR6.
	CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
	break;
	case 3: // Branch on the inverted value of the LT bit of CR6.
	CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
	break;
	}

	return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
	DAG.getConstant(CompOpc, dl, MVT::i32),
	DAG.getRegister(PPC::CR6, MVT::i32),
	N->getOperand(4), CompNode.getValue(1));
	}
	break;
	}
	case ISD::BUILD_VECTOR:
	return DAGCombineBuildVector(N, DCI);
	}

	return SDValue();
	}

	/// Build a target-specific expansion of a signed division by a power of two.
	///
	/// Handles i32 (and, on 64-bit subtargets, i64) dividends whose divisor is
	/// either a power of two or the negation of one. The quotient is produced by
	/// a PPCISD::SRA_ADDZE node that takes the dividend and log2(|Divisor|); for
	/// a negated power of two that result is additionally subtracted from zero.
	///
	/// \param N        The division node being folded; operand 0 is the dividend.
	/// \param Divisor  The constant divisor.
	/// \param DAG      The DAG in which the new nodes are created.
	/// \param Created  Optional list receiving every node built here; may be null.
	/// \returns The replacement value, or an empty SDValue when the value type or
	///          the divisor is not handled.
	SDValue
	PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                 SelectionDAG &DAG,
	                                 std::vector<SDNode *> *Created) const {
	  // fold (sdiv X, pow2)
	  EVT VT = N->getValueType(0);
	  if (VT == MVT::i64 && !Subtarget.isPPC64())
	    return SDValue();

	  // Computed once: it is needed both for the legality check and to decide
	  // whether the result has to be negated afterwards.
	  const bool IsNegPow2 = (-Divisor).isPowerOf2();
	  if ((VT != MVT::i32 && VT != MVT::i64) ||
	      !(Divisor.isPowerOf2() || IsNegPow2))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue N0 = N->getOperand(0);

	  // log2 of the divisor's magnitude.
	  unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
	  SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);

	  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
	  if (Created)
	    Created->push_back(Op.getNode());

	  if (IsNegPow2) {
	    // X / -(2^k) == 0 - (X / 2^k)
	    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
	    if (Created)
	      Created->push_back(Op.getNode());
	  }

	  return Op;
	}

	//===----------------------------------------------------------------------===//
	// Inline Assembly Support
	//===----------------------------------------------------------------------===//

	/// Report which bits of a PowerPC-specific node are known in advance.
	///
	/// \p Known is reset first and then refined for the two node kinds handled
	/// here (PPCISD::LBRX and the Altivec predicate-compare intrinsics); for any
	/// other opcode it is left in its reset state.
	void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	                                                      KnownBits &Known,
	                                                      const APInt &DemandedElts,
	                                                      const SelectionDAG &DAG,
	                                                      unsigned Depth) const {
	  Known.resetAll();

	  const unsigned Opcode = Op.getOpcode();

	  if (Opcode == PPCISD::LBRX) {
	    // lhbrx is known to have the top bits cleared out.
	    const bool IsHalfword =
	        cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16;
	    if (IsHalfword)
	      Known.Zero = 0xFFFF0000;
	    return;
	  }

	  if (Opcode != ISD::INTRINSIC_WO_CHAIN)
	    return;

	  // Operand 0 of an INTRINSIC_WO_CHAIN node carries the intrinsic ID.
	  switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
	  default:
	    return;
	  case Intrinsic::ppc_altivec_vcmpbfp_p:
	  case Intrinsic::ppc_altivec_vcmpeqfp_p:
	  case Intrinsic::ppc_altivec_vcmpequb_p:
	  case Intrinsic::ppc_altivec_vcmpequh_p:
	  case Intrinsic::ppc_altivec_vcmpequw_p:
	  case Intrinsic::ppc_altivec_vcmpequd_p:
	  case Intrinsic::ppc_altivec_vcmpgefp_p:
	  case Intrinsic::ppc_altivec_vcmpgtfp_p:
	  case Intrinsic::ppc_altivec_vcmpgtsb_p:
	  case Intrinsic::ppc_altivec_vcmpgtsh_p:
	  case Intrinsic::ppc_altivec_vcmpgtsw_p:
	  case Intrinsic::ppc_altivec_vcmpgtsd_p:
	  case Intrinsic::ppc_altivec_vcmpgtub_p:
	  case Intrinsic::ppc_altivec_vcmpgtuh_p:
	  case Intrinsic::ppc_altivec_vcmpgtuw_p:
	  case Intrinsic::ppc_altivec_vcmpgtud_p:
	    // Every bit except the lowest one is known to be zero.
	    Known.Zero = ~1U;
	    return;
	  }
	}

	/// Return the preferred alignment for the loop \p ML.
	///
	/// On the 970 and POWER4 through POWER9 directives, a loop whose instructions
	/// total more than 16 and at most 32 bytes gets the value 5 (which matches
	/// the 32-byte boundary described below if the result is a log2 alignment).
	/// In every other case, including a null \p ML, the generic TargetLowering
	/// answer is returned.
	unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
	  switch (Subtarget.getDarwinDirective()) {
	  default: break;
	  case PPC::DIR_970:
	  case PPC::DIR_PWR4:
	  case PPC::DIR_PWR5:
	  case PPC::DIR_PWR5X:
	  case PPC::DIR_PWR6:
	  case PPC::DIR_PWR6X:
	  case PPC::DIR_PWR7:
	  case PPC::DIR_PWR8:
	  case PPC::DIR_PWR9: {
	    if (!ML)
	      break;

	    const PPCInstrInfo *TII = Subtarget.getInstrInfo();

	    // For small loops (between 5 and 8 instructions), align to a 32-byte
	    // boundary so that the entire loop fits in one instruction-cache line.
	    uint64_t LoopSize = 0;
	    // Once the running total exceeds 32 bytes the answer is already decided,
	    // so neither the current block nor any later block needs to be scanned.
	    for (auto I = ML->block_begin(), IE = ML->block_end();
	         I != IE && LoopSize <= 32; ++I)
	      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
	        LoopSize += TII->getInstSizeInBytes(*J);
	        if (LoopSize > 32)
	          break;
	      }

	    if (LoopSize > 16 && LoopSize <= 32)
	      return 5;

	    break;
	  }
	  }

	  return TargetLowering::getPrefLoopAlignment(ML);
	}

	/// getConstraintType - Given a constraint, return the type of
	/// constraint it is for this target.
	///
	/// The single letters 'b', 'r', 'f', 'd', 'v', 'y' and the two-letter
	/// constraints "wc", "wa", "wd", "wf", "ws" are register classes; 'Z' is a
	/// memory constraint. Anything else is deferred to the generic
	/// TargetLowering implementation.
	PPCTargetLowering::ConstraintType
	PPCTargetLowering::getConstraintType(StringRef Constraint) const {
	  if (Constraint.size() == 1) {
	    switch (Constraint[0]) {
	    default: break;
	    case 'b':
	    case 'r':
	    case 'f':
	    case 'd':
	    case 'v':
	    case 'y':
	      return C_RegisterClass;
	    case 'Z':
	      // FIXME: While Z does indicate a memory constraint, it specifically
	      // indicates an r+r address (used in conjunction with the 'y' modifier
	      // in the replacement string). Currently, we're forcing the base
	      // register to be r0 in the asm printer (which is interpreted as zero)
	      // and forming the complete address in the second register. This is
	      // suboptimal.
	      return C_Memory;
	    }
	  } else if (Constraint == "wc") { // individual CR bits.
	    return C_RegisterClass;
	  } else if (Constraint == "wa" || Constraint == "wd" ||
	             Constraint == "wf" || Constraint == "ws") {
	    return C_RegisterClass; // VSX registers.
	  }
	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// \param info        Operand description; only its CallOperandVal is read.
	/// \param constraint  NUL-terminated constraint string to evaluate.
	/// \returns CW_Default when there is no operand value, CW_Register or
	///          CW_Memory when the constraint matches the operand's type,
	///          CW_Invalid when a recognised letter does not fit the type, and
	///          the generic TargetLowering weight for unrecognised letters.
	TargetLowering::ConstraintWeight
	PPCTargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  ConstraintWeight weight = CW_Invalid;
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;
	  Type *type = CallOperandVal->getType();

	  // Look at the constraint type. The two-letter "w?" constraints are
	  // matched on the whole string before falling back to the first letter.
	  const StringRef Name(constraint);
	  if (Name == "wc" && type->isIntegerTy(1))
	    return CW_Register; // an individual CR bit.
	  else if ((Name == "wa" || Name == "wd" || Name == "wf") &&
	           type->isVectorTy())
	    return CW_Register;
	  else if (Name == "ws" && type->isDoubleTy())
	    return CW_Register;

	  switch (*constraint) {
	  default:
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    break;
	  case 'b':
	    if (type->isIntegerTy())
	      weight = CW_Register;
	    break;
	  case 'f':
	    if (type->isFloatTy())
	      weight = CW_Register;
	    break;
	  case 'd':
	    if (type->isDoubleTy())
	      weight = CW_Register;
	    break;
	  case 'v':
	    if (type->isVectorTy())
	      weight = CW_Register;
	    break;
	  case 'y':
	    weight = CW_Register;
	    break;
	  case 'Z':
	    weight = CW_Memory;
	    break;
	  }
	  return weight;
	}

	/// Map an inline-asm register constraint to a register and/or register class.
	///
	/// PowerPC constraint letters and the "w?" constraints are resolved here to a
	/// register class (with register number 0). Everything else goes through the
	/// generic TargetLowering lookup, whose result is then adjusted in two ways:
	/// a 32-bit GPR requested for an i64 value on PPC64 is replaced by its 64-bit
	/// super-register, and "{cc}" is accepted as an alias for CR0.
	///
	/// \param TRI         Register info used for the generic lookup and for
	///                    finding the matching super-register.
	/// \param Constraint  The constraint string.
	/// \param VT          The value type of the operand.
	std::pair<unsigned, const TargetRegisterClass *>
	PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	                                                StringRef Constraint,
	                                                MVT VT) const {
	  if (Constraint.size() == 1) {
	    // GCC RS6000 Constraint Letters
	    switch (Constraint[0]) {
	    case 'b': // R1-R31
	      if (VT == MVT::i64 && Subtarget.isPPC64())
	        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
	      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
	    case 'r': // R0-R31
	      if (VT == MVT::i64 && Subtarget.isPPC64())
	        return std::make_pair(0U, &PPC::G8RCRegClass);
	      return std::make_pair(0U, &PPC::GPRCRegClass);
	    // 'd' and 'f' constraints are both defined to be "the floating point
	    // registers", where one is for 32-bit and the other for 64-bit. We don't
	    // really care overly much here so just give them all the same reg classes.
	    case 'd':
	    case 'f':
	      if (VT == MVT::f32 || VT == MVT::i32)
	        return std::make_pair(0U, &PPC::F4RCRegClass);
	      if (VT == MVT::f64 || VT == MVT::i64)
	        return std::make_pair(0U, &PPC::F8RCRegClass);
	      if (VT == MVT::v4f64 && Subtarget.hasQPX())
	        return std::make_pair(0U, &PPC::QFRCRegClass);
	      if (VT == MVT::v4f32 && Subtarget.hasQPX())
	        return std::make_pair(0U, &PPC::QSRCRegClass);
	      break;
	    case 'v':
	      if (VT == MVT::v4f64 && Subtarget.hasQPX())
	        return std::make_pair(0U, &PPC::QFRCRegClass);
	      if (VT == MVT::v4f32 && Subtarget.hasQPX())
	        return std::make_pair(0U, &PPC::QSRCRegClass);
	      if (Subtarget.hasAltivec())
	        return std::make_pair(0U, &PPC::VRRCRegClass);
	      break;
	    case 'y': // crrc
	      return std::make_pair(0U, &PPC::CRRCRegClass);
	    }
	  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
	    // An individual CR bit.
	    return std::make_pair(0U, &PPC::CRBITRCRegClass);
	  } else if ((Constraint == "wa" || Constraint == "wd" ||
	              Constraint == "wf") && Subtarget.hasVSX()) {
	    return std::make_pair(0U, &PPC::VSRCRegClass);
	  } else if (Constraint == "ws" && Subtarget.hasVSX()) {
	    if (VT == MVT::f32 && Subtarget.hasP8Vector())
	      return std::make_pair(0U, &PPC::VSSRCRegClass);
	    else
	      return std::make_pair(0U, &PPC::VSFRCRegClass);
	  }

	  // Nothing PowerPC-specific matched (or a letter case fell through because
	  // the required subtarget feature is missing): use the generic lookup.
	  std::pair<unsigned, const TargetRegisterClass *> R =
	      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
	  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
	  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
	  // register.
	  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
	  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
	  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
	      PPC::GPRCRegClass.contains(R.first))
	    return std::make_pair(TRI->getMatchingSuperReg(R.first,
	                            PPC::sub_32, &PPC::G8RCRegClass),
	                          &PPC::G8RCRegClass);

	  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
	  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
	    R.first = PPC::CR0;
	    R.second = &PPC::CRRCRegClass;
	  }

	  return R;
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// The immediate constraints 'I' through 'P' are handled here: when \p Op is
	/// a constant that satisfies the letter's range, a 64-bit target constant is
	/// appended to \p Ops. A non-constant operand for one of these letters, or a
	/// constraint longer than one character, adds nothing. All remaining cases
	/// are forwarded to the generic TargetLowering implementation.
	void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	                                                     std::string &Constraint,
	                                                     std::vector<SDValue>&Ops,
	                                                     SelectionDAG &DAG) const {
	  SDValue Result;

	  // Only support length 1 constraints.
	  if (Constraint.length() > 1) return;

	  char Letter = Constraint[0];
	  switch (Letter) {
	  default: break;
	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'M':
	  case 'N':
	  case 'O':
	  case 'P': {
	    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
	    if (!CST) return; // Must be an immediate to match.
	    SDLoc dl(Op);
	    int64_t Value = CST->getSExtValue();
	    EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
	                         // numbers are printed as such.

	    // Decide whether Value satisfies the letter; the constant itself is
	    // built once below.
	    bool Matches = false;
	    switch (Letter) {
	    default: llvm_unreachable("Unknown constraint letter!");
	    case 'I': // "I" is a signed 16-bit constant.
	      Matches = isInt<16>(Value);
	      break;
	    case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
	      Matches = isShiftedUInt<16, 16>(Value);
	      break;
	    case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
	      Matches = isShiftedInt<16, 16>(Value);
	      break;
	    case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
	      Matches = isUInt<16>(Value);
	      break;
	    case 'M': // "M" is a constant that is greater than 31.
	      Matches = Value > 31;
	      break;
	    case 'N': // "N" is a positive constant that is an exact power of two.
	      Matches = Value > 0 && isPowerOf2_64(Value);
	      break;
	    case 'O': // "O" is the constant zero.
	      Matches = Value == 0;
	      break;
	    case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
	      // -Value lies in [-32768, 32767] exactly when Value lies in
	      // [-32767, 32768]. Testing the range directly avoids evaluating
	      // -Value, which overflows for the most negative int64_t.
	      Matches = Value >= -32767 && Value <= 32768;
	      break;
	    }
	    if (Matches)
	      Result = DAG.getTargetConstant(Value, dl, TCVT);
	    break;
	  }
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }

	  // Handle standard constraint letters.
	  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	// isLegalAddressingMode - Return true if the addressing mode represented
	// by AM is legal for this target, for a load/store of the specified type.
	bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS, Instruction *I) const {
	// PPC does not allow r+i addressing modes for vectors!
	if (Ty->isVectorTy() && AM.BaseOffs != 0)
	return false;

	// PPC allows a sign-extended 16-bit immediate field.
	if (AM.BaseOffs <= -(1LL << 16) \|\| AM.BaseOffs >= (1LL << 16)-1)
	return false;

	// No global is ever allowed as a base.
	if (AM.BaseGV)
	return false;

	// PPC only support r+r,
	switch (AM.Scale) {
	case 0: // "r+i" or just "i", depending on HasBaseReg.
	break;
	case 1:
	if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
	return false;
	// Otherwise we have r+r or r+i.
	break;
	case 2:
	if (AM.HasBaseReg \|\| AM.BaseOffs) // 2r+r or 2r+i is not allowed.
	return false;
	// Allow 2*r as r+r.
	break;
	default:
	// No other scales are supported.
	return false;
	}

	return true;
	}

	/// Lower a return-address query whose requested depth is operand 0 of \p Op.
	///
	/// Marks the return address as taken and requires LR to be stored. For depth
	/// zero the value is loaded from the return-address frame index; for deeper
	/// frames it is loaded from the frame address given by LowerFRAMEADDR plus
	/// the return-save offset. Returns an empty SDValue when
	/// verifyReturnAddressArgumentIsConstant reports a problem.
	SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  SDLoc dl(Op);
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  // Make sure the function does not optimize away the store of the RA to
	  // the stack.
	  MF.getInfo<PPCFunctionInfo>()->setLRStoreRequired();
	  auto PtrVT = getPointerTy(MF.getDataLayout());

	  if (Depth == 0) {
	    // Just load the return address off the stack.
	    SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
	    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
	                       MachinePointerInfo());
	  }

	  // Outer frame: find that frame's address, then read its return-save slot.
	  const bool isPPC64 = Subtarget.isPPC64();
	  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
	  SDValue Offset =
	      DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
	                      isPPC64 ? MVT::i64 : MVT::i32);
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset);
	  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), SlotAddr,
	                     MachinePointerInfo());
	}

	/// Lower a frame-address query whose requested depth is operand 0 of \p Op.
	///
	/// Marks the frame address as taken, copies the frame register out of the
	/// entry node, and then follows one load per requested level.
	SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setFrameAddressIsTaken(true);

	  EVT PtrVT = getPointerTy(MF.getDataLayout());
	  const bool isPPC64 = PtrVT == MVT::i64;

	  // Naked functions never have a frame pointer, and so we use r1. For all
	  // other functions, this decision must be delayed until during PEI.
	  const bool IsNaked = MF.getFunction().hasFnAttribute(Attribute::Naked);
	  unsigned FrameReg;
	  if (IsNaked) {
	    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
	  } else {
	    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
	  }

	  SDValue FrameAddr =
	      DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);

	  // Each level loads the next address from the current one.
	  for (unsigned Level = 0; Level != Depth; ++Level)
	    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
	                            FrameAddr, MachinePointerInfo());
	  return FrameAddr;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	//
	// Resolve a named register ("r1", "r2" or "r13") to a physical register.
	// VT must be i32, or additionally i64 on PPC64; any other type, an unknown
	// name, or a name that is unavailable for the current ABI/word size is a
	// fatal error.
	unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  bool isPPC64 = Subtarget.isPPC64();
	  bool isDarwinABI = Subtarget.isDarwinABI();

	  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
	      (!isPPC64 && VT != MVT::i32))
	    report_fatal_error("Invalid register global variable type");

	  // A 0 entry marks a name that is rejected for this configuration.
	  bool is64Bit = isPPC64 && VT == MVT::i64;
	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
	                   .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
	                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
	                                  (is64Bit ? PPC::X13 : PPC::R13))
	                   .Default(0);

	  if (Reg)
	    return Reg;
	  report_fatal_error("Invalid register name global variable");
	}

	/// Offset folding into global addresses is never reported as legal here,
	/// whatever \p GA is.
	bool PPCTargetLowering::isOffsetFoldingLegal(
	    const GlobalAddressSDNode *GA) const {
	  // The PowerPC target isn't yet aware of offsets.
	  return false;
	}

	/// Describe the memory access performed by a PowerPC load/store intrinsic.
	///
	/// For the QPX, Altivec and VSX memory intrinsics listed below, fills
	/// \p Info (opcode, memory type, pointer operand, offset, size, alignment
	/// and load/store flag) and returns true. Any other intrinsic returns false
	/// without touching \p Info.
	///
	/// Loads take the pointer from call argument 0, stores from argument 1.
	bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           MachineFunction &MF,
	                                           unsigned Intrinsic) const {
	  // Step 1: classify the intrinsic. "Exact" means the access is described as
	  // exactly [ptr, ptr + size); otherwise a window of 2*size-1 bytes starting
	  // size-1 bytes before the pointer is reported.
	  bool IsStore = false;
	  bool IsExact = false;
	  switch (Intrinsic) {
	  default:
	    return false;

	  case Intrinsic::ppc_qpx_qvlfd:
	  case Intrinsic::ppc_qpx_qvlfs:
	  case Intrinsic::ppc_qpx_qvlfcd:
	  case Intrinsic::ppc_qpx_qvlfcs:
	  case Intrinsic::ppc_qpx_qvlfiwa:
	  case Intrinsic::ppc_qpx_qvlfiwz:
	  case Intrinsic::ppc_altivec_lvx:
	  case Intrinsic::ppc_altivec_lvxl:
	  case Intrinsic::ppc_altivec_lvebx:
	  case Intrinsic::ppc_altivec_lvehx:
	  case Intrinsic::ppc_altivec_lvewx:
	  case Intrinsic::ppc_vsx_lxvd2x:
	  case Intrinsic::ppc_vsx_lxvw4x:
	    IsStore = false;
	    IsExact = false;
	    break;

	  case Intrinsic::ppc_qpx_qvlfda:
	  case Intrinsic::ppc_qpx_qvlfsa:
	  case Intrinsic::ppc_qpx_qvlfcda:
	  case Intrinsic::ppc_qpx_qvlfcsa:
	  case Intrinsic::ppc_qpx_qvlfiwaa:
	  case Intrinsic::ppc_qpx_qvlfiwza:
	    IsStore = false;
	    IsExact = true;
	    break;

	  case Intrinsic::ppc_qpx_qvstfd:
	  case Intrinsic::ppc_qpx_qvstfs:
	  case Intrinsic::ppc_qpx_qvstfcd:
	  case Intrinsic::ppc_qpx_qvstfcs:
	  case Intrinsic::ppc_qpx_qvstfiw:
	  case Intrinsic::ppc_altivec_stvx:
	  case Intrinsic::ppc_altivec_stvxl:
	  case Intrinsic::ppc_altivec_stvebx:
	  case Intrinsic::ppc_altivec_stvehx:
	  case Intrinsic::ppc_altivec_stvewx:
	  case Intrinsic::ppc_vsx_stxvd2x:
	  case Intrinsic::ppc_vsx_stxvw4x:
	    IsStore = true;
	    IsExact = false;
	    break;

	  case Intrinsic::ppc_qpx_qvstfda:
	  case Intrinsic::ppc_qpx_qvstfsa:
	  case Intrinsic::ppc_qpx_qvstfcda:
	  case Intrinsic::ppc_qpx_qvstfcsa:
	  case Intrinsic::ppc_qpx_qvstfiwa:
	    IsStore = true;
	    IsExact = true;
	    break;
	  }

	  // Step 2: pick the memory value type. Intrinsics not named here (but
	  // accepted above) use v4i32.
	  EVT VT;
	  switch (Intrinsic) {
	  case Intrinsic::ppc_altivec_lvebx:
	  case Intrinsic::ppc_altivec_stvebx:
	    VT = MVT::i8;
	    break;
	  case Intrinsic::ppc_altivec_lvehx:
	  case Intrinsic::ppc_altivec_stvehx:
	    VT = MVT::i16;
	    break;
	  case Intrinsic::ppc_altivec_lvewx:
	  case Intrinsic::ppc_altivec_stvewx:
	    VT = MVT::i32;
	    break;
	  case Intrinsic::ppc_vsx_lxvd2x:
	  case Intrinsic::ppc_vsx_stxvd2x:
	  case Intrinsic::ppc_qpx_qvlfcd:
	  case Intrinsic::ppc_qpx_qvlfcda:
	  case Intrinsic::ppc_qpx_qvstfcd:
	  case Intrinsic::ppc_qpx_qvstfcda:
	    VT = MVT::v2f64;
	    break;
	  case Intrinsic::ppc_qpx_qvlfd:
	  case Intrinsic::ppc_qpx_qvlfda:
	  case Intrinsic::ppc_qpx_qvstfd:
	  case Intrinsic::ppc_qpx_qvstfda:
	    VT = MVT::v4f64;
	    break;
	  case Intrinsic::ppc_qpx_qvlfs:
	  case Intrinsic::ppc_qpx_qvlfsa:
	  case Intrinsic::ppc_qpx_qvstfs:
	  case Intrinsic::ppc_qpx_qvstfsa:
	    VT = MVT::v4f32;
	    break;
	  case Intrinsic::ppc_qpx_qvlfcs:
	  case Intrinsic::ppc_qpx_qvlfcsa:
	  case Intrinsic::ppc_qpx_qvstfcs:
	  case Intrinsic::ppc_qpx_qvstfcsa:
	    VT = MVT::v2f32;
	    break;
	  default:
	    VT = MVT::v4i32;
	    break;
	  }

	  // Step 3: fill in the description.
	  Info.opc = IsStore ? ISD::INTRINSIC_VOID : ISD::INTRINSIC_W_CHAIN;
	  Info.memVT = VT;
	  Info.ptrVal = I.getArgOperand(IsStore ? 1 : 0);
	  if (IsExact) {
	    Info.offset = 0;
	    Info.size = VT.getStoreSize();
	  } else {
	    // NOTE(review): presumably widened because these instructions may
	    // access memory around the given pointer - confirm against the ISA.
	    Info.offset = -VT.getStoreSize()+1;
	    Info.size = 2*VT.getStoreSize()-1;
	  }
	  Info.align = 1;
	  Info.flags = IsStore ? MachineMemOperand::MOStore
	                       : MachineMemOperand::MOLoad;
	  return true;
	}

	/// getOptimalMemOpType - Returns the target specific optimal type for load
	/// and store operations as a result of memset, memcpy, and memmove
	/// lowering. If DstAlign is zero, the destination alignment can satisfy any
	/// constraint. Similarly, if SrcAlign is zero there is no need to check it
	/// against an alignment requirement, probably because the source does not
	/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
	/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
	/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	/// not need to be loaded.
	///
	/// When optimizing, this returns v4f64 for sufficiently large, 32-byte
	/// aligned operations on QPX subtargets and v4i32 for operations of at least
	/// 16 bytes on Altivec subtargets (subject to the alignment/VSX conditions
	/// below). Otherwise it returns i64 on PPC64 and i32 elsewhere.
	EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
	                                           unsigned DstAlign, unsigned SrcAlign,
	                                           bool IsMemset, bool ZeroMemset,
	                                           bool MemcpyStrSrc,
	                                           MachineFunction &MF) const {
	  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
	    const Function &F = MF.getFunction();
	    // When expanding a memset, require at least two QPX instructions to cover
	    // the cost of loading the value to be stored from the constant pool.
	    if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
	        (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
	        !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
	      return MVT::v4f64;
	    }

	    // We should use Altivec/VSX loads and stores when available. For unaligned
	    // addresses, unaligned VSX loads are only fast starting with the P8.
	    if (Subtarget.hasAltivec() && Size >= 16 &&
	        (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
	         ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
	      return MVT::v4i32;
	  }

	  if (Subtarget.isPPC64()) {
	    return MVT::i64;
	  }

	  return MVT::i32;
	}

	/// \brief Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	///
	/// This is the case for every integer type whose primitive size is between
	/// 1 and 64 bits inclusive; \p Imm itself is not inspected.
	bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  return BitSize != 0 && BitSize <= 64;
	}

	bool PPCTargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 == 64 && NumBits2 == 32;
	}

	/// Return true if truncating a value of type \p VT1 to \p VT2 is free.
	/// Only the 64-bit to 32-bit integer truncation qualifies.
	bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 == 64 && NumBits2 == 32;
	}

	/// Return true if zero-extending \p Val to \p VT2 is free.
	///
	/// A non-extending or zero-extending load of i1, i8 or i16 (and of i32 on
	/// PPC64) qualifies; every other value is decided by the generic
	/// TargetLowering implementation.
	bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  // Generally speaking, zexts are not free, but they are free when they can be
	  // folded with other operations.
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
	    EVT MemVT = LD->getMemoryVT();
	    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
	         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
	        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
	         LD->getExtensionType() == ISD::ZEXTLOAD))
	      return true;
	  }

	  // FIXME: Add other cases...
	  // - 32-bit shifts with a zext to i64
	  // - zext after ctlz, bswap, etc.
	  // - zext after and by a constant mask

	  return TargetLowering::isZExtFree(Val, VT2);
	}

	/// Floating-point extension is always reported as free. Both types must be
	/// floating point, which is asserted.
	bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
	  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
	         "invalid fpext types");

	  return true;
	}

	/// Return true if \p Imm can be used directly as a compare immediate, i.e.
	/// it fits in either a signed or an unsigned 16-bit field.
	bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  return isInt<16>(Imm) || isUInt<16>(Imm);
	}

	/// Return true if \p Imm can be used directly as an add immediate, i.e. it
	/// fits in either a signed or an unsigned 16-bit field.
	bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  return isInt<16>(Imm) || isUInt<16>(Imm);
	}

	/// Decide whether a misaligned access of type \p VT is permitted.
	///
	/// Returns false when unaligned accesses are disabled by option, for
	/// non-simple types, for ppcf128, and for vectors other than v2f64, v2i64,
	/// v4f32 and v4i32 on a VSX subtarget. Otherwise returns true and, when
	/// \p Fast is non-null, sets *Fast to true.
	bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                       unsigned,
	                                                       unsigned,
	                                                       bool *Fast) const {
	  if (DisablePPCUnaligned)
	    return false;

	  // PowerPC supports unaligned memory access for simple non-vector types.
	  // Although accessing unaligned addresses is not as efficient as accessing
	  // aligned addresses, it is generally more efficient than manual expansion,
	  // and generally only traps for software emulation when crossing page
	  // boundaries.

	  if (!VT.isSimple())
	    return false;

	  if (VT.getSimpleVT().isVector()) {
	    // Vectors are only accepted with VSX, and only the four types below.
	    if (!Subtarget.hasVSX())
	      return false;

	    const bool IsAcceptedVector = VT == MVT::v2f64 || VT == MVT::v2i64 ||
	                                  VT == MVT::v4f32 || VT == MVT::v4i32;
	    if (!IsAcceptedVector)
	      return false;
	  }

	  if (VT == MVT::ppcf128)
	    return false;

	  if (Fast)
	    *Fast = true;

	  return true;
	}

	/// Return true when the scalar element type of \p VT is f32 or f64; false
	/// for every other (including non-simple) type.
	bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  // Vectors are judged by their element type.
	  VT = VT.getScalarType();

	  if (!VT.isSimple())
	    return false;

	  const auto ElemTy = VT.getSimpleVT().SimpleTy;
	  return ElemTy == MVT::f32 || ElemTy == MVT::f64;
	}

	/// Return the zero-terminated list of scratch registers: X12, LR8 and CTR8.
	/// The calling convention argument is ignored.
	const MCPhysReg *
	PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
	  // LR is a callee-save register, but we must treat it as clobbered by any
	  // call site. Hence we include LR in the scratch registers, which are in
	  // turn added as implicit-defs for stackmaps and patchpoints. The same
	  // reasoning applies to CTR, which is used by any indirect call.
	  static const MCPhysReg Regs[] = {PPC::X12, PPC::LR8, PPC::CTR8, 0};
	  return Regs;
	}

	/// Return the register holding the exception pointer: X3 on PPC64, R3
	/// otherwise. \p PersonalityFn is not consulted.
	unsigned PPCTargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  if (Subtarget.isPPC64())
	    return PPC::X3;
	  return PPC::R3;
	}

	/// Return the register holding the exception selector: X4 on PPC64, R4
	/// otherwise. \p PersonalityFn is not consulted.
	unsigned PPCTargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  if (Subtarget.isPPC64())
	    return PPC::X4;
	  return PPC::R4;
	}

	/// Decide whether a BUILD_VECTOR of type \p VT should be expanded using
	/// shuffles.
	///
	/// For v2i64 the answer depends only on direct-move support. For other
	/// types it is true on VSX or QPX subtargets, and otherwise whatever the
	/// generic TargetLowering implementation says.
	bool
	PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
	    EVT VT , unsigned DefinedValues) const {
	  if (VT == MVT::v2i64)
	    return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves

	  if (Subtarget.hasVSX() || Subtarget.hasQPX())
	    return true;

	  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
	}

	// Scheduling preference for node N: ILP by default. When the ILP preference
	// is disabled (DisableILPPref) or the subtarget enables the machine
	// scheduler, the generic TargetLowering preference is used instead.
	Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
	  if (DisableILPPref || Subtarget.enableMachineScheduler())
	    return TargetLowering::getSchedulingPreference(N);

	  return Sched::ILP;
	}

	// Build the fast instruction selector by forwarding both arguments to
	// PPC::createFastISel.
	FastISel *PPCTargetLowering::createFastISel(
	    FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const {
	  return PPC::createFastISel(FuncInfo, LibInfo);
	}

	// Flag the function containing Entry as using split CSR handling. Nothing
	// is done for the Darwin ABI or for non-64-bit subtargets.
	void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.isDarwinABI() || !Subtarget.isPPC64())
	    return;

	  // Record IsSplitCSR on the function's PPCFunctionInfo.
	  MachineFunction *MF = Entry->getParent();
	  MF->getInfo<PPCFunctionInfo>()->setIsSplitCSR(true);
	}

	// For every callee-saved register that is preserved "via copy", copy it into
	// a fresh virtual register at the top of the entry block and copy it back
	// right before the first terminator of each exit block.
	//
	// Entry - entry block of the function; receives the save copies and has
	//         each register added as a live-in.
	// Exits - exit blocks; each receives the restore copies.
	void PPCTargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  // No register list means there is nothing to copy.
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // IStart is walked as a zero-terminated array, so iterate by pointer and
	  // stop at the 0 sentinel; the body dereferences I to get each register.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    // Pick the register class the physical register belongs to.
	    const TargetRegisterClass *RC = nullptr;
	    if (PPC::G8RCRegClass.contains(*I))
	      RC = &PPC::G8RCRegClass;
	    else if (PPC::F8RCRegClass.contains(*I))
	      RC = &PPC::F8RCRegClass;
	    else if (PPC::CRRCRegClass.contains(*I))
	      RC = &PPC::CRRCRegClass;
	    else if (PPC::VRRCRegClass.contains(*I))
	      RC = &PPC::VRRCRegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	// LOAD_STACK_GUARD lowering is always used on Linux; every other target
	// takes the generic TargetLowering answer.
	bool PPCTargetLowering::useLoadStackGuardNode() const {
	  if (Subtarget.isTargetLinux())
	    return true;
	  return TargetLowering::useLoadStackGuardNode();
	}

	// On Linux no stack-protector declarations are inserted into the module;
	// all other targets use the generic TargetLowering implementation.
	void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
	  if (Subtarget.isTargetLinux())
	    return;
	  TargetLowering::insertSSPDeclarations(M);
	}

	// Returns true if the floating-point immediate Imm is legal for type VT.
	// Without VSX, or for non-simple types, nothing is legal. With VSX, only
	// positive zero is legal, and only for f32, f64 and ppcf128.
	bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  if (!VT.isSimple() || !Subtarget.hasVSX())
	    return false;

	  switch (VT.getSimpleVT().SimpleTy) {
	  default:
	    // For FP types that are currently not supported by PPC backend, return
	    // false. Examples: f16, f80.
	    return false;
	  case MVT::f32:
	  case MVT::f64:
	  case MVT::ppcf128:
	    return Imm.isPosZero();
	  }
	}

	// For a vector shift N = (op x, (and y, C)) where C is a constant (or
	// constant splat) equal to the element bit width minus one, produce
	// (target op x, y) using the matching PPCISD shift opcode. Returns an empty
	// SDValue when the pattern does not apply. N must be SHL, SRL or SRA.
	static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
	                                  SelectionDAG &DAG) {
	  SDValue ShiftedVal = N->getOperand(0);
	  SDValue ShiftAmt = N->getOperand(1);
	  EVT VT = ShiftedVal.getValueType();
	  unsigned EltBits = VT.getScalarSizeInBits();
	  unsigned GenericOpc = N->getOpcode();
	  unsigned PPCOpc;

	  // Map the generic shift opcode onto its PPC-specific counterpart.
	  switch (GenericOpc) {
	  case ISD::SHL:
	    PPCOpc = PPCISD::SHL;
	    break;
	  case ISD::SRL:
	    PPCOpc = PPCISD::SRL;
	    break;
	  case ISD::SRA:
	    PPCOpc = PPCISD::SRA;
	    break;
	  default:
	    llvm_unreachable("Unexpected shift operation");
	  }

	  // Only legal vector shifts whose amount is an AND node are candidates.
	  if (!VT.isVector() || !TLI.isOperationLegal(GenericOpc, VT) ||
	      ShiftAmt->getOpcode() != ISD::AND)
	    return SDValue();

	  // The AND mask must be a constant (splat) equal to EltBits - 1.
	  ConstantSDNode *Mask = isConstOrConstSplat(ShiftAmt->getOperand(1));
	  if (!Mask || Mask->getZExtValue() != EltBits - 1)
	    return SDValue();

	  return DAG.getNode(PPCOpc, SDLoc(N), VT, ShiftedVal,
	                     ShiftAmt->getOperand(0));
	}

	// DAG combine for SHL: hands back the node produced by stripModuloOnShift
	// when it folds, otherwise an empty SDValue.
	SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
	  SDValue Folded = stripModuloOnShift(*this, N, DCI.DAG);
	  return Folded ? Folded : SDValue();
	}

	// DAG combine for SRA: hands back the node produced by stripModuloOnShift
	// when it folds, otherwise an empty SDValue.
	SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
	  SDValue Folded = stripModuloOnShift(*this, N, DCI.DAG);
	  return Folded ? Folded : SDValue();
	}

	// DAG combine for SRL: hands back the node produced by stripModuloOnShift
	// when it folds, otherwise an empty SDValue.
	SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
	  SDValue Folded = stripModuloOnShift(*this, N, DCI.DAG);
	  return Folded ? Folded : SDValue();
	}

	// Returns true if the call CI may end up being emitted as a tail call.
	// Every check below must pass; the final answer is whether the callee can
	// be assumed DSO-local.
	bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
	  if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
	    return false;

	  // If not a tail call then no need to proceed.
	  if (!CI->isTailCall())
	    return false;

	  // If tail calls are disabled for the caller then we are done.
	  const Function *Caller = CI->getParent()->getParent();
	  auto Attr = Caller->getFnAttribute("disable-tail-calls");
	  if (Attr.getValueAsString() == "true")
	    return false;

	  // If sibling calls have been disabled and tail-calls aren't guaranteed
	  // there is no reason to duplicate.
	  auto &TM = getTargetMachine();
	  if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
	    return false;

	  // Can't tail call a function called indirectly, or if it has variadic args.
	  const Function *Callee = CI->getCalledFunction();
	  if (!Callee || Callee->isVarArg())
	    return false;

	  // Make sure the callee and caller calling conventions are eligible for tco.
	  if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
	                                           CI->getCallingConv()))
	    return false;

	  // If the function is local then we have a good chance at tail-calling it
	  return TM.shouldAssumeDSOLocal(*Caller->getParent(), Callee);
	}
	Index: vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCISelLowering.h
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCISelLowering.h (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCISelLowering.h (revision 328362)
	@@ -1,1131 +1,1137 @@
	//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that PPC uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
	#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H

	#include "PPC.h"
	#include "PPCInstrInfo.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Type.h"
	#include <utility>

	namespace llvm {

	// PPC-specific SelectionDAG node opcodes, numbered from
	// ISD::BUILTIN_OP_END onwards.
	namespace PPCISD {

	// When adding a NEW PPCISD node please add it to the correct position in
	// the enum. The order of elements in this enum matters!
	// Values that are added after this entry:
	// STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE
	// are considered memory opcodes and are treated differently than entries
	// that come before it. For example, ADD or MUL should be placed before
	// the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come
	// after it.
	enum NodeType : unsigned {
	// Start the numbering where the builtin ops and target ops leave off.
	FIRST_NUMBER = ISD::BUILTIN_OP_END,

	/// FSEL - Traditional three-operand fsel node.
	///
	FSEL,

	/// FCFID - The FCFID instruction, taking an f64 operand and producing
	/// an f64 value containing the FP representation of the integer that
	/// was temporarily in the f64 operand.
	FCFID,

	/// Newer FCFID[US] integer-to-floating-point conversion instructions for
	/// unsigned integers and single-precision outputs.
	FCFIDU, FCFIDS, FCFIDUS,

	/// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
	/// operand, producing an f64 value containing the integer representation
	/// of that FP value.
	FCTIDZ, FCTIWZ,

	/// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
	/// unsigned integers with round toward zero.
	FCTIDUZ, FCTIWUZ,

	/// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
	/// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
	VEXTS,

	/// SExtVElems, takes an input vector of a smaller type and sign
	/// extends to an output vector of a larger type.
	SExtVElems,

	/// Reciprocal estimate instructions (unary FP ops).
	FRE, FRSQRTE,

	// VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
	// three v4f32 operands and producing a v4f32 result.
	VMADDFP, VNMSUBFP,

	/// VPERM - The PPC VPERM Instruction.
	///
	VPERM,

	/// XXSPLT - The PPC VSX splat instructions
	///
	XXSPLT,

	/// VECINSERT - The PPC vector insert instruction
	///
	VECINSERT,

	/// XXREVERSE - The PPC VSX reverse instruction
	///
	XXREVERSE,

	/// VECSHL - The PPC vector shift left instruction
	///
	VECSHL,

	/// XXPERMDI - The PPC XXPERMDI instruction
	///
	XXPERMDI,

	/// The CMPB instruction (takes two operands of i32 or i64).
	CMPB,

	/// Hi/Lo - These represent the high and low 16-bit parts of a global
	/// address respectively. These nodes have two operands, the first of
	/// which must be a TargetGlobalAddress, and the second of which must be a
	/// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C',
	/// though these are usually folded into other nodes.
	Hi, Lo,

	/// The following two target-specific nodes are used for calls through
	/// function pointers in the 64-bit SVR4 ABI.

	/// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
	/// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
	/// compute an allocation on the stack.
	DYNALLOC,

	/// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
	/// compute an offset from native SP to the address of the most recent
	/// dynamic alloca.
	DYNAREAOFFSET,

	/// GlobalBaseReg - On Darwin, this node represents the result of the mflr
	/// at function entry, used for PIC code.
	GlobalBaseReg,

	/// These nodes represent PPC shifts.
	///
	/// For scalar types, only the last `n + 1` bits of the shift amounts
	/// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
	/// for exact behaviors.
	///
	/// For vector types, only the last n bits are used. See vsld.
	SRL, SRA, SHL,

	/// The combination of sra[wd]i and addze used to implement signed
	/// integer division by a power of 2. The first operand is the dividend,
	/// and the second is the constant shift amount (representing the
	/// divisor).
	SRA_ADDZE,

	/// CALL - A direct function call.
	/// CALL_NOP is a call with the special NOP which follows 64-bit
	/// SVR4 calls.
	CALL, CALL_NOP,

	/// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
	/// MTCTR instruction.
	MTCTR,

	/// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
	/// BCTRL instruction.
	BCTRL,

	/// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
	/// instruction and the TOC reload required on SVR4 PPC64.
	BCTRL_LOAD_TOC,

	/// Return with a flag operand, matched by 'blr'
	RET_FLAG,

	/// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
	/// This copies the bits corresponding to the specified CRREG into the
	/// resultant GPR. Bits corresponding to other CR regs are undefined.
	MFOCRF,

	/// Direct move from a VSX register to a GPR
	MFVSR,

	/// Direct move from a GPR to a VSX register (algebraic)
	MTVSRA,

	/// Direct move from a GPR to a VSX register (zero)
	MTVSRZ,

	/// Extract a subvector from signed integer vector and convert to FP.
	/// It is primarily used to convert a (widened) illegal integer vector
	/// type to a legal floating point vector type.
	/// For example v2i32 -> widened to v4i32 -> v2f64
	SINT_VEC_TO_FP,

	/// Extract a subvector from unsigned integer vector and convert to FP.
	/// As with SINT_VEC_TO_FP, used for converting illegal types.
	UINT_VEC_TO_FP,

	// FIXME: Remove these once the ANDI glue bug is fixed:
	/// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
	/// eq or gt bit of CR0 after executing andi. x, 1. This is used to
	/// implement truncation of i32 or i64 to i1.
	ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,

	// READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
	// target (returns (Lo, Hi)). It takes a chain operand.
	READ_TIME_BASE,

	// EH_SJLJ_SETJMP - SjLj exception handling setjmp.
	EH_SJLJ_SETJMP,

	// EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
	EH_SJLJ_LONGJMP,

	/// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
	/// instructions. For lack of better number, we use the opcode number
	/// encoding for the OPC field to identify the compare. For example, 838
	/// is VCMPGTSH.
	VCMP,

	/// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
	/// altivec VCMP*o instructions. For lack of better number, we use the
	/// opcode number encoding for the OPC field to identify the compare. For
	/// example, 838 is VCMPGTSH.
	VCMPo,

	/// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
	/// corresponds to the COND_BRANCH pseudo instruction. CRRC is the
	/// condition register to branch on, OPC is the branch opcode to use (e.g.
	/// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
	/// an optional input flag argument.
	COND_BRANCH,

	/// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
	/// loops.
	BDNZ, BDZ,

	/// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
	/// towards zero. Used only as part of the long double-to-int
	/// conversion sequence.
	FADDRTZ,

	/// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
	MFFS,

	/// TC_RETURN - A tail call return.
	/// operand #0 chain
	/// operand #1 callee (register or absolute)
	/// operand #2 stack adjustment
	/// operand #3 optional in flag
	TC_RETURN,

	/// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
	CR6SET,
	CR6UNSET,

	/// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
	/// on PPC32.
	PPC32_GOT,

	/// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
	/// local dynamic TLS on PPC32.
	PPC32_PICGOT,

	/// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
	/// TLS model, produces an ADDIS8 instruction that adds the GOT
	/// base to sym@got@tprel@ha.
	ADDIS_GOT_TPREL_HA,

	/// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
	/// TLS model, produces a LD instruction with base register G8RReg
	/// and offset sym@got@tprel@l. This completes the addition that
	/// finds the offset of "sym" relative to the thread pointer.
	LD_GOT_TPREL_L,

	/// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
	/// model, produces an ADD instruction that adds the contents of
	/// G8RReg to the thread pointer. Symbol contains a relocation
	/// sym@tls which is to be replaced by the thread pointer and
	/// identifies to the linker that the instruction is part of a
	/// TLS sequence.
	ADD_TLS,

	/// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
	/// model, produces an ADDIS8 instruction that adds the GOT base
	/// register to sym@got@tlsgd@ha.
	ADDIS_TLSGD_HA,

	/// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
	/// model, produces an ADDI8 instruction that adds G8RReg to
	/// sym@got@tlsgd@l and stores the result in X3. Hidden by
	/// ADDI_TLSGD_L_ADDR until after register assignment.
	ADDI_TLSGD_L,

	/// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS
	/// model, produces a call to __tls_get_addr(sym@tlsgd). Hidden by
	/// ADDI_TLSGD_L_ADDR until after register assignment.
	GET_TLS_ADDR,

	/// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
	/// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
	/// register assignment.
	ADDI_TLSGD_L_ADDR,

	/// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
	/// model, produces an ADDIS8 instruction that adds the GOT base
	/// register to sym@got@tlsld@ha.
	ADDIS_TLSLD_HA,

	/// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
	/// model, produces an ADDI8 instruction that adds G8RReg to
	/// sym@got@tlsld@l and stores the result in X3. Hidden by
	/// ADDI_TLSLD_L_ADDR until after register assignment.
	ADDI_TLSLD_L,

	/// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS
	/// model, produces a call to __tls_get_addr(sym@tlsld). Hidden by
	/// ADDI_TLSLD_L_ADDR until after register assignment.
	GET_TLSLD_ADDR,

	/// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
	/// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
	/// following register assignment.
	ADDI_TLSLD_L_ADDR,

	/// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS
	/// model, produces an ADDIS8 instruction that adds X3 to
	/// sym@dtprel@ha.
	ADDIS_DTPREL_HA,

	/// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
	/// model, produces an ADDI8 instruction that adds G8RReg to
	/// sym@got@dtprel@l.
	ADDI_DTPREL_L,

	/// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
	/// during instruction selection to optimize a BUILD_VECTOR into
	/// operations on splats. This is necessary to avoid losing these
	/// optimizations due to constant folding.
	VADD_SPLAT,

	/// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned
	/// operand identifies the operating system entry point.
	SC,

	/// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
	CLRBHRB,

	/// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
	/// history rolling buffer entry.
	MFBHRBE,

	/// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
	RFEBB,

	/// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
	/// endian. Maps to an xxswapd instruction that corrects an lxvd2x
	/// or stxvd2x instruction. The chain is necessary because the
	/// sequence replaces a load and needs to provide the same number
	/// of outputs.
	XXSWAPD,

	/// An SDNode for swaps that are not associated with any loads/stores
	/// and thereby have no chain.
	SWAP_NO_CHAIN,

	/// QVFPERM = This corresponds to the QPX qvfperm instruction.
	QVFPERM,

	/// QVGPCI = This corresponds to the QPX qvgpci instruction.
	QVGPCI,

	/// QVALIGNI = This corresponds to the QPX qvaligni instruction.
	QVALIGNI,

	/// QVESPLATI = This corresponds to the QPX qvesplati instruction.
	QVESPLATI,

	/// QBFLT = Access the underlying QPX floating-point boolean
	/// representation.
	QBFLT,

	/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
	/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
	/// the GPRC input, then stores it through Ptr. Type can be either i16 or
	/// i32.
	STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,

	/// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
	/// byte-swapping load instruction. It loads "Type" bits, byte swaps it,
	/// then puts it in the bottom bits of the GPRC. TYPE can be either i16
	/// or i32.
	LBRX,

	/// STFIWX - The STFIWX instruction. The first operand is an input token
	/// chain, then an f64 value to store, then an address to store it to.
	STFIWX,

	/// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
	/// load which sign-extends from a 32-bit integer value into the
	/// destination 64-bit register.
	LFIWAX,

	/// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
	/// load which zero-extends from a 32-bit integer value into the
	/// destination 64-bit register.
	LFIWZX,

	/// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an
	/// integer smaller than 64 bits into a VSR. The integer is zero-extended.
	/// This can be used for converting loaded integers to floating point.
	LXSIZX,

	/// STXSIX - The STXSI[bh]X instruction. The first operand is an input
	/// chain, then an f64 value to store, then an address to store it to,
	/// followed by a byte-width for the store.
	STXSIX,

	/// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
	/// Maps directly to an lxvd2x instruction that will be followed by
	/// an xxswapd.
	LXVD2X,

	/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
	/// Maps directly to an stxvd2x instruction that will be preceded by
	/// an xxswapd.
	STXVD2X,

	/// QBRC, CHAIN = QVLFSb CHAIN, Ptr
	/// The 4xf32 load used for v4i1 constants.
	QVLFSb,

	+ /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
	+ /// except they ensure that the compare input is zero-extended for
	+ /// sub-word versions because the atomic loads zero-extend.
	+ ATOMIC_CMP_SWAP_8, ATOMIC_CMP_SWAP_16,
	+
	/// GPRC = TOC_ENTRY GA, TOC
	/// Loads the entry for GA from the TOC, where the TOC base is given by
	/// the last operand.
	TOC_ENTRY
	};

	} // end namespace PPCISD

	/// Define some predicates that are used for node matching.
	namespace PPC {

	/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
	/// VPKUHUM instruction.
	bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
	SelectionDAG &DAG);

	/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
	/// VPKUWUM instruction.
	bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
	SelectionDAG &DAG);

	/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
	/// VPKUDUM instruction.
	bool isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
	SelectionDAG &DAG);

	/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
	/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
	bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
	unsigned ShuffleKind, SelectionDAG &DAG);

	/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
	/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
	bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
	unsigned ShuffleKind, SelectionDAG &DAG);

	/// isVMRGEOShuffleMask - Return true if this is a shuffle mask suitable for
	/// a VMRGEW or VMRGOW instruction
	bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
	unsigned ShuffleKind, SelectionDAG &DAG);
	/// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable
	/// for a XXSLDWI instruction.
	bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
	bool &Swap, bool IsLE);

	/// isXXBRHShuffleMask - Return true if this is a shuffle mask suitable
	/// for a XXBRH instruction.
	bool isXXBRHShuffleMask(ShuffleVectorSDNode *N);

	/// isXXBRWShuffleMask - Return true if this is a shuffle mask suitable
	/// for a XXBRW instruction.
	bool isXXBRWShuffleMask(ShuffleVectorSDNode *N);

	/// isXXBRDShuffleMask - Return true if this is a shuffle mask suitable
	/// for a XXBRD instruction.
	bool isXXBRDShuffleMask(ShuffleVectorSDNode *N);

	/// isXXBRQShuffleMask - Return true if this is a shuffle mask suitable
	/// for a XXBRQ instruction.
	bool isXXBRQShuffleMask(ShuffleVectorSDNode *N);

	/// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable
	/// for a XXPERMDI instruction.
	bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
	bool &Swap, bool IsLE);

	/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
	/// shift amount, otherwise return -1.
	int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
	SelectionDAG &DAG);

	/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
	/// specifies a splat of a single element that is suitable for input to
	/// VSPLTB/VSPLTH/VSPLTW.
	bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);

	/// isXXINSERTWMask - Return true if this VECTOR_SHUFFLE can be handled by
	/// the XXINSERTW instruction introduced in ISA 3.0. This is essentially any
	/// shuffle of v4f32/v4i32 vectors that just inserts one element from one
	/// vector into the other. This function will also set a couple of
	/// output parameters for how much the source vector needs to be shifted and
	/// what byte number needs to be specified for the instruction to put the
	/// element in the desired location of the target vector.
	bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
	unsigned &InsertAtByte, bool &Swap, bool IsLE);

	/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
	/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
	unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);

	/// get_VSPLTI_elt - If this is a build_vector of constants which can be
	/// formed by using a vspltis[bhw] instruction of the specified element
	/// size, return the constant being splatted. The ByteSize field indicates
	/// the number of bytes of each element [124] -> [bhw].
	SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);

	/// If this is a qvaligni shuffle mask, return the shift
	/// amount, otherwise return -1.
	int isQVALIGNIShuffleMask(SDNode *N);

	} // end namespace PPC

	class PPCTargetLowering : public TargetLowering {
	const PPCSubtarget &Subtarget;

	public:
	explicit PPCTargetLowering(const PPCTargetMachine &TM,
	const PPCSubtarget &STI);

	/// getTargetNodeName() - This method returns the name of a target specific
	/// DAG node.
	const char *getTargetNodeName(unsigned Opcode) const override;

	/// getPreferredVectorAction - Prefer widening over integer promotion for
	/// vectors whose element size is a whole number of bytes. Promotion
	/// scalarizes the vector, promotes each element and then rebuilds the
	/// vector; e.g. loading a pair of v4i8's and shuffling them becomes 8
	/// extending loads, moves back into VSR's (or memory ops if moves are
	/// unavailable) and finally the VPERM for the shuffle, which is a very slow
	/// sequence. All other vector types use the default action.
	TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
	    const override {
	  const bool WholeByteElts = (VT.getScalarSizeInBits() % 8) == 0;
	  return WholeByteElts ? TypeWidenVector
	                       : TargetLoweringBase::getPreferredVectorAction(VT);
	}

	bool useSoftFloat() const override;

	/// Scalar shift amounts are always i32; both arguments are ignored.
	MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; }

	/// Always reports that speculating cttz is cheap.
	bool isCheapToSpeculateCttz() const override { return true; }

	/// Always reports that speculating ctlz is cheap.
	bool isCheapToSpeculateCtlz() const override { return true; }

	/// Always reports ctlz as fast.
	bool isCtlzFast() const override { return true; }

	/// Always answers true; the SDValue argument is ignored.
	bool hasAndNotCompare(SDValue) const override { return true; }

	/// Answers true exactly when VT is a scalar integer type.
	bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
	  const bool IsScalarInt = VT.isScalarInteger();
	  return IsScalarInt;
	}

	/// Split CSR is supported only for functions that use the CXX_FAST_TLS
	/// calling convention and carry the NoUnwind attribute.
	bool supportSplitCSR(MachineFunction *MF) const override {
	  if (MF->getFunction().getCallingConv() != CallingConv::CXX_FAST_TLS)
	    return false;
	  return MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
	}

	void initializeSplitCSR(MachineBasicBlock *Entry) const override;

	void insertCopiesSplitCSR(
	MachineBasicBlock *Entry,
	const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

	/// getSetCCResultType - Return the ISD::SETCC ValueType
	EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
	EVT VT) const override;

	/// Return true if the target always benefits from combining into FMA for a
	/// given value type. This must typically return false on targets where FMA
	/// takes more cycles to execute than FADD.
	bool enableAggressiveFMAFusion(EVT VT) const override;

	/// getPreIndexedAddressParts - returns true by value, base pointer and
	/// offset pointer and addressing mode by reference if the node's address
	/// can be legally represented as pre-indexed load / store address.
	bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	SDValue &Offset,
	ISD::MemIndexedMode &AM,
	SelectionDAG &DAG) const override;

	/// SelectAddressRegReg - Given the specified address, check to see if it
	/// can be represented as an indexed [r+r] operation. Returns false if it
	/// can be more efficiently represented with [r+imm].
	bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
	SelectionDAG &DAG) const;

	/// SelectAddressRegImm - Returns true if the address N can be represented
	/// by a base register plus a signed 16-bit displacement [r+imm], and if it
	/// is not better represented as reg+reg. If Aligned is true, only accept
	/// displacements suitable for STD and friends, i.e. multiples of 4.
	bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
	SelectionDAG &DAG, unsigned Alignment) const;

	/// SelectAddressRegRegOnly - Given the specified address, force it to be
	/// represented as an indexed [r+r] operation.
	bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
	SelectionDAG &DAG) const;

	Sched::Preference getSchedulingPreference(SDNode *N) const override;

	/// LowerOperation - Provide custom lowering hooks for some operations.
	///
	SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

	/// ReplaceNodeResults - Replace the results of node with an illegal result
	/// type with new values built out of custom code.
	///
	void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
	SelectionDAG &DAG) const override;

	SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;

	SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

	/// Override of the TargetLowering hook for signed division of N by the
	/// power-of-two Divisor. Created is passed by pointer and holds SDNode
	/// pointers.
	SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
	                      std::vector<SDNode *> *Created) const override;

	unsigned getRegisterByName(const char* RegName, EVT VT,
	SelectionDAG &DAG) const override;

	void computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const override;

	unsigned getPrefLoopAlignment(MachineLoop *ML) const override;

	/// Always answers true, whatever the instruction.
	bool shouldInsertFencesForAtomic(const Instruction *I) const override { return true; }

	/// Override of the TargetLowering hook that emits the fence placed before
	/// the atomic instruction Inst for ordering Ord. Both the instruction
	/// argument and the result are passed as pointers.
	Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
	                              AtomicOrdering Ord) const override;
	/// Override of the TargetLowering hook that emits the fence placed after
	/// the atomic instruction Inst for ordering Ord. Both the instruction
	/// argument and the result are passed as pointers.
	Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
	                               AtomicOrdering Ord) const override;

	MachineBasicBlock *
	EmitInstrWithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const override;
	MachineBasicBlock *EmitAtomicBinary(MachineInstr &MI,
	MachineBasicBlock *MBB,
	unsigned AtomicSize,
	unsigned BinOpcode,
	unsigned CmpOpcode = 0,
	unsigned CmpPred = 0) const;
	MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr &MI,
	MachineBasicBlock *MBB,
	bool is8bit,
	unsigned Opcode,
	unsigned CmpOpcode = 0,
	unsigned CmpPred = 0) const;

	MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	/// Classify an inline-asm constraint string for this target.
	ConstraintType getConstraintType(StringRef Constraint) const override;

	/// Examine constraint string and operand type and determine a weight value.
	/// The operand object must already have been set up with the operand type.
	ConstraintWeight getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const override;

	/// Resolve a constraint string and value type to a pair of
	/// (register number, register class pointer).
	std::pair<unsigned, const TargetRegisterClass *>
	getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint, MVT VT) const override;

	/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
	/// function arguments in the caller parameter area. This is the actual
	/// alignment, not its logarithm.
	unsigned getByValTypeAlignment(Type *Ty,
	const DataLayout &DL) const override;

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops, so callers can
	/// detect failure by Ops being unchanged.
	void LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const override;

	/// Translate an inline-asm memory constraint string into its
	/// InlineAsm::Constraint_* code. The spellings "es", "o", "Q", "Z" and "Zy"
	/// are matched here; any other string is forwarded to
	/// TargetLowering::getInlineAsmMemConstraint.
	unsigned
	getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
	// Every match returns immediately, so no else arms are needed.
	if (ConstraintCode == "es")
	return InlineAsm::Constraint_es;
	if (ConstraintCode == "o")
	return InlineAsm::Constraint_o;
	if (ConstraintCode == "Q")
	return InlineAsm::Constraint_Q;
	if (ConstraintCode == "Z")
	return InlineAsm::Constraint_Z;
	if (ConstraintCode == "Zy")
	return InlineAsm::Constraint_Zy;

	// Not one of ours: defer to the generic handling.
	return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
	}

	/// isLegalAddressingMode - Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	/// The instruction argument I is optional and defaults to nullptr.
	bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
	Type *Ty, unsigned AS,
	Instruction *I = nullptr) const override;

	/// isLegalICmpImmediate - Return true if the specified immediate is a legal
	/// icmp immediate, that is, the target has icmp instructions which can
	/// compare a register against the immediate without having to materialize
	/// the immediate into a register.
	bool isLegalICmpImmediate(int64_t Imm) const override;

	/// isLegalAddImmediate - Return true if the specified immediate is a legal
	/// add immediate, that is, the target has add instructions which can
	/// add a register and the immediate without having to materialize
	/// the immediate into a register.
	bool isLegalAddImmediate(int64_t Imm) const override;

	/// isTruncateFree - Return true if it's free to truncate a value of
	/// type Ty1 to type Ty2. e.g. On PPC it's free to truncate a i64 value in
	/// register X1 to i32 by referencing its sub-register R1.
	bool isTruncateFree(Type Ty1, Type Ty2) const override;
	bool isTruncateFree(EVT VT1, EVT VT2) const override;

	/// Override: whether zero-extending \p Val to \p VT2 is free.
	bool isZExtFree(SDValue Val, EVT VT2) const override;

	/// Override: whether a floating-point extension from \p SrcVT to \p DestVT
	/// is free. Note the destination type comes first in the parameter list.
	bool isFPExtFree(EVT DestVT, EVT SrcVT) const override;

	/// \brief Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const override;

	/// Answers true for every value type; \p VT is not inspected.
	bool convertSelectOfConstantsToMath(EVT VT) const override {
	const bool AlwaysConvert = true;
	return AlwaysConvert;
	}

	/// Override: whether an offset may be folded into the given global address
	/// node.
	bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;

	/// Override describing a memory-touching target intrinsic. Info is taken by
	/// non-const reference. NOTE(review): presumably Info is only meaningful
	/// when the result is true -- confirm against the definition.
	bool getTgtMemIntrinsic(IntrinsicInfo &Info,
	const CallInst &I,
	MachineFunction &MF,
	unsigned Intrinsic) const override;

	/// getOptimalMemOpType - Returns the target specific optimal type for load
	/// and store operations as a result of memset, memcpy, and memmove
	/// lowering. If DstAlign is zero, the destination alignment can satisfy
	/// any constraint. Similarly, if SrcAlign is zero it means there isn't a
	/// need to check it against the alignment requirement, probably because
	/// the source does not need to be loaded. If 'IsMemset' is true, that
	/// means it's expanding a memset. If 'ZeroMemset' is true, that means it's
	/// a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy source is
	/// constant so it does not need to be loaded.
	/// It returns EVT::Other if the type should be determined using generic
	/// target-independent logic.
	EVT
	getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
	bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
	MachineFunction &MF) const override;

	/// Is unaligned memory access allowed for the given type, and is it fast
	/// relative to software emulation.
	/// Align defaults to 1. Fast is an optional out-pointer and defaults to
	/// nullptr.
	bool allowsMisalignedMemoryAccesses(EVT VT,
	unsigned AddrSpace,
	unsigned Align = 1,
	bool *Fast = nullptr) const override;

	/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
	/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
	/// expanded to FMAs when this method returns true, otherwise fmuladd is
	/// expanded to fmul + fadd.
	bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;

	/// Override returning a pointer to MCPhysReg entries for calling
	/// convention \p CC. NOTE(review): how the list is terminated is not
	/// visible here -- confirm against the definition.
	const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;

	// Should we expand the build vector with shuffles?
	bool
	shouldExpandBuildVectorWithShuffles(EVT VT,
	unsigned DefinedValues) const override;

	/// createFastISel - This method returns a target-specific FastISel object,
	/// or null if the target does not support "fast" instruction selection.
	FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
	const TargetLibraryInfo *LibInfo) const override;

	/// \brief Returns true if an argument of type Ty needs to be passed in a
	/// contiguous block of registers in calling convention CallConv.
	///
	/// The decision depends only on \p Ty: CallConv and isVarArg are not read.
	bool functionArgumentNeedsConsecutiveRegisters(
	Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
	// Any array type is treated as a "consecutive" block in the parameter
	// save area. Its element type determines the alignment requirement and
	// whether the argument goes in GPRs, FPRs, or VRs if available.
	//
	// clang relies on this both to implement the ELFv2 homogeneous
	// float/vector aggregate ABI, and to avoid "byval" when passing
	// aggregates that might fully fit in registers.
	const bool IsArrayArg = Ty->isArrayTy();
	return IsArrayArg;
	}

	/// If a physical register, this returns the register that receives the
	/// exception address on entry to an EH pad.
	unsigned
	getExceptionPointerRegister(const Constant *PersonalityFn) const override;

	/// If a physical register, this returns the register that receives the
	/// exception typeid on entry to a landing pad.
	unsigned
	getExceptionSelectorRegister(const Constant *PersonalityFn) const override;

	/// Override to support customized stack guard loading.
	bool useLoadStackGuardNode() const override;
	/// Companion override that receives the module by reference so that the
	/// stack-protector declarations can be added to it.
	void insertSSPDeclarations(Module &M) const override;

	/// Override: whether the floating-point immediate \p Imm is legal for
	/// type \p VT.
	bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;

	// Jump-table overrides: encoding selection, the relative/absolute
	// question, and the DAG-level relocation base for PIC jump tables.
	unsigned getJumpTableEncoding() const override;
	bool isJumpTableRelative() const override;
	SDValue getPICJumpTableRelocBase(SDValue Table,
	SelectionDAG &DAG) const override;
	/// MC-level counterpart of getPICJumpTableRelocBase: returns a pointer to
	/// the MCExpr for jump table \p JTI of function \p MF.
	/// NOTE(review): presumably the expression is created in \p Ctx --
	/// confirm against the definition.
	const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
	unsigned JTI,
	MCContext &Ctx) const override;

	private:
	/// Bundle of values describing a load. It is handed by reference to the
	/// helper declarations in this class that take a ReuseLoadInfo&.
	struct ReuseLoadInfo {
	SDValue Ptr;
	SDValue Chain;
	SDValue ResChain;
	MachinePointerInfo MPI;
	// The two flags below feed MMOFlags(); both start out cleared.
	bool IsDereferenceable = false;
	bool IsInvariant = false;
	unsigned Alignment = 0;
	AAMDNodes AAInfo;
	const MDNode *Ranges = nullptr;

	ReuseLoadInfo() = default;

	/// Fold the boolean members into MachineMemOperand flags: start from
	/// MONone, add MODereferenceable when IsDereferenceable is set and
	/// MOInvariant when IsInvariant is set.
	MachineMemOperand::Flags MMOFlags() const {
	MachineMemOperand::Flags F = MachineMemOperand::MONone;
	if (IsDereferenceable)
	F |= MachineMemOperand::MODereferenceable;
	if (IsInvariant)
	F |= MachineMemOperand::MOInvariant;
	return F;
	}
	};

	// Load-reuse helpers. RLI is taken by non-const reference (presumably as an
	// output -- confirm against the definition); ET defaults to
	// ISD::NON_EXTLOAD.
	bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
	SelectionDAG &DAG,
	ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
	void spliceIntoChain(SDValue ResChain, SDValue NewResChain,
	SelectionDAG &DAG) const;

	// FP-to-integer lowering variants: one works through a ReuseLoadInfo, the
	// other returns an SDValue directly.
	void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
	SelectionDAG &DAG, const SDLoc &dl) const;
	SDValue LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG,
	const SDLoc &dl) const;

	// Profitability query and integer-to-FP lowering for the direct-move path.
	bool directMoveIsProfitable(const SDValue &Op) const;
	SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
	const SDLoc &dl) const;

	// Accessors returning frame-index values for the frame pointer and the
	// return address (going by the names; bodies are out of line).
	SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
	SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;

	// Tail-call eligibility check using the callee, its calling convention,
	// the vararg flag and the incoming-argument list.
	bool
	IsEligibleForTailCallOptimization(SDValue Callee,
	CallingConv::ID CalleeCC,
	bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	SelectionDAG& DAG) const;

	// Variant for the 64-bit SVR4 case (per the suffix); it additionally
	// receives the call site and the outgoing-argument list.
	bool
	IsEligibleForTailCallOptimization_64SVR4(
	SDValue Callee,
	CallingConv::ID CalleeCC,
	ImmutableCallSite CS,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	SelectionDAG& DAG) const;

	// Returns an SDValue and additionally reports two values through the
	// LROpOut and FPOpOut reference parameters.
	SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG, int SPDiff,
	SDValue Chain, SDValue &LROpOut,
	SDValue &FPOpOut,
	const SDLoc &dl) const;

	// Per-operation custom lowering helpers. Each takes the operation and the
	// DAG and returns an SDValue; LowerFP_TO_INT also takes the debug location.
	// NOTE(review): presumably each is named after the ISD opcode it lowers and
	// is reached from a central lowering dispatcher -- confirm against the
	// implementation file.
	SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
	const SDLoc &dl) const;
	SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
	+ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;

	// Vector-specific load/store lowering helpers.
	SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;

	// Call-result handling: InVals is taken by non-const reference.
	// NOTE(review): presumably it receives the values the call produces --
	// confirm against the definition.
	SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
	CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals) const;
	// Shared tail of call lowering. RegsToPass pairs an unsigned register with
	// an SDValue; Callee is taken by non-const reference.
	SDValue FinishCall(CallingConv::ID CallConv, const SDLoc &dl,
	bool isTailCall, bool isVarArg, bool isPatchPoint,
	bool hasNest, SelectionDAG &DAG,
	SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
	SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
	SDValue &Callee, int SPDiff, unsigned NumBytes,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	SmallVectorImpl<SDValue> &InVals,
	ImmutableCallSite CS) const;

	// Override: lower the incoming formal arguments of a function.
	SDValue
	LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals) const override;

	// Override: lower an outgoing call described by CLI.
	SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const override;

	// Override: whether the given return values can be lowered.
	bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	LLVMContext &Context) const override;

	// Override: lower a function return with the given outgoing values.
	SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &dl, SelectionDAG &DAG) const override;

	// Helper that extends an argument value for PPC64 (per the name), driven
	// by the argument flags and the object's value type.
	SDValue extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
	SelectionDAG &DAG, SDValue ArgVal,
	const SDLoc &dl) const;

	// ABI-specific formal-argument lowering. The three variants (Darwin, 64-bit
	// SVR4, 32-bit SVR4) share one parameter list.
	SDValue LowerFormalArguments_Darwin(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
	SDValue LowerFormalArguments_64SVR4(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
	SDValue LowerFormalArguments_32SVR4(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;

	// Helper that builds a memcpy relative to CallSeqStart.
	// NOTE(review): presumably the copy is placed before the call sequence
	// begins, per the name -- confirm against the definition.
	SDValue createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
	SDValue CallSeqStart,
	ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
	const SDLoc &dl) const;

	// ABI-specific call lowering. The three variants (Darwin, 64-bit SVR4,
	// 32-bit SVR4) share one parameter list.
	SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee,
	CallingConv::ID CallConv, bool isVarArg,
	bool isTailCall, bool isPatchPoint,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals,
	ImmutableCallSite CS) const;
	SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
	CallingConv::ID CallConv, bool isVarArg,
	bool isTailCall, bool isPatchPoint,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals,
	ImmutableCallSite CS) const;
	SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
	CallingConv::ID CallConv, bool isVarArg,
	bool isTailCall, bool isPatchPoint,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals,
	ImmutableCallSite CS) const;

	// DAG-level lowering for the SjLj setjmp/longjmp operations.
	SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;

	// Individual DAG-combine helpers. Each takes the node and the combiner
	// state and returns an SDValue.
	SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;

	/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
	/// SETCC with integer subtraction when (1) there is a legal way of doing it
	/// (2) keeping the result of comparison in GPR has performance benefit.
	SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const;

	// Estimate overrides. RefinementSteps and UseOneConstNR are taken by
	// non-const reference.
	SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
	int &RefinementSteps, bool &UseOneConstNR,
	bool Reciprocal) const override;
	SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
	int &RefinementSteps) const override;
	unsigned combineRepeatedFPDivisors() const override;

	// Returns a calling-convention assignment function for FastISel, selected
	// by Flag.
	CCAssignFn *useFastISelCCs(unsigned Flag) const;

	// Combine helper; going by the name, it turns element-wise truncations
	// into a single vector truncation.
	SDValue
	combineElementTruncationToVectorTruncation(SDNode *N,
	DAGCombinerInfo &DCI) const;

	/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be
	/// handled by the VINSERTH instruction introduced in ISA 3.0. This is
	/// essentially any shuffle of v8i16 vectors that just inserts one element
	/// from one vector into the other.
	SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;

	/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be
	/// handled by the VINSERTB instruction introduced in ISA 3.0. This is
	/// essentially the v16i8 vector version of VINSERTH.
	SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;

	// Return whether the call instruction can potentially be optimized to a
	// tail call. This will cause the optimizers to attempt to move or
	// duplicate return instructions to help enable tail call optimizations.
	bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
	}; // end class PPCTargetLowering

	namespace PPC {

	// Free-function factory returning a FastISel pointer. It has the same
	// parameter list as the PPCTargetLowering::createFastISel member.
	FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
	const TargetLibraryInfo *LibInfo);

	} // end namespace PPC

	// Custom calling-convention handlers for 32-bit SVR4. All four share one
	// parameter list, with every argument passed by non-const reference.
	// NOTE(review): the meaning of the bool result is not visible here --
	// confirm against the definitions.
	bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
	CCValAssign::LocInfo &LocInfo,
	ISD::ArgFlagsTy &ArgFlags,
	CCState &State);

	bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
	MVT &LocVT,
	CCValAssign::LocInfo &LocInfo,
	ISD::ArgFlagsTy &ArgFlags,
	CCState &State);

	bool
	CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
	MVT &LocVT,
	CCValAssign::LocInfo &LocInfo,
	ISD::ArgFlagsTy &ArgFlags,
	CCState &State);

	bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
	MVT &LocVT,
	CCValAssign::LocInfo &LocInfo,
	ISD::ArgFlagsTy &ArgFlags,
	CCState &State);

	// Test for a signed 16-bit immediate, with overloads for a node pointer
	// and for an SDValue. Imm is taken by non-const reference.
	// NOTE(review): presumably Imm is only written on success -- confirm.
	bool isIntS16Immediate(SDNode *N, int16_t &Imm);
	bool isIntS16Immediate(SDValue Op, int16_t &Imm);

	} // end namespace llvm

	#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
	Index: vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCInstrInfo.td
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCInstrInfo.td (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/PowerPC/PPCInstrInfo.td (revision 328362)
	@@ -1,4734 +1,4746 @@
	//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------- tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the subset of the 32-bit PowerPC instruction set, as used
	// by the PowerPC instruction selector.
	//
	//===----------------------------------------------------------------------===//

	include "PPCInstrFormats.td"

	//===----------------------------------------------------------------------===//
	// PowerPC specific type constraints.
	//
	// Each SDTypeProfile<R, N, [...]> describes a node with R results and N
	// operands; constraint indices count results first, then operands.
	def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
	SDTCisVT<0, f64>, SDTCisPtrTy<1>
	]>;
	def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
	SDTCisVT<0, f64>, SDTCisPtrTy<1>
	]>;
	// f64 value plus two pointer-typed operands (load form).
	def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
	SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
	]>;
	// Store counterpart: no result, three operands.
	def SDT_PPCstxsix : SDTypeProfile<0, 3, [
	SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
	]>;
	def SDT_PPCVexts : SDTypeProfile<1, 2, [
	SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
	]>;
	// Vector in, vector out.
	def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [
	SDTCisVec<0>, SDTCisVec<1>
	]>;

	// Call-sequence markers: both operands are constrained to i32.
	def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
	SDTCisVT<1, i32> ]>;
	def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
	SDTCisVT<1, i32> ]>;
	// vperm: the third operand is v16i8; the result and the first two operands
	// share one type.
	def SDT_PPCvperm : SDTypeProfile<1, 3, [
	SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
	]>;

	def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
	SDTCisVec<1>, SDTCisInt<2>
	]>;

	def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
	SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
	]>;

	def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
	SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
	]>;

	def SDT_PPCVecReverse: SDTypeProfile<1, 1, [ SDTCisVec<0>,
	SDTCisVec<1>
	]>;

	def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>,
	SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
	]>;

	// vcmp: the result and both compared operands share a type; the last
	// operand is an i32.
	def SDT_PPCvcmp : SDTypeProfile<1, 3, [
	SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
	]>;

	// Conditional branch: only operands 0 (i32) and 2 (OtherVT) are
	// constrained here.
	def SDT_PPCcondbr : SDTypeProfile<0, 3, [
	SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
	]>;

	def SDT_PPClbrx : SDTypeProfile<1, 2, [
	SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
	]>;
	def SDT_PPCstbrx : SDTypeProfile<0, 3, [
	SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
	]>;

	def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
	SDTCisPtrTy<0>, SDTCisVT<1, i32>
	]>;

	// Pointer-typed operand whose machine-level form is a single i32 immediate.
	def tocentry32 : Operand<iPTR> {
	let MIOperandInfo = (ops i32imm:$imm);
	}

	// Type profiles for the "qv"/"qb" nodes. All results are vectors.
	def SDT_PPCqvfperm : SDTypeProfile<1, 3, [
	SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVec<3>
	]>;
	def SDT_PPCqvgpci : SDTypeProfile<1, 1, [
	SDTCisVec<0>, SDTCisInt<1>
	]>;
	def SDT_PPCqvaligni : SDTypeProfile<1, 3, [
	SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>
	]>;
	def SDT_PPCqvesplati : SDTypeProfile<1, 2, [
	SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>
	]>;

	def SDT_PPCqbflt : SDTypeProfile<1, 1, [
	SDTCisVec<0>, SDTCisVec<1>
	]>;

	// Vector result loaded through a pointer-typed operand.
	def SDT_PPCqvlfsb : SDTypeProfile<1, 1, [
	SDTCisVec<0>, SDTCisPtrTy<1>
	]>;

	//===----------------------------------------------------------------------===//
	// PowerPC specific DAG Nodes.
	//
	// Each def binds a TableGen name to a PPCISD opcode, a type profile and a
	// list of node properties (empty when none apply).

	def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>;
	def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;

	// FP conversion nodes. The *s/*us variants use the rounding profile.
	def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>;
	def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>;
	def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>;
	def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>;
	def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
	def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
	def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
	def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
	// Chained memory nodes. Only the lfiw[az]x loads carry SDNPMemOperand.
	def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
	[SDNPHasChain, SDNPMayStore]>;
	def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
	[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
	def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
	[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
	def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
	[SDNPHasChain, SDNPMayLoad]>;
	def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
	[SDNPHasChain, SDNPMayStore]>;
	def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
	def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>;

	// Extract FPSCR (not modeled at the DAG level).
	def PPCmffs : SDNode<"PPCISD::MFFS",
	SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;

	// Perform FADD in round-to-zero mode.
	def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;


	def PPCfsel : SDNode<"PPCISD::FSEL",
	// Type constraint for fsel: the result and operands 2 and 3 share one FP
	// type; operand 1 is f64.
	SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
	SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;

	// Hi/Lo and TOC-entry nodes use the integer binary-op profile. Only
	// TOC_ENTRY is marked as a load with a memory operand.
	def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
	def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
	def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp,
	[SDNPMayLoad, SDNPMemOperand]>;
	def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
	def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;

	def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;

	// TLS-related nodes. Most use the integer binary-op profile; the two
	// *_L_ADDR nodes take three operands that all share the integer result
	// type.
	def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
	def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
	[SDNPMayLoad]>;
	def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
	def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
	def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
	def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
	def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR",
	SDTypeProfile<1, 3, [
	SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
	SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
	def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
	def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
	def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
	def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
	SDTypeProfile<1, 3, [
	SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
	SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
	def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
	def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;

	// Vector permute/splat/insert/reverse/shift nodes. None carries node
	// properties.
	def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
	def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
	def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
	def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>;
	def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
	def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;

	// "qv"/"qb" nodes. Only QVLFSb is chained and marked as a load.
	def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
	def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
	def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
	def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;

	def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;

	def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
	[SDNPHasChain, SDNPMayLoad]>;

	def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;

	// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
	// amounts. These nodes are generated by the multi-precision shift code.
	def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
	def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
	def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;

	// These are target-independent nodes, but have target-specific formats.
	def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
	[SDNPHasChain, SDNPOutGlue]>;
	def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	// Call nodes. The operand count is variable (-1), and the first operand is
	// constrained to an integer type.
	def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
	def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
	def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC",
	SDTypeProfile<0, 1, []>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;

	// Return and tail-call-return nodes: chained, with optional incoming glue
	// and a variadic operand list.
	def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

	def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

	// SjLj nodes: setjmp yields an integer from a pointer operand; longjmp has
	// no result. Both are chained and marked as having side effects.
	def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP",
	SDTypeProfile<1, 1, [SDTCisInt<0>,
	SDTCisPtrTy<1>]>,
	[SDNPHasChain, SDNPSideEffect]>;
	def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
	SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
	[SDNPHasChain, SDNPSideEffect]>;

	// SC takes a single integer operand; RFEBB reuses the same profile.
	def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
	def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc,
	[SDNPHasChain, SDNPSideEffect]>;

	def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone,
	[SDNPHasChain, SDNPSideEffect]>;
	def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>;
	def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc,
	[SDNPHasChain, SDNPSideEffect]>;

	// Vector compares. The VCMPo variant additionally produces glue.
	def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
	def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;

	def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
	[SDNPHasChain, SDNPOptInGlue]>;

	+// PPC-specific atomic operations.
	+def PPCatomicCmpSwap_8 :
	+ SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3,
	+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
	+def PPCatomicCmpSwap_16 :
	+ SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3,
	+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
	// Chained LBRX load and STBRX store nodes. Only the load carries
	// SDNPMemOperand.
	def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
	[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
	def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
	[SDNPHasChain, SDNPMayStore]>;

	// Instructions to set/unset CR bit 6 for SVR4 vararg calls
	def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
	def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	// Instructions to support dynamic alloca.
	// Both profiles leave their operand types unconstrained: SDTDynOp has one
	// result and two operands, SDTDynAreaOp one result and one operand.
	def SDTDynOp : SDTypeProfile<1, 2, []>;
	def SDTDynAreaOp : SDTypeProfile<1, 1, []>;
	def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
	def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;

	//===----------------------------------------------------------------------===//
	// PowerPC specific transformation functions and pattern fragments.
	//

	def SHL32 : SDNodeXForm<imm, [{
	// Maps the immediate to (31 - imm), handed to getI32Imm together with the
	// node's location.
	const uint64_t ShAmt = N->getZExtValue();
	return getI32Imm(31 - ShAmt, SDLoc(N));
	}]>;

	def SRL32 : SDNodeXForm<imm, [{
	// Maps the immediate to (32 - imm), except that an immediate of 0 stays 0.
	const uint64_t ShAmt = N->getZExtValue();
	if (ShAmt == 0)
	return getI32Imm(0, SDLoc(N));
	return getI32Imm(32 - ShAmt, SDLoc(N));
	}]>;

	def LO16 : SDNodeXForm<imm, [{
	// Keeps only the low 16 bits of the immediate.
	const unsigned short Low = (unsigned short)N->getZExtValue();
	return getI32Imm(Low, SDLoc(N));
	}]>;

	def HI16 : SDNodeXForm<imm, [{
	// Truncates the immediate to 32 bits, then moves its upper half down into
	// the low 16 bits.
	const unsigned Val32 = (unsigned)N->getZExtValue();
	return getI32Imm(Val32 >> 16, SDLoc(N));
	}]>;

	def HA16 : SDNodeXForm<imm, [{
	// Transformation function: "high adjusted" upper half. Subtract the
	// sign-extended low 16 bits from the value first, then shift the
	// difference down by 16. This compensates the upper half for a low half
	// that will be interpreted as a signed 16-bit quantity.
	int Val = N->getZExtValue();
	return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
	}]>;
	def MB : SDNodeXForm<imm, [{
	// Yields the start bit of a mask. MaskBegin is pre-set to 0 and the result
	// of isRunOfOnes is deliberately ignored; the end index is computed but
	// unused.
	unsigned MaskBegin = 0;
	unsigned MaskEnd;
	const unsigned Mask = (unsigned)N->getZExtValue();
	(void)isRunOfOnes(Mask, MaskBegin, MaskEnd);
	return getI32Imm(MaskBegin, SDLoc(N));
	}]>;

	def ME : SDNodeXForm<imm, [{
	// Yields the end bit of a mask. MaskEnd is pre-set to 0 and the result of
	// isRunOfOnes is deliberately ignored; the start index is computed but
	// unused.
	unsigned MaskBegin;
	unsigned MaskEnd = 0;
	const unsigned Mask = (unsigned)N->getZExtValue();
	(void)isRunOfOnes(Mask, MaskBegin, MaskEnd);
	return getI32Imm(MaskEnd, SDLoc(N));
	}]>;
	def maskimm32 : PatLeaf<(imm), [{
	// maskImm predicate - True if the immediate is an i32 whose value is a run
	// of ones. Any other value type is rejected up front.
	if (N->getValueType(0) != MVT::i32)
	return false;
	unsigned RunBegin, RunEnd;
	return isRunOfOnes((unsigned)N->getZExtValue(), RunBegin, RunEnd);
	}]>;

	def imm32SExt16 : Operand<i32>, ImmLeaf<i32, [{
	// imm32SExt16 predicate - True if the i32 immediate survives a round trip
	// through a 16-bit signed field. Used by instructions like 'addi'.
	const int32_t AsI32 = (int32_t)Imm;
	const short AsI16 = (short)Imm;
	return AsI32 == AsI16;
	}]>;
	def imm64SExt16 : Operand<i64>, ImmLeaf<i64, [{
	// imm64SExt16 predicate - True if the i64 immediate survives a round trip
	// through a 16-bit signed field. Used by instructions like 'addi'.
	const int64_t AsI64 = (int64_t)Imm;
	const short AsI16 = (short)Imm;
	return AsI64 == AsI16;
	}]>;
	def immZExt16 : PatLeaf<(imm), [{
	// immZExt16 predicate - True if the immediate is unchanged by truncation to
	// an unsigned 16-bit field. Used by instructions like 'ori'.
	const uint64_t Val = N->getZExtValue();
	return Val == (unsigned short)Val;
	}], LO16>;
	// immAnyExt8 predicate - True if the i32 immediate fits in 8 bits under
	// either the signed or the unsigned interpretation.
	def immAnyExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm) || isUInt<8>(Imm); }]>;
	// immSExt5NonZero predicate - True if the i32 immediate is non-zero and fits
	// in a signed 5-bit field.
	def immSExt5NonZero : ImmLeaf<i32, [{
	return Imm != 0 && isInt<5>(Imm);
	}]>;

	// imm16Shifted* - These match immediates where the low 16-bits are zero. There
	// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are
	// identical in 32-bit mode, but in 64-bit mode, they return true if the
	// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
	// clear).
	def imm16ShiftedZExt : PatLeaf<(imm), [{
	// imm16ShiftedZExt predicate - True if no bit outside bits 16..31 of the
	// immediate is set. Used by instructions like 'xoris'.
	const uint64_t OutsideTopHalf = ~uint64_t(0xFFFF0000);
	const uint64_t Val = N->getZExtValue();
	return (Val & OutsideTopHalf) == 0;
	}], HI16>;

	def imm16ShiftedSExt : PatLeaf<(imm), [{
	// imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
	// immediate are set. Used by instructions like 'addis'. Identical to
	// imm16ShiftedZExt in 32-bit mode.
	// Any bit set in the low half disqualifies the immediate outright.
	if (N->getZExtValue() & 0xFFFF) return false;
	if (N->getValueType(0) == MVT::i32)
	return true;
	// For 64-bit, make sure it is sext right: truncating to int and widening
	// back to 64 bits must reproduce the original value.
	return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
	}], HI16>;

	def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{
	// imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit
	// zero extended field, i.e. isUInt<32>(Imm) holds.
	return isUInt<32>(Imm);
	}]>;

	// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
	// restricted memrix (4-aligned) constants are alignment sensitive. If these
	// offsets are hidden behind TOC entries then the values of the lower-order
	// bits cannot be checked directly. As a result, we need to also incorporate
	// an alignment check into the relevant patterns.

	// aligned4load - A plain load whose LoadSDNode reports alignment >= 4.
	def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
	return cast<LoadSDNode>(N)->getAlignment() >= 4;
	}]>;
	// aligned4store - A plain store whose StoreSDNode reports alignment >= 4.
	def aligned4store : PatFrag<(ops node:$val, node:$ptr),
	(store node:$val, node:$ptr), [{
	return cast<StoreSDNode>(N)->getAlignment() >= 4;
	}]>;
	// aligned4sextloadi32 - A sextloadi32 whose LoadSDNode reports
	// alignment >= 4.
	def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
	return cast<LoadSDNode>(N)->getAlignment() >= 4;
	}]>;
	// aligned4pre_store - A pre_store (value, base, offset) whose StoreSDNode
	// reports alignment >= 4.
	def aligned4pre_store : PatFrag<
	(ops node:$val, node:$base, node:$offset),
	(pre_store node:$val, node:$base, node:$offset), [{
	return cast<StoreSDNode>(N)->getAlignment() >= 4;
	}]>;

	// unaligned4load - Complement of aligned4load: a plain load whose
	// LoadSDNode reports alignment < 4.
	def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
	return cast<LoadSDNode>(N)->getAlignment() < 4;
	}]>;
	// unaligned4store - A plain store whose StoreSDNode reports alignment < 4.
	def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
	(store node:$val, node:$ptr), [{
	return cast<StoreSDNode>(N)->getAlignment() < 4;
	}]>;
	// unaligned4sextloadi32 - A sextloadi32 whose LoadSDNode reports
	// alignment < 4.
	def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
	return cast<LoadSDNode>(N)->getAlignment() < 4;
	}]>;

	// This is a somewhat weaker condition than actually checking for 16-byte
	// alignment. It is simply checking that the displacement can be represented
	// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form
	// instructions).
	// quadwOffsetLoad - A plain load for which isOffsetMultipleOf(N, 16) holds.
	def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
	return isOffsetMultipleOf(N, 16);
	}]>;
	// quadwOffsetStore - A plain store for which isOffsetMultipleOf(N, 16)
	// holds.
	def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
	(store node:$val, node:$ptr), [{
	return isOffsetMultipleOf(N, 16);
	}]>;
	// nonQuadwOffsetLoad - Complement of quadwOffsetLoad.
	def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
	return !isOffsetMultipleOf(N, 16);
	}]>;
	// nonQuadwOffsetStore - Complement of quadwOffsetStore.
	def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
	(store node:$val, node:$ptr), [{
	return !isOffsetMultipleOf(N, 16);
	}]>;

	//===----------------------------------------------------------------------===//
	// PowerPC Flag Definitions.

	// Mix-in classes that set a single field on the record inheriting them.
	// isPPC64 sets the PPC64 bit.
	class isPPC64 { bit PPC64 = 1; }
	// isDOT sets the RC bit; the multiclasses in this file apply it to their
	// '.'-suffixed variants.
	class isDOT { bit RC = 1; }

	// RegConstraint<C> sets the Constraints string to C.
	class RegConstraint<string C> {
	string Constraints = C;
	}
	// NoEncode<E> sets the DisableEncoding string to E.
	class NoEncode<string E> {
	string DisableEncoding = E;
	}


	//===----------------------------------------------------------------------===//
	// PowerPC Operand Definitions.

	// In the default PowerPC assembler syntax, registers are specified simply
	// by number, so they cannot be distinguished from immediate values (without
	// looking at the opcode). This means that the default operand matching logic
	// for the asm parser does not work, and we need to specify custom matchers.
	// Since those can only be specified with RegisterOperand classes and not
	// directly on the RegisterClass, all instructions patterns used by the asm
	// parser need to use a RegisterOperand (instead of a RegisterClass) for
	// all their register operands.
	// For this purpose, we define one RegisterOperand for each RegisterClass,
	// using the same name as the class, just in lower case.

	// Each pair below is an AsmOperandClass naming the parser predicate plus the
	// RegisterOperand that wraps one register class and uses that match class.

	// GPRC, matched with isRegNumber.
	def PPCRegGPRCAsmOperand : AsmOperandClass {
	let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
	}
	def gprc : RegisterOperand<GPRC> {
	let ParserMatchClass = PPCRegGPRCAsmOperand;
	}
	// G8RC, matched with isRegNumber.
	def PPCRegG8RCAsmOperand : AsmOperandClass {
	let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
	}
	def g8rc : RegisterOperand<G8RC> {
	let ParserMatchClass = PPCRegG8RCAsmOperand;
	}
	// GPRC_NOR0, matched with isRegNumber.
	def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
	let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
	}
	def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
	let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
	}
	// G8RC_NOX0, matched with isRegNumber.
	def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
	let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
	}
	def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
	let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
	}
	// F8RC, matched with isRegNumber.
	def PPCRegF8RCAsmOperand : AsmOperandClass {
	let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
	}
	def f8rc : RegisterOperand<F8RC> {
	let ParserMatchClass = PPCRegF8RCAsmOperand;
	}
	// F4RC, matched with isRegNumber.
	def PPCRegF4RCAsmOperand : AsmOperandClass {
	let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
	}
	def f4rc : RegisterOperand<F4RC> {
	let ParserMatchClass = PPCRegF4RCAsmOperand;
	}
	// VRRC, matched with isRegNumber.
	def PPCRegVRRCAsmOperand : AsmOperandClass {
	let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
	}
	def vrrc : RegisterOperand<VRRC> {
	let ParserMatchClass = PPCRegVRRCAsmOperand;
	}
	// VFRC, matched with isRegNumber.
	def PPCRegVFRCAsmOperand : AsmOperandClass {
	let Name = "RegVFRC"; let PredicateMethod = "isRegNumber";
	}
	def vfrc : RegisterOperand<VFRC> {
	let ParserMatchClass = PPCRegVFRCAsmOperand;
	}
	// CRBITRC, matched with isCRBitNumber rather than isRegNumber.
	def PPCRegCRBITRCAsmOperand : AsmOperandClass {
	let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
	}
	def crbitrc : RegisterOperand<CRBITRC> {
	let ParserMatchClass = PPCRegCRBITRCAsmOperand;
	}
	// CRRC and CRRC0 share one match class, which uses isCCRegNumber.
	def PPCRegCRRCAsmOperand : AsmOperandClass {
	let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
	}
	def crrc : RegisterOperand<CRRC> {
	let ParserMatchClass = PPCRegCRRCAsmOperand;
	}
	def crrc0 : RegisterOperand<CRRC0> {
	let ParserMatchClass = PPCRegCRRCAsmOperand;
	}

	// Immediate operands. Each AsmOperandClass names the parser predicate and
	// render method; the matching Operand<i32> names the printer and refers back
	// to the class through ParserMatchClass.

	// u1imm: parsed with isU1Imm, printed with printU1ImmOperand.
	def PPCU1ImmAsmOperand : AsmOperandClass {
	let Name = "U1Imm"; let PredicateMethod = "isU1Imm";
	let RenderMethod = "addImmOperands";
	}
	def u1imm : Operand<i32> {
	let PrintMethod = "printU1ImmOperand";
	let ParserMatchClass = PPCU1ImmAsmOperand;
	}

	// u2imm: parsed with isU2Imm, printed with printU2ImmOperand.
	def PPCU2ImmAsmOperand : AsmOperandClass {
	let Name = "U2Imm"; let PredicateMethod = "isU2Imm";
	let RenderMethod = "addImmOperands";
	}
	def u2imm : Operand<i32> {
	let PrintMethod = "printU2ImmOperand";
	let ParserMatchClass = PPCU2ImmAsmOperand;
	}

	// atimm: parsed with isATBitsAsHint, printed with printATBitsAsHint. The
	// render method is a placeholder (see the inline note).
	def PPCATBitsAsHintAsmOperand : AsmOperandClass {
	let Name = "ATBitsAsHint"; let PredicateMethod = "isATBitsAsHint";
	let RenderMethod = "addImmOperands"; // Irrelevant, predicate always fails.
	}
	def atimm : Operand<i32> {
	let PrintMethod = "printATBitsAsHint";
	let ParserMatchClass = PPCATBitsAsHintAsmOperand;
	}

	// u3imm: parsed with isU3Imm, printed with printU3ImmOperand.
	def PPCU3ImmAsmOperand : AsmOperandClass {
	let Name = "U3Imm"; let PredicateMethod = "isU3Imm";
	let RenderMethod = "addImmOperands";
	}
	def u3imm : Operand<i32> {
	let PrintMethod = "printU3ImmOperand";
	let ParserMatchClass = PPCU3ImmAsmOperand;
	}

	// u4imm: parsed with isU4Imm, printed with printU4ImmOperand.
	def PPCU4ImmAsmOperand : AsmOperandClass {
	let Name = "U4Imm"; let PredicateMethod = "isU4Imm";
	let RenderMethod = "addImmOperands";
	}
	def u4imm : Operand<i32> {
	let PrintMethod = "printU4ImmOperand";
	let ParserMatchClass = PPCU4ImmAsmOperand;
	}
	// The operands below follow the same AsmOperandClass/Operand pairing and
	// additionally name a DecoderMethod (decodeSImmOperand<N> or
	// decodeUImmOperand<N>) on the Operand.

	// s5imm: parsed with isS5Imm, decoded with decodeSImmOperand<5>.
	def PPCS5ImmAsmOperand : AsmOperandClass {
	let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
	let RenderMethod = "addImmOperands";
	}
	def s5imm : Operand<i32> {
	let PrintMethod = "printS5ImmOperand";
	let ParserMatchClass = PPCS5ImmAsmOperand;
	let DecoderMethod = "decodeSImmOperand<5>";
	}
	// u5imm: parsed with isU5Imm, decoded with decodeUImmOperand<5>.
	def PPCU5ImmAsmOperand : AsmOperandClass {
	let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
	let RenderMethod = "addImmOperands";
	}
	def u5imm : Operand<i32> {
	let PrintMethod = "printU5ImmOperand";
	let ParserMatchClass = PPCU5ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<5>";
	}
	// u6imm: parsed with isU6Imm, decoded with decodeUImmOperand<6>.
	def PPCU6ImmAsmOperand : AsmOperandClass {
	let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
	let RenderMethod = "addImmOperands";
	}
	def u6imm : Operand<i32> {
	let PrintMethod = "printU6ImmOperand";
	let ParserMatchClass = PPCU6ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<6>";
	}
	// u7imm: parsed with isU7Imm, decoded with decodeUImmOperand<7>.
	def PPCU7ImmAsmOperand : AsmOperandClass {
	let Name = "U7Imm"; let PredicateMethod = "isU7Imm";
	let RenderMethod = "addImmOperands";
	}
	def u7imm : Operand<i32> {
	let PrintMethod = "printU7ImmOperand";
	let ParserMatchClass = PPCU7ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<7>";
	}
	// u8imm: parsed with isU8Imm, decoded with decodeUImmOperand<8>.
	def PPCU8ImmAsmOperand : AsmOperandClass {
	let Name = "U8Imm"; let PredicateMethod = "isU8Imm";
	let RenderMethod = "addImmOperands";
	}
	def u8imm : Operand<i32> {
	let PrintMethod = "printU8ImmOperand";
	let ParserMatchClass = PPCU8ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<8>";
	}
	// u10imm: parsed with isU10Imm, decoded with decodeUImmOperand<10>.
	def PPCU10ImmAsmOperand : AsmOperandClass {
	let Name = "U10Imm"; let PredicateMethod = "isU10Imm";
	let RenderMethod = "addImmOperands";
	}
	def u10imm : Operand<i32> {
	let PrintMethod = "printU10ImmOperand";
	let ParserMatchClass = PPCU10ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<10>";
	}
	// u12imm: parsed with isU12Imm, decoded with decodeUImmOperand<12>.
	def PPCU12ImmAsmOperand : AsmOperandClass {
	let Name = "U12Imm"; let PredicateMethod = "isU12Imm";
	let RenderMethod = "addImmOperands";
	}
	def u12imm : Operand<i32> {
	let PrintMethod = "printU12ImmOperand";
	let ParserMatchClass = PPCU12ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<12>";
	}
	// 16-bit immediate operands. Unlike the narrower immediates these use the
	// dedicated render methods addS16ImmOperands / addU16ImmOperands and encode
	// through getImm16Encoding.

	// s16imm: parsed with isS16Imm, decoded with decodeSImmOperand<16>.
	def PPCS16ImmAsmOperand : AsmOperandClass {
	let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
	let RenderMethod = "addS16ImmOperands";
	}
	def s16imm : Operand<i32> {
	let PrintMethod = "printS16ImmOperand";
	let EncoderMethod = "getImm16Encoding";
	let ParserMatchClass = PPCS16ImmAsmOperand;
	let DecoderMethod = "decodeSImmOperand<16>";
	}
	// u16imm: parsed with isU16Imm, decoded with decodeUImmOperand<16>.
	def PPCU16ImmAsmOperand : AsmOperandClass {
	let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
	let RenderMethod = "addU16ImmOperands";
	}
	def u16imm : Operand<i32> {
	let PrintMethod = "printU16ImmOperand";
	let EncoderMethod = "getImm16Encoding";
	let ParserMatchClass = PPCU16ImmAsmOperand;
	let DecoderMethod = "decodeUImmOperand<16>";
	}
	// s17imm: parsed with isS17Imm but otherwise configured exactly like
	// s16imm (same render, print, encoder and decoder methods).
	def PPCS17ImmAsmOperand : AsmOperandClass {
	let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
	let RenderMethod = "addS16ImmOperands";
	}
	def s17imm : Operand<i32> {
	// This operand type is used for addis/lis to allow the assembler parser
	// to accept immediates in the range -65536..65535 for compatibility with
	// the GNU assembler. The operand is treated as 16-bit otherwise.
	let PrintMethod = "printS16ImmOperand";
	let EncoderMethod = "getImm16Encoding";
	let ParserMatchClass = PPCS17ImmAsmOperand;
	let DecoderMethod = "decodeSImmOperand<16>";
	}

	// fpimm0 - Floating-point immediate for which isExactlyValue(+0.0) holds.
	def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;

	// Branch-target operands. Two match classes (DirectBr, CondBr) are shared by
	// several operands that differ only in their print and encoder methods.

	// Match class for direct branch targets: isDirectBr /
	// addBranchTargetOperands.
	def PPCDirectBrAsmOperand : AsmOperandClass {
	let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
	let RenderMethod = "addBranchTargetOperands";
	}
	// Direct branch target, printed with printBranchOperand.
	def directbrtarget : Operand<OtherVT> {
	let PrintMethod = "printBranchOperand";
	let EncoderMethod = "getDirectBrEncoding";
	let ParserMatchClass = PPCDirectBrAsmOperand;
	}
	// Direct branch target using the "Abs" print and encoder methods.
	def absdirectbrtarget : Operand<OtherVT> {
	let PrintMethod = "printAbsBranchOperand";
	let EncoderMethod = "getAbsDirectBrEncoding";
	let ParserMatchClass = PPCDirectBrAsmOperand;
	}
	// Match class for conditional branch targets: isCondBr /
	// addBranchTargetOperands.
	def PPCCondBrAsmOperand : AsmOperandClass {
	let Name = "CondBr"; let PredicateMethod = "isCondBr";
	let RenderMethod = "addBranchTargetOperands";
	}
	// Conditional branch target, printed with printBranchOperand.
	def condbrtarget : Operand<OtherVT> {
	let PrintMethod = "printBranchOperand";
	let EncoderMethod = "getCondBrEncoding";
	let ParserMatchClass = PPCCondBrAsmOperand;
	}
	// Conditional branch target using the "Abs" print and encoder methods.
	def abscondbrtarget : Operand<OtherVT> {
	let PrintMethod = "printAbsBranchOperand";
	let EncoderMethod = "getAbsCondBrEncoding";
	let ParserMatchClass = PPCCondBrAsmOperand;
	}
	// Call targets: iPTR-typed, otherwise configured like directbrtarget and
	// absdirectbrtarget respectively.
	def calltarget : Operand<iPTR> {
	let PrintMethod = "printBranchOperand";
	let EncoderMethod = "getDirectBrEncoding";
	let ParserMatchClass = PPCDirectBrAsmOperand;
	}
	def abscalltarget : Operand<iPTR> {
	let PrintMethod = "printAbsBranchOperand";
	let EncoderMethod = "getAbsDirectBrEncoding";
	let ParserMatchClass = PPCDirectBrAsmOperand;
	}
	// crbitm: i8 operand parsed with isCRBitMask, with its own print, encoder
	// and decoder methods.
	def PPCCRBitMaskOperand : AsmOperandClass {
	let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
	}
	def crbitm: Operand<i8> {
	let PrintMethod = "printcrbitm";
	let EncoderMethod = "get_crbitm_encoding";
	let DecoderMethod = "decodeCRBitMOperand";
	let ParserMatchClass = PPCCRBitMaskOperand;
	}
	// Address operands
	// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
	// Declared as PointerLikeRegClass<1> and parsed with isRegNumber.
	def PPCRegGxRCNoR0Operand : AsmOperandClass {
	let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber";
	}
	def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
	let ParserMatchClass = PPCRegGxRCNoR0Operand;
	}
	// A version of ptr_rc usable with the asm parser.
	// Declared as PointerLikeRegClass<0> and parsed with isRegNumber.
	def PPCRegGxRCOperand : AsmOperandClass {
	let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
	}
	def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
	let ParserMatchClass = PPCRegGxRCOperand;
	}

	// Displacement operands used as the $imm half of the memory operands. Each
	// is an iPTR Operand whose only customisation is its ParserMatchClass.

	// dispRI: parsed with isS16Imm, rendered with addS16ImmOperands.
	def PPCDispRIOperand : AsmOperandClass {
	let Name = "DispRI"; let PredicateMethod = "isS16Imm";
	let RenderMethod = "addS16ImmOperands";
	}
	def dispRI : Operand<iPTR> {
	let ParserMatchClass = PPCDispRIOperand;
	}
	// dispRIX: parsed with isS16ImmX4.
	def PPCDispRIXOperand : AsmOperandClass {
	let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
	let RenderMethod = "addImmOperands";
	}
	def dispRIX : Operand<iPTR> {
	let ParserMatchClass = PPCDispRIXOperand;
	}
	// dispRIX16: parsed with isS16ImmX16.
	def PPCDispRIX16Operand : AsmOperandClass {
	let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16";
	let RenderMethod = "addImmOperands";
	}
	def dispRIX16 : Operand<iPTR> {
	let ParserMatchClass = PPCDispRIX16Operand;
	}
	// dispSPE8: parsed with isU8ImmX8.
	def PPCDispSPE8Operand : AsmOperandClass {
	let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8";
	let RenderMethod = "addImmOperands";
	}
	def dispSPE8 : Operand<iPTR> {
	let ParserMatchClass = PPCDispSPE8Operand;
	}
	// dispSPE4: parsed with isU7ImmX4.
	def PPCDispSPE4Operand : AsmOperandClass {
	let Name = "DispSPE4"; let PredicateMethod = "isU7ImmX4";
	let RenderMethod = "addImmOperands";
	}
	def dispSPE4 : Operand<iPTR> {
	let ParserMatchClass = PPCDispSPE4Operand;
	}
	// dispSPE2: parsed with isU6ImmX2.
	def PPCDispSPE2Operand : AsmOperandClass {
	let Name = "DispSPE2"; let PredicateMethod = "isU6ImmX2";
	let RenderMethod = "addImmOperands";
	}
	def dispSPE2 : Operand<iPTR> {
	let ParserMatchClass = PPCDispSPE2Operand;
	}

	// Compound memory operands. MIOperandInfo lists the sub-operands that make
	// up each one; all use ptr_rc_nor0 for the base register.

	// memri: dispRI displacement plus base register.
	def memri : Operand<iPTR> {
	let PrintMethod = "printMemRegImm";
	let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
	let EncoderMethod = "getMemRIEncoding";
	let DecoderMethod = "decodeMemRIOperands";
	}
	// memrr: base register plus ptr_rc_idx offset register; no custom
	// encoder or decoder is named here.
	def memrr : Operand<iPTR> {
	let PrintMethod = "printMemRegReg";
	let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
	}
	def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
	let PrintMethod = "printMemRegImm";
	let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
	let EncoderMethod = "getMemRIXEncoding";
	let DecoderMethod = "decodeMemRIXOperands";
	}
	def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
	let PrintMethod = "printMemRegImm";
	let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
	let EncoderMethod = "getMemRIX16Encoding";
	let DecoderMethod = "decodeMemRIX16Operands";
	}
	// The three SPE displacement operands name an encoder but no decoder.
	def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
	let PrintMethod = "printMemRegImm";
	let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
	let EncoderMethod = "getSPE8DisEncoding";
	}
	def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
	let PrintMethod = "printMemRegImm";
	let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
	let EncoderMethod = "getSPE4DisEncoding";
	}
	def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
	let PrintMethod = "printMemRegImm";
	let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
	let EncoderMethod = "getSPE2DisEncoding";
	}

	// A single-register address. This is used with the SjLj
	// pseudo-instructions which translate to LD/LWZ. These instructions require
	// G8RC_NOX0 registers.
	// memr: memory operand made of a single ptr_rc_nor0 register.
	def memr : Operand<iPTR> {
	let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
	}
	// Match class for TLS register operands: isTLSReg / addTLSRegOperands.
	def PPCTLSRegOperand : AsmOperandClass {
	let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
	let RenderMethod = "addTLSRegOperands";
	}
	// tlsreg32: i32 operand encoded with getTLSRegEncoding.
	def tlsreg32 : Operand<i32> {
	let EncoderMethod = "getTLSRegEncoding";
	let ParserMatchClass = PPCTLSRegOperand;
	}
	// tlsgd32: plain i32 operand with no overrides; used as the $sym half of
	// tlscall32 below.
	def tlsgd32 : Operand<i32> {}
	// tlscall32: compound operand of a calltarget ($func) and a tlsgd32 ($sym),
	// printed with printTLSCall and encoded with getTLSCallEncoding.
	def tlscall32 : Operand<i32> {
	let PrintMethod = "printTLSCall";
	let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
	let EncoderMethod = "getTLSCallEncoding";
	}

	// PowerPC Predicate operand.
	// A compound operand made of an i32 immediate ($bibo) and a crrc register
	// ($reg), printed with printPredicateOperand. The branch asm strings in
	// this file refer to it with modifiers such as ${cond:cc}, ${cond:pm} and
	// ${cond:reg}.
	def pred : Operand<OtherVT> {
	let PrintMethod = "printPredicateOperand";
	let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
	}

	// Define PowerPC specific addressing mode.
	// Each ComplexPattern names the C++ selection routine that matches the
	// address and the number of operands it produces (2 here).
	def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
	def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
	def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
	def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
	def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"

	// The address in a single register. This is used with the SjLj
	// pseudo-instructions.
	// Produces one operand via SelectAddr.
	def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;

	/// This is just the offset part of iaddr, used for preinc.
	/// Produces one operand via SelectAddrImmOffs.
	def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;

	//===----------------------------------------------------------------------===//
	// PowerPC Instruction Predicate Definitions.
	// Each Predicate wraps a C++ expression, evaluated against the subtarget
	// (PPCSubTarget) or the target machine options (TM.Options).

	// 32-bit / 64-bit mode: complementary tests of isPPC64().
	def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">;
	def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">;
	// Book E: complementary tests of isBookE().
	def IsBookE : Predicate<"PPCSubTarget->isBookE()">;
	def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">;
	// msync-only vs. sync: complementary tests of hasOnlyMSYNC().
	def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">;
	def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
	def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">;
	def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">;
	def IsE500 : Predicate<"PPCSubTarget->isE500()">;
	// Uses the lowerCamelCase accessor, like every other subtarget query in
	// this list (the previous spelling was "HasSPE()").
	def HasSPE : Predicate<"PPCSubTarget->hasSPE()">;
	def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
	def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
	// NaN handling: complementary tests of TM.Options.NoNaNsFPMath.
	def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
	def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
	def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
	def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
	def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;

	//===----------------------------------------------------------------------===//
	// PowerPC Multiclass Definitions.

	// XForm_6r - Defines two XForm_6 records sharing BaseName = asmbase and
	// RecFormRel: NAME, with syntax "asmbase asmstr" and the given pattern, and
	// `o`, with syntax "asmbase. asmstr", no pattern, isDOT, and Defs = [CR0].
	multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XForm_6<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : XForm_6<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XForm_6rc - Like XForm_6r, but both records also define CARRY:
	// NAME has Defs = [CARRY], and the '.' variant `o` has Defs = [CARRY, CR0].
	multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	let Defs = [CARRY] in
	def NAME : XForm_6<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CARRY, CR0] in
	def o : XForm_6<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XForm_10rc - Defines two XForm_10 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern, Defs = [CARRY]) and
	// `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CARRY, CR0]).
	multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	let Defs = [CARRY] in
	def NAME : XForm_10<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CARRY, CR0] in
	def o : XForm_10<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XForm_11r - Defines two XForm_11 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern) and `o`
	// ("asmbase. asmstr", no pattern, isDOT, Defs = [CR0]).
	multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XForm_11<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : XForm_11<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XOForm_1r - Defines two XOForm_1 records (which also take the `oe` bit)
	// sharing BaseName = asmbase and RecFormRel: NAME ("asmbase asmstr", given
	// pattern) and `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CR0]).
	multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : XOForm_1<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// Multiclass for instructions for which the non record form is not cracked
	// and the record form is cracked (i.e. divw, mullw, etc.)
	// Same shape as XOForm_1r, except that the '.' variant `o` additionally
	// mixes in PPC970_DGroup_First and PPC970_DGroup_Cracked.
	multiclass XOForm_1rcr<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : XOForm_1<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel, PPC970_DGroup_First,
	PPC970_DGroup_Cracked;
	}
	}

	// XOForm_1rc - Like XOForm_1r, but both records also define CARRY:
	// NAME has Defs = [CARRY], and the '.' variant `o` has Defs = [CARRY, CR0].
	multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	let Defs = [CARRY] in
	def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CARRY, CR0] in
	def o : XOForm_1<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XOForm_3r - Defines two XOForm_3 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern) and `o`
	// ("asmbase. asmstr", no pattern, isDOT, Defs = [CR0]).
	multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : XOForm_3<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XOForm_3rc - Like XOForm_3r, but both records also define CARRY:
	// NAME has Defs = [CARRY], and the '.' variant `o` has Defs = [CARRY, CR0].
	multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	let Defs = [CARRY] in
	def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CARRY, CR0] in
	def o : XOForm_3<opcode, xo, oe, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// MForm_2r - Defines two MForm_2 records (no xo parameter) sharing
	// BaseName = asmbase and RecFormRel: NAME ("asmbase asmstr", given pattern)
	// and `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CR0]).
	multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : MForm_2<opcode, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : MForm_2<opcode, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// MDForm_1r - Defines two MDForm_1 records (3-bit xo) sharing
	// BaseName = asmbase and RecFormRel: NAME ("asmbase asmstr", given pattern)
	// and `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CR0]).
	multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : MDForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : MDForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// MDSForm_1r - Defines two MDSForm_1 records (4-bit xo) sharing
	// BaseName = asmbase and RecFormRel: NAME ("asmbase asmstr", given pattern)
	// and `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CR0]).
	multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : MDSForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : MDSForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XSForm_1rc - Defines two XSForm_1 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern, Defs = [CARRY]) and
	// `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CARRY, CR0]).
	multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	let Defs = [CARRY] in
	def NAME : XSForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CARRY, CR0] in
	def o : XSForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XSForm_1r - Like XSForm_1rc but without CARRY: NAME sets no Defs here,
	// and the '.' variant `o` has Defs = [CR0].
	multiclass XSForm_1r<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XSForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR0] in
	def o : XSForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XForm_26r - Defines two XForm_26 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern) and `o`
	// ("asmbase. asmstr", no pattern, isDOT). Note the '.' variant defines CR1,
	// not CR0.
	multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XForm_26<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR1] in
	def o : XForm_26<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// XForm_28r - Defines two XForm_28 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern) and `o`
	// ("asmbase. asmstr", no pattern, isDOT, Defs = [CR1]).
	multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : XForm_28<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR1] in
	def o : XForm_28<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// AForm_1r - Defines two AForm_1 records (5-bit xo) sharing
	// BaseName = asmbase and RecFormRel: NAME ("asmbase asmstr", given pattern)
	// and `o` ("asmbase. asmstr", no pattern, isDOT, Defs = [CR1]).
	multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : AForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR1] in
	def o : AForm_1<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// AForm_2r - Defines two AForm_2 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern) and `o`
	// ("asmbase. asmstr", no pattern, isDOT, Defs = [CR1]).
	multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : AForm_2<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR1] in
	def o : AForm_2<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	// AForm_3r - Defines two AForm_3 records sharing BaseName = asmbase and
	// RecFormRel: NAME ("asmbase asmstr", given pattern) and `o`
	// ("asmbase. asmstr", no pattern, isDOT, Defs = [CR1]).
	multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
	string asmbase, string asmstr, InstrItinClass itin,
	list<dag> pattern> {
	let BaseName = asmbase in {
	def NAME : AForm_3<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
	pattern>, RecFormRel;
	let Defs = [CR1] in
	def o : AForm_3<opcode, xo, OOL, IOL,
	!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
	[]>, isDOT, RecFormRel;
	}
	}

	//===----------------------------------------------------------------------===//
	// PowerPC Instruction Definitions.

	// Pseudo-instructions:

	// Pseudos that all carry hasCtrlDep = 1.
	let hasCtrlDep = 1 in {
	// Call-sequence markers: selected from callseq_start / callseq_end with two
	// target immediates, and listing R1 in both Defs and Uses.
	let Defs = [R1], Uses = [R1] in {
	def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
	"#ADJCALLSTACKDOWN $amt1 $amt2",
	[(callseq_start timm:$amt1, timm:$amt2)]>;
	def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
	"#ADJCALLSTACKUP $amt1 $amt2",
	[(callseq_end timm:$amt1, timm:$amt2)]>;
	}

	// UPDATE_VRSAVE: gprc in, gprc out, no selection pattern.
	def UPDATE_VRSAVE : Pseudo<(outs gprc:$rD), (ins gprc:$rS),
	"UPDATE_VRSAVE $rD, $rS", []>;
	}

	// DYNALLOC: selected from PPCdynalloc (negated size plus an iaddr frame
	// slot). The brace-less `let` applies to this def only, so only DYNALLOC
	// lists R1 in Defs and Uses.
	let Defs = [R1], Uses = [R1] in
	def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
	[(set i32:$result,
	(PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
	// DYNAREAOFFSET: selected from PPCdynareaoffset; carries no Defs/Uses
	// override here.
	def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
	[(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;

	// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
	// instruction selection into a branch sequence.
	// All pseudos in this group set usesCustomInserter and PPC970_Single.
	let usesCustomInserter = 1, // Expanded after instruction selection.
	PPC970_Single = 1 in {
	// Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
	// because either operand might become the first operand in an isel, and
	// that operand cannot be r0.
	// The SELECT_CC_* forms take a crrc condition register plus an i32
	// immediate ($BROPC) and have no selection pattern.
	def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond,
	gprc_nor0:$T, gprc_nor0:$F,
	i32imm:$BROPC), "#SELECT_CC_I4",
	[]>;
	def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond,
	g8rc_nox0:$T, g8rc_nox0:$F,
	i32imm:$BROPC), "#SELECT_CC_I8",
	[]>;
	def SELECT_CC_F4 : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
	i32imm:$BROPC), "#SELECT_CC_F4",
	[]>;
	def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
	i32imm:$BROPC), "#SELECT_CC_F8",
	[]>;
	def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
	i32imm:$BROPC), "#SELECT_CC_VRRC",
	[]>;

	// SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
	// register bit directly.
	// Each is selected from `select` on an i1 condition for one value type
	// (i32, i64, f32, f64, v4i32).
	def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
	gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
	[(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
	def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
	g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
	[(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
	def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
	f4rc:$T, f4rc:$F), "#SELECT_F4",
	[(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
	def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
	f8rc:$T, f8rc:$F), "#SELECT_F8",
	[(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
	def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
	vrrc:$T, vrrc:$F), "#SELECT_VRRC",
	[(set v4i32:$dst,
	(select i1:$cond, v4i32:$T, v4i32:$F))]>;
	}

	// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
	// scavenge a register for it.
	// Both spill pseudos are marked mayStore and have no selection pattern.
	// SPILL_CR takes a whole crrc register; SPILL_CRBIT takes a single crbitrc
	// bit. Each stores to a memri slot ($F).
	let mayStore = 1 in {
	def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F),
	"#SPILL_CR", []>;
	def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
	"#SPILL_CRBIT", []>;
	}

	// RESTORE_CR - Indicate that we're restoring the CR register (previously
	// spilled), so we'll need to scavenge a register for it.
	// Both restore pseudos are marked mayLoad and have no selection pattern.
	// RESTORE_CR produces a crrc register; RESTORE_CRBIT produces a crbitrc
	// bit. Each reads from a memri slot ($F).
	let mayLoad = 1 in {
	def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F),
	"#RESTORE_CR", []>;
	def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
	"#RESTORE_CRBIT", []>;
	}

	// Terminators that are also barriers; all get PPC970_Unit = 7.
	let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
	// blr: selected from retflag, restricted to 32-bit mode by
	// Requires<[In32BitMode]>. The brace-less `let` applies to BLR only.
	let isReturn = 1, Uses = [LR, RM] in
	def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
	[(retflag)]>, Requires<[In32BitMode]>;
	// Indirect branches that read CTR.
	let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
	def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
	[]>;

	// Conditional forms, marked isCodeGenOnly. BCCCTR takes a `pred` operand;
	// BCCTR / BCCTRn take a crbitrc bit and print "bcctr 12" / "bcctr 4".
	let isCodeGenOnly = 1 in {
	def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
	"b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
	[]>;

	def BCCTR : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
	"bcctr 12, $bi, 0", IIC_BrB, []>;
	def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
	"bcctr 4, $bi, 0", IIC_BrB, []>;
	}
	}
	}

	// Two operand-less pseudos that define LR, have no selection pattern, and
	// are tagged PPC970_Unit_BRU.
	let Defs = [LR] in
	def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
	PPC970_Unit_BRU;
	let Defs = [LR] in
	def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>,
	PPC970_Unit_BRU;

	let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
	// Unconditional branches, additionally marked isBarrier. B is selected from
	// `br`; BA takes an absdirectbrtarget operand and has no pattern.
	let isBarrier = 1 in {
	def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
	"b $dst", IIC_BrB,
	[(br bb:$dst)]>;
	def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
	"ba $dst", IIC_BrB, []>;
	}

	// BCC represents an arbitrary conditional branch on a predicate.
	// FIXME: should be able to write a pattern for PPCcondbranch, but can't use
	// a two-value operand where a dag node expects two operands. :(
	// Predicate-operand conditional branches; all are isCodeGenOnly.
	let isCodeGenOnly = 1 in {
	// Shared shape for BCC and CTRL_DEP. The pattern is kept only as a block
	// comment (see the FIXME about PPCcondbranch).
	class BCC_class : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
	"b${cond:cc}${cond:pm} ${cond:reg}, $dst"
	/*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
	def BCC : BCC_class;

	// The same as BCC, except that it's not a terminator. Used for introducing
	// control flow dependency without creating new blocks.
	let isTerminator = 0 in def CTRL_DEP : BCC_class;

	// Variant of BCC that takes an abscondbrtarget operand.
	def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
	"b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;

	// Conditional return through LR; no selection pattern.
	let isReturn = 1, Uses = [LR, RM] in
	def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
	"b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>;
	}

	let isCodeGenOnly = 1 in {
	let Pattern = [(brcond i1:$bi, bb:$dst)] in
	def BC : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
	"bc 12, $bi, $dst">;

	let Pattern = [(brcond (not i1:$bi), bb:$dst)] in
	def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
	"bc 4, $bi, $dst">;

	let isReturn = 1, Uses = [LR, RM] in
	def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
	"bclr 12, $bi, 0", IIC_BrB, []>;
	def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
	"bclr 4, $bi, 0", IIC_BrB, []>;
	}

	let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
	def BDZLR : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
	"bdzlr", IIC_BrB, []>;
	def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
	"bdnzlr", IIC_BrB, []>;
	def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
	"bdzlr+", IIC_BrB, []>;
	def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
	"bdnzlr+", IIC_BrB, []>;
	def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
	"bdzlr-", IIC_BrB, []>;
	def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
	"bdnzlr-", IIC_BrB, []>;
	}

	let Defs = [CTR], Uses = [CTR] in {
	def BDZ : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
	"bdz $dst">;
	def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
	"bdnz $dst">;
	def BDZA : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst),
	"bdza $dst">;
	def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst),
	"bdnza $dst">;
	def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst),
	"bdz+ $dst">;
	def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst),
	"bdnz+ $dst">;
	def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst),
	"bdza+ $dst">;
	def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst),
	"bdnza+ $dst">;
	def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst),
	"bdz- $dst">;
	def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst),
	"bdnz- $dst">;
	def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst),
	"bdza- $dst">;
	def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst),
	"bdnza- $dst">;
	}
	}

	// The unconditional BCL ("bcl 20, 31, $dst") used by the SjLj setjmp code.
	// Marked as a codegen-only call that defines LR and uses RM.
	let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7,
	Defs = [LR], Uses = [RM] in
	def BCLalways : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
	"bcl 20, 31, $dst">;

	let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
	// Convenient aliases for call instructions
	let Uses = [RM] in {
	def BL : IForm<18, 0, 1, (outs), (ins calltarget:$func),
	"bl $func", IIC_BrB, []>; // See Pat patterns below.
	def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
	"bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;

	let isCodeGenOnly = 1 in {
	def BL_TLS : IForm<18, 0, 1, (outs), (ins tlscall32:$func),
	"bl $func", IIC_BrB, []>;
	def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
	"b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
	def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
	"b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;

	def BCL : BForm_4<16, 12, 0, 1, (outs),
	(ins crbitrc:$bi, condbrtarget:$dst),
	"bcl 12, $bi, $dst">;
	def BCLn : BForm_4<16, 4, 0, 1, (outs),
	(ins crbitrc:$bi, condbrtarget:$dst),
	"bcl 4, $bi, $dst">;
	}
	}
	let Uses = [CTR, RM] in {
	def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
	"bctrl", IIC_BrB, [(PPCbctrl)]>,
	Requires<[In32BitMode]>;

	let isCodeGenOnly = 1 in {
	def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
	"b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
	[]>;

	def BCCTRL : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
	"bcctrl 12, $bi, 0", IIC_BrB, []>;
	def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
	"bcctrl 4, $bi, 0", IIC_BrB, []>;
	}
	}
	let Uses = [LR, RM] in {
	def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
	"blrl", IIC_BrB, []>;

	let isCodeGenOnly = 1 in {
	def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
	"b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB,
	[]>;

	def BCLRL : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi),
	"bclrl 12, $bi, 0", IIC_BrB, []>;
	def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi),
	"bclrl 4, $bi, 0", IIC_BrB, []>;
	}
	}
	let Defs = [CTR], Uses = [CTR, RM] in {
	def BDZL : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
	"bdzl $dst">;
	def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst),
	"bdnzl $dst">;
	def BDZLA : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst),
	"bdzla $dst">;
	def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst),
	"bdnzla $dst">;
	def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst),
	"bdzl+ $dst">;
	def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst),
	"bdnzl+ $dst">;
	def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst),
	"bdzla+ $dst">;
	def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst),
	"bdnzla+ $dst">;
	def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst),
	"bdzl- $dst">;
	def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst),
	"bdnzl- $dst">;
	def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst),
	"bdzla- $dst">;
	def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst),
	"bdnzla- $dst">;
	}
	let Defs = [CTR], Uses = [CTR, LR, RM] in {
	def BDZLRL : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
	"bdzlrl", IIC_BrB, []>;
	def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
	"bdnzlrl", IIC_BrB, []>;
	def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
	"bdzlrl+", IIC_BrB, []>;
	def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
	"bdnzlrl+", IIC_BrB, []>;
	def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
	"bdzlrl-", IIC_BrB, []>;
	def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
	"bdnzlrl-", IIC_BrB, []>;
	}
	}

	// TCRETURN* pseudos. All three share the same flags (call, terminator,
	// return, barrier) and use RM; they differ only in the kind of target
	// operand. Only the absolute-address form carries a selection pattern.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in {
	def TCRETURNdi : Pseudo<(outs), (ins calltarget:$dst, i32imm:$offset),
	"#TC_RETURNd $dst $offset", []>;

	def TCRETURNai : Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
	"#TC_RETURNa $func $offset",
	[(PPCtc_return (i32 imm:$func), imm:$offset)]>;

	def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
	"#TC_RETURNr $dst $offset", []>;
	}


	let isCodeGenOnly = 1 in {

	let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
	isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
	def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
	[]>, Requires<[In32BitMode]>;

	let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
	isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
	def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
	"b $dst", IIC_BrB,
	[]>;

	// Branch to an absolute call target, flagged as both a call and a return.
	// The second IForm argument is 1 here, matching the other absolute-address
	// forms in this file (BA, BLA) and the "ba" mnemonic printed below; with 0
	// the encoding would be identical to TAILB ("b").
	let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
	isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
	def TAILBA : IForm<18, 1, 0, (outs), (ins abscalltarget:$dst),
	"ba $dst", IIC_BrB,
	[]>;

	}

	// 32-bit SjLj exception-handling pseudos. Both are expanded through a custom
	// inserter (usesCustomInserter = 1), are treated as barriers with side
	// effects, and are only available when In32BitMode holds.
	let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
	// setjmp: yields an i32 result in $dst from the buffer at $buf; defines CTR.
	let Defs = [CTR] in
	def EH_SjLj_SetJmp32 : Pseudo<(outs gprc:$dst), (ins memr:$buf),
	"#EH_SJLJ_SETJMP32",
	[(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
	Requires<[In32BitMode]>;
	// longjmp: no result; additionally marked as a terminator.
	let isTerminator = 1 in
	def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
	"#EH_SJLJ_LONGJMP32",
	[(PPCeh_sjlj_longjmp addr:$buf)]>,
	Requires<[In32BitMode]>;
	}

	// This pseudo is never removed from the function, as it serves as
	// a terminator. Size is set to 0 to prevent the builtin assembler
	// from emitting it.
	let isBranch = 1, isTerminator = 1, Size = 0 in {
	def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
	"#EH_SjLj_Setup\t$dst", []>;
	}

	// System call: "sc $lev", selected from PPCsc with an i32 immediate level.
	let PPC970_Unit = 7 in
	def SC : SCForm<17, 1, (outs), (ins i32imm:$lev), "sc $lev", IIC_BrB,
	[(PPCsc (i32 imm:$lev))]>;

	// Branch history rolling buffer.
	def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB,
	[(PPCclrbhrb)]>,
	PPC970_DGroup_Single;
	// The $dmy argument used for MFBHRBE is not needed; however, including
	// it avoids automatic generation of PPCFastISel::fastEmit_i(), which
	// interferes with necessary special handling (see PPCFastISel.cpp).
	def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$rD),
	(ins u10imm:$imm, u10imm:$dmy),
	"mfbhrbe $rD, $imm", IIC_BrB,
	[(set i32:$rD,
	(PPCmfbhrbe imm:$imm, imm:$dmy))]>,
	PPC970_DGroup_First;

	def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$imm), "rfebb $imm",
	IIC_BrB, [(PPCrfebb (i32 imm:$imm))]>,
	PPC970_DGroup_Single;

	// DCB* instructions.
	def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
	IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
	PPC970_DGroup_Single;
	def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
	IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
	PPC970_DGroup_Single;
	def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
	IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
	PPC970_DGroup_Single;
	def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
	IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
	PPC970_DGroup_Single;
	def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
	IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
	PPC970_DGroup_Single;

	def DCBF : DCB_Form_hint<86, (outs), (ins u5imm:$TH, memrr:$dst),
	"dcbf $dst, $TH", IIC_LdStDCBF, []>,
	PPC970_DGroup_Single;

	let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in {
	def DCBT : DCB_Form_hint<278, (outs), (ins u5imm:$TH, memrr:$dst),
	"dcbt $dst, $TH", IIC_LdStDCBF, []>,
	PPC970_DGroup_Single;
	def DCBTST : DCB_Form_hint<246, (outs), (ins u5imm:$TH, memrr:$dst),
	"dcbtst $dst, $TH", IIC_LdStDCBF, []>,
	PPC970_DGroup_Single;
	} // hasSideEffects = 0

	// icb* instructions. Each takes a 4-bit immediate $CT and an r+r address
	// ($src), has no selection pattern of its own, and requires HasICBT.
	// ICBT is the one referenced by the prefetch pattern further down.
	def ICBLC : XForm_icbt<31, 230, (outs), (ins u4imm:$CT, memrr:$src),
	"icblc $CT, $src", IIC_LdStStore>, Requires<[HasICBT]>;
	def ICBLQ : XForm_icbt<31, 198, (outs), (ins u4imm:$CT, memrr:$src),
	"icblq. $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;
	def ICBT : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src),
	"icbt $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;
	def ICBTLS : XForm_icbt<31, 486, (outs), (ins u4imm:$CT, memrr:$src),
	"icbtls $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;

	// Map the dcbt / dcbtst / dcbf intrinsics onto the hinted DCB_Form_hint
	// instructions. The intrinsics carry no hint, so the $TH operand is fixed
	// to 0.
	def : Pat<(int_ppc_dcbt xoaddr:$dst),
	(DCBT 0, xoaddr:$dst)>;
	def : Pat<(int_ppc_dcbtst xoaddr:$dst),
	(DCBTST 0, xoaddr:$dst)>;
	def : Pat<(int_ppc_dcbf xoaddr:$dst),
	(DCBF 0, xoaddr:$dst)>;

	// Select the generic prefetch node. The second operand (0 or 1) chooses
	// between DCBT and DCBTST when the last operand is 1; a last operand of 0
	// selects ICBT, which additionally requires HasICBT. The third operand is
	// matched as any immediate and ignored; the first instruction operand is
	// always 0.
	def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
	(DCBT 0, xoaddr:$dst)>; // data prefetch for loads
	def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
	(DCBTST 0, xoaddr:$dst)>; // data prefetch for stores
	def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
	(ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)

	// Atomic operations
	// FIXME: some of these might be used with constant operands. This will result
	// in constant materialization instructions that may be redundant. We currently
	// clean this up in PPCMIPeephole with calls to
	// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them
	// in the first place.
	let usesCustomInserter = 1 in {
	let Defs = [CR0] in {
	def ATOMIC_LOAD_ADD_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
	[(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_SUB_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
	[(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_AND_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
	[(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_OR_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
	[(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
	// 8-bit atomic load-xor pseudo. The asm string carries the leading '#'
	// like every other ATOMIC_* pseudo in this block (presumably so the
	// placeholder reads as an assembler comment if it is ever printed).
	def ATOMIC_LOAD_XOR_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I8",
	[(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_NAND_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
	[(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_MIN_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
	[(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_MAX_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
	[(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_UMIN_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
	[(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_UMAX_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
	[(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_ADD_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
	[(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_SUB_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
	[(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_AND_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
	[(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_OR_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
	[(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_XOR_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
	[(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_NAND_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
	[(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_MIN_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
	[(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_MAX_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
	[(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_UMIN_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
	[(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_UMAX_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
	[(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_ADD_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
	[(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_SUB_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
	[(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_AND_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
	[(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_OR_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
	[(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_XOR_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
	[(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_NAND_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
	[(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_MIN_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
	[(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_MAX_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
	[(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_UMIN_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
	[(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
	def ATOMIC_LOAD_UMAX_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
	[(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;

	def ATOMIC_CMP_SWAP_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
	[(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
	def ATOMIC_CMP_SWAP_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
	[(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
	def ATOMIC_CMP_SWAP_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
	[(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;

	def ATOMIC_SWAP_I8 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
	[(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
	def ATOMIC_SWAP_I16 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
	[(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
	def ATOMIC_SWAP_I32 : Pseudo<
	(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
	[(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
	}
	}
	+
	+def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
	+ (ATOMIC_CMP_SWAP_I8 xoaddr:$ptr, i32:$old, i32:$new)>;
	+def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
	+ (ATOMIC_CMP_SWAP_I16 xoaddr:$ptr, i32:$old, i32:$new)>;

	// Instructions to support atomic operations
	let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
	def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
	"lbarx $rD, $src", IIC_LdStLWARX, []>,
	Requires<[HasPartwordAtomics]>;

	def LHARX : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
	"lharx $rD, $src", IIC_LdStLWARX, []>,
	Requires<[HasPartwordAtomics]>;

	def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
	"lwarx $rD, $src", IIC_LdStLWARX, []>;

	// Instructions to support lock versions of atomics
	// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
	def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
	"lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
	Requires<[HasPartwordAtomics]>;

	def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
	"lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
	Requires<[HasPartwordAtomics]>;

	def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
	"lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;

	// The atomic instructions use the destination register as well as the next one
	// or two registers in order (modulo 31).
	let hasExtraSrcRegAllocReq = 1 in
	def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
	"lwat $rD, $rA, $FC", IIC_LdStLoad>,
	Requires<[IsISA3_0]>;
	}

	let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
	def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
	"stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
	isDOT, Requires<[HasPartwordAtomics]>;

	def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
	"sthcx. $rS, $dst", IIC_LdStSTWCX, []>,
	isDOT, Requires<[HasPartwordAtomics]>;

	def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
	"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
	}

	let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
	def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
	"stwat $rS, $rA, $FC", IIC_LdStStore>,
	Requires<[IsISA3_0]>;

	let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
	def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;

	def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
	"twi $to, $rA, $imm", IIC_IntTrapW, []>;
	def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
	"tw $to, $rA, $rB", IIC_IntTrapW, []>;
	def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
	"tdi $to, $rA, $imm", IIC_IntTrapD, []>;
	def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
	"td $to, $rA, $rB", IIC_IntTrapD, []>;

	//===----------------------------------------------------------------------===//
	// PPC32 Load Instructions.
	//

	// Unindexed (r+i) Loads.
	let PPC970_Unit = 2 in {
	def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
	"lbz $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (zextloadi8 iaddr:$src))]>;
	def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
	"lha $rD, $src", IIC_LdStLHA,
	[(set i32:$rD, (sextloadi16 iaddr:$src))]>,
	PPC970_DGroup_Cracked;
	def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
	"lhz $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (zextloadi16 iaddr:$src))]>;
	def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
	"lwz $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (load iaddr:$src))]>;

	def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
	"lfs $rD, $src", IIC_LdStLFD,
	[(set f32:$rD, (load iaddr:$src))]>;
	def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
	"lfd $rD, $src", IIC_LdStLFD,
	[(set f64:$rD, (load iaddr:$src))]>;


	// Unindexed (r+i) Loads with Update (preinc).
	let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
	def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
	"lbzu $rD, $addr", IIC_LdStLoadUpd,
	[]>, RegConstraint<"$addr.reg = $ea_result">,
	NoEncode<"$ea_result">;

	def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
	"lhau $rD, $addr", IIC_LdStLHAU,
	[]>, RegConstraint<"$addr.reg = $ea_result">,
	NoEncode<"$ea_result">;

	def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
	"lhzu $rD, $addr", IIC_LdStLoadUpd,
	[]>, RegConstraint<"$addr.reg = $ea_result">,
	NoEncode<"$ea_result">;

	def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
	"lwzu $rD, $addr", IIC_LdStLoadUpd,
	[]>, RegConstraint<"$addr.reg = $ea_result">,
	NoEncode<"$ea_result">;

	def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
	"lfsu $rD, $addr", IIC_LdStLFDU,
	[]>, RegConstraint<"$addr.reg = $ea_result">,
	NoEncode<"$ea_result">;

	def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
	"lfdu $rD, $addr", IIC_LdStLFDU,
	[]>, RegConstraint<"$addr.reg = $ea_result">,
	NoEncode<"$ea_result">;


	// Indexed (r+r) Loads with Update (preinc).
	def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
	(ins memrr:$addr),
	"lbzux $rD, $addr", IIC_LdStLoadUpdX,
	[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
	NoEncode<"$ea_result">;

	def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
	(ins memrr:$addr),
	"lhaux $rD, $addr", IIC_LdStLHAUX,
	[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
	NoEncode<"$ea_result">;

	def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
	(ins memrr:$addr),
	"lhzux $rD, $addr", IIC_LdStLoadUpdX,
	[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
	NoEncode<"$ea_result">;

	def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
	(ins memrr:$addr),
	"lwzux $rD, $addr", IIC_LdStLoadUpdX,
	[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
	NoEncode<"$ea_result">;

	def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
	(ins memrr:$addr),
	"lfsux $rD, $addr", IIC_LdStLFDUX,
	[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
	NoEncode<"$ea_result">;

	def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
	(ins memrr:$addr),
	"lfdux $rD, $addr", IIC_LdStLFDUX,
	[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
	NoEncode<"$ea_result">;
	}
	}

	// Indexed (r+r) Loads.
	//
	let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
	def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
	"lbzx $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (zextloadi8 xaddr:$src))]>;
	def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
	"lhax $rD, $src", IIC_LdStLHA,
	[(set i32:$rD, (sextloadi16 xaddr:$src))]>,
	PPC970_DGroup_Cracked;
	def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
	"lhzx $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (zextloadi16 xaddr:$src))]>;
	def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
	"lwzx $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (load xaddr:$src))]>;
	def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
	"lhbrx $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
	def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src),
	"lwbrx $rD, $src", IIC_LdStLoad,
	[(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;

	def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
	"lfsx $frD, $src", IIC_LdStLFD,
	[(set f32:$frD, (load xaddr:$src))]>;
	def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
	"lfdx $frD, $src", IIC_LdStLFD,
	[(set f64:$frD, (load xaddr:$src))]>;

	def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
	"lfiwax $frD, $src", IIC_LdStLFD,
	[(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
	def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
	"lfiwzx $frD, $src", IIC_LdStLFD,
	[(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
	}

	// Load Multiple
	def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
	"lmw $rD, $src", IIC_LdStLMW, []>;

	//===----------------------------------------------------------------------===//
	// PPC32 Store Instructions.
	//

	// Unindexed (r+i) Stores.
	let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
	def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
	"stb $rS, $src", IIC_LdStStore,
	[(truncstorei8 i32:$rS, iaddr:$src)]>;
	def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
	"sth $rS, $src", IIC_LdStStore,
	[(truncstorei16 i32:$rS, iaddr:$src)]>;
	def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
	"stw $rS, $src", IIC_LdStStore,
	[(store i32:$rS, iaddr:$src)]>;
	def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
	"stfs $rS, $dst", IIC_LdStSTFD,
	[(store f32:$rS, iaddr:$dst)]>;
	def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
	"stfd $rS, $dst", IIC_LdStSTFD,
	[(store f64:$rS, iaddr:$dst)]>;
	}

	// Unindexed (r+i) Stores with Update (preinc).
	let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
	def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
	"stbu $rS, $dst", IIC_LdStStoreUpd, []>,
	RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
	def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
	"sthu $rS, $dst", IIC_LdStStoreUpd, []>,
	RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
	def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
	"stwu $rS, $dst", IIC_LdStStoreUpd, []>,
	RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
	def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
	"stfsu $rS, $dst", IIC_LdStSTFDU, []>,
	RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
	def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
	"stfdu $rS, $dst", IIC_LdStSTFDU, []>,
	RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
	}

	// Patterns to match the pre-inc stores. We can't put the patterns on
	// the instruction definitions directly as ISel wants the address base
	// and offset to be separate operands, not a single complex operand.
	def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
	(STBU $rS, iaddroff:$ptroff, $ptrreg)>;
	def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
	(STHU $rS, iaddroff:$ptroff, $ptrreg)>;
	def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
	(STWU $rS, iaddroff:$ptroff, $ptrreg)>;
	def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
	(STFSU $rS, iaddroff:$ptroff, $ptrreg)>;
	def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
	(STFDU $rS, iaddroff:$ptroff, $ptrreg)>;

	// Indexed (r+r) Stores.
	let PPC970_Unit = 2 in {
	def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
	"stbx $rS, $dst", IIC_LdStStore,
	[(truncstorei8 i32:$rS, xaddr:$dst)]>,
	PPC970_DGroup_Cracked;
	def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
	"sthx $rS, $dst", IIC_LdStStore,
	[(truncstorei16 i32:$rS, xaddr:$dst)]>,
	PPC970_DGroup_Cracked;
	def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
	"stwx $rS, $dst", IIC_LdStStore,
	[(store i32:$rS, xaddr:$dst)]>,
	PPC970_DGroup_Cracked;

	def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
	"sthbrx $rS, $dst", IIC_LdStStore,
	[(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
	PPC970_DGroup_Cracked;
	def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
	"stwbrx $rS, $dst", IIC_LdStStore,
	[(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
	PPC970_DGroup_Cracked;

	def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
	"stfiwx $frS, $dst", IIC_LdStSTFD,
	[(PPCstfiwx f64:$frS, xoaddr:$dst)]>;

	def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
	"stfsx $frS, $dst", IIC_LdStSTFD,
	[(store f32:$frS, xaddr:$dst)]>;
	def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
	"stfdx $frS, $dst", IIC_LdStSTFD,
	[(store f64:$frS, xaddr:$dst)]>;
	}

	// Indexed (r+r) Stores with Update (preinc).
	let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
	def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
	"stbux $rS, $dst", IIC_LdStStoreUpd, []>,
	RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
	PPC970_DGroup_Cracked;
	def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
	"sthux $rS, $dst", IIC_LdStStoreUpd, []>,
	RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
	PPC970_DGroup_Cracked;
	def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
	"stwux $rS, $dst", IIC_LdStStoreUpd, []>,
	RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
	PPC970_DGroup_Cracked;
	def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
	"stfsux $rS, $dst", IIC_LdStSTFDU, []>,
	RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
	PPC970_DGroup_Cracked;
	def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
	"stfdux $rS, $dst", IIC_LdStSTFDU, []>,
	RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
	PPC970_DGroup_Cracked;
	}

	// Patterns to match the indexed (r+r) pre-inc stores. We can't put the
	// patterns on the instruction definitions directly as ISel wants the address
	// base and offset to be separate operands, not a single complex operand.
	def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
	(STBUX $rS, $ptrreg, $ptroff)>;
	def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
	(STHUX $rS, $ptrreg, $ptroff)>;
	def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
	(STWUX $rS, $ptrreg, $ptroff)>;
	def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
	(STFSUX $rS, $ptrreg, $ptroff)>;
	def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
	(STFDUX $rS, $ptrreg, $ptroff)>;

	// Store Multiple
	// D-form (register + immediate address via memri). The pattern list is
	// empty, so instruction selection never produces STMW from a DAG pattern.
	def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
	"stmw $rS, $dst", IIC_LdStLMW, []>;

	// Memory synchronization. SYNC takes its L field as an immediate operand.
	def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
	"sync $L", IIC_LdStSync, []>;

	// MSYNC reuses the SYNC opcodes (31, 598) with no operands and L fixed to 0;
	// it is codegen-only and prints as "msync".
	let isCodeGenOnly = 1 in {
	def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
	"msync", IIC_LdStSync, []> {
	let L = 0;
	}
	}

	// Intrinsic lowering: with HasSYNC, int_ppc_sync selects SYNC with L = 0 and
	// int_ppc_lwsync selects SYNC with L = 1. With HasOnlyMSYNC both intrinsics
	// select MSYNC.
	def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[HasSYNC]>;
	def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
	def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
	def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;

	//===----------------------------------------------------------------------===//
	// PPC32 Arithmetic Instructions.
	//

	// D-form integer arithmetic with a 16-bit immediate operand.
	let PPC970_Unit = 1 in { // FXU Operations.
	// ADDI and ADDIS take $rA from gprc_nor0 rather than gprc.
	// NOTE(review): presumably because r0 in the rA slot is not read as a
	// register by these opcodes -- confirm against the ISA.
	def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
	"addi $rD, $rA, $imm", IIC_IntSimple,
	[(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
	// ADDIC and its record form ADDICo are grouped under BaseName "addic" and
	// tagged RecFormRel. ADDIC defines CARRY; ADDICo defines CARRY and CR0 and
	// has no selection pattern.
	let BaseName = "addic" in {
	let Defs = [CARRY] in
	def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
	"addic $rD, $rA, $imm", IIC_IntGeneral,
	[(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
	RecFormRel, PPC970_DGroup_Cracked;
	let Defs = [CARRY, CR0] in
	def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
	"addic. $rD, $rA, $imm", IIC_IntGeneral,
	[]>, isDOT, RecFormRel;
	}
	// ADDIS takes an s17imm operand and matches an add of imm16ShiftedSExt.
	def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
	"addis $rD, $rA, $imm", IIC_IntSimple,
	[(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
	// LA is codegen-only (the let below applies to this def only). It has the
	// same primary opcode (14) as ADDI but prints as "la $rD, $sym($rA)" and
	// matches an add of the PPClo part of a target global address.
	let isCodeGenOnly = 1 in
	def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
	"la $rD, $sym($rA)", IIC_IntGeneral,
	[(set i32:$rD, (add i32:$rA,
	(PPClo tglobaladdr:$sym, 0)))]>;
	def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
	"mulli $rD, $rA, $imm", IIC_IntMulLI,
	[(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
	// SUBFIC subtracts the register from the immediate: the pattern is
	// (subc imm, $rA), i.e. the immediate is the minuend. Defines CARRY.
	let Defs = [CARRY] in
	def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
	"subfic $rD, $rA, $imm", IIC_IntGeneral,
	[(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;

	// Load-immediate forms. They reuse the ADDI/ADDIS primary opcodes (14/15)
	// via DForm_2_r0 and have no register input. Flagged rematerializable,
	// as cheap as a move, and as move-immediate instructions.
	let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
	def LI : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
	"li $rD, $imm", IIC_IntSimple,
	[(set i32:$rD, imm32SExt16:$imm)]>;
	def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
	"lis $rD, $imm", IIC_IntSimple,
	[(set i32:$rD, imm16ShiftedSExt:$imm)]>;
	}
	}

	// D-form logical operations with an unsigned 16-bit immediate, NOP
	// encodings, and immediate compares.
	let PPC970_Unit = 1 in { // FXU Operations.
	// The AND-immediate forms exist only as record forms here ("andi." /
	// "andis."): both are marked isDOT and define CR0.
	let Defs = [CR0] in {
	def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
	"andi. $dst, $src1, $src2", IIC_IntGeneral,
	[(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
	isDOT;
	def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
	"andis. $dst, $src1, $src2", IIC_IntGeneral,
	[(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
	isDOT;
	}
	// OR / XOR immediate. The plain forms match immZExt16; the "S" forms match
	// imm16ShiftedZExt.
	def ORI : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
	"ori $dst, $src1, $src2", IIC_IntSimple,
	[(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
	def ORIS : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
	"oris $dst, $src1, $src2", IIC_IntSimple,
	[(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
	def XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
	"xori $dst, $src1, $src2", IIC_IntSimple,
	[(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
	def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
	"xoris $dst, $src1, $src2", IIC_IntSimple,
	[(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;

	// NOP uses primary opcode 24, the same as ORI, with no operands.
	def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
	[]>;
	let isCodeGenOnly = 1 in {
	// The POWER6 and POWER7 have special group-terminating nops.
	// They are printed as "ori 1, 1, 0" and "ori 2, 2, 0" respectively.
	def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
	"ori 1, 1, 0", IIC_IntSimple, []>;
	def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
	"ori 2, 2, 0", IIC_IntSimple, []>;
	}

	// Compares; none of these carries a selection pattern in its definition.
	let isCompare = 1, hasSideEffects = 0 in {
	def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
	"cmpwi $crD, $rA, $imm", IIC_IntCompare>;
	def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
	"cmplwi $dst, $src1, $src2", IIC_IntCompare>;
	// CMPRB requires IsISA3_0. Unlike its neighbours it writes a crbitrc
	// output and takes g8rc register operands.
	def CMPRB : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
	(ins u1imm:$L, g8rc:$rA, g8rc:$rB),
	"cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
	Requires<[IsISA3_0]>;
	}
	}

	// Register-register logical and shift instructions (primary opcode 31).
	// Each is instantiated with defm from the XForm_6r / XForm_6rc multiclasses,
	// so one line here may expand to several records.
	// NOTE(review): presumably the base instruction plus its record form --
	// confirm against the multiclass definitions.
	// Symmetric operations are wrapped in isCommutable; ANDC and ORC complement
	// only $rB in their patterns and are therefore left outside those groups.
	let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
	let isCommutable = 1 in {
	defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"nand", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
	defm AND : XForm_6r<31, 28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"and", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (and i32:$rS, i32:$rB))]>;
	} // isCommutable
	defm ANDC : XForm_6r<31, 60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"andc", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
	let isCommutable = 1 in {
	defm OR : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"or", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (or i32:$rS, i32:$rB))]>;
	defm NOR : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"nor", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
	} // isCommutable
	defm ORC : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"orc", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
	let isCommutable = 1 in {
	defm EQV : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"eqv", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
	defm XOR : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"xor", "$rA, $rS, $rB", IIC_IntSimple,
	[(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
	} // isCommutable
	// Variable shifts match the target-specific PPCshl / PPCsrl / PPCsra nodes
	// rather than the generic shl / srl / sra nodes. SRAW uses the XForm_6rc
	// multiclass and the IIC_IntShift itinerary class.
	defm SLW : XForm_6r<31, 24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"slw", "$rA, $rS, $rB", IIC_IntGeneral,
	[(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
	defm SRW : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"srw", "$rA, $rS, $rB", IIC_IntGeneral,
	[(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
	defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"sraw", "$rA, $rS, $rB", IIC_IntShift,
	[(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
	}

	// Single-source integer operations, CMPB, and register-register compares.
	let PPC970_Unit = 1 in { // FXU Operations.
	let hasSideEffects = 0 in {
	// Arithmetic shift right by a 5-bit immediate; matches the generic sra node.
	defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
	"srawi", "$rA, $rS, $SH", IIC_IntShift,
	[(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
	// Count leading / trailing zeros. CNTTZW additionally requires IsISA3_0.
	defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS),
	"cntlzw", "$rA, $rS", IIC_IntGeneral,
	[(set i32:$rA, (ctlz i32:$rS))]>;
	defm CNTTZW : XForm_11r<31, 538, (outs gprc:$rA), (ins gprc:$rS),
	"cnttzw", "$rA, $rS", IIC_IntGeneral,
	[(set i32:$rA, (cttz i32:$rS))]>, Requires<[IsISA3_0]>;
	// Sign-extend from i8 / i16 within a 32-bit register (sext_inreg).
	defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
	"extsb", "$rA, $rS", IIC_IntSimple,
	[(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
	defm EXTSH : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
	"extsh", "$rA, $rS", IIC_IntSimple,
	[(set i32:$rA, (sext_inreg i32:$rS, i16))]>;

	// CMPB matches the PPCcmpb node; the isCommutable let applies to this def
	// only.
	let isCommutable = 1 in
	def CMPB : XForm_6<31, 508, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
	"cmpb $rA, $rS, $rB", IIC_IntGeneral,
	[(set i32:$rA, (PPCcmpb i32:$rS, i32:$rB))]>;
	}
	// Register-register compares writing a crrc output; no selection pattern is
	// attached in these definitions.
	let isCompare = 1, hasSideEffects = 0 in {
	def CMPW : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
	"cmpw $crD, $rA, $rB", IIC_IntCompare>;
	def CMPLW : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
	"cmplw $crD, $rA, $rB", IIC_IntCompare>;
	}
	}
	// Floating-point compares, conversions, rounding and square root.
	// Many instructions appear twice with identical opcodes and mnemonics: an
	// "S" record on f4rc (f32) and a "D" record on f8rc (f64). In each rounding
	// pair the D record is marked Interpretation64Bit and isCodeGenOnly by a
	// brace-less let, which applies only to the single defm that follows it.
	let PPC970_Unit = 3 in { // FPU Operations.
	//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
	// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
	let isCompare = 1, hasSideEffects = 0 in {
	def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
	"fcmpu $crD, $fA, $fB", IIC_FPCompare>;
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
	"fcmpu $crD, $fA, $fB", IIC_FPCompare>;
	}

	// Test-for-divide / test-for-square-root; f8rc inputs, crrc output, and no
	// selection pattern.
	def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
	"ftdiv $crD, $fA, $fB", IIC_FPCompare>;
	def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB),
	"ftsqrt $crD, $fB", IIC_FPCompare>;

	// Everything below reads the rounding-mode register RM.
	let Uses = [RM] in {
	let hasSideEffects = 0 in {
	// Convert to integer word. Only FCTIWZ has a selection pattern (PPCfctiwz);
	// FCTIW and FCTIWU have empty pattern lists.
	defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
	"fctiw", "$frD, $frB", IIC_FPGeneral,
	[]>;
	defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB),
	"fctiwu", "$frD, $frB", IIC_FPGeneral,
	[]>;
	defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
	"fctiwz", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (PPCfctiwz f64:$frB))]>;

	// Round f64 to f32 (fpround).
	defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
	"frsp", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fpround f64:$frB))]>;

	// frin: selected for fround.
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
	"frin", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (fround f64:$frB))]>;
	defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
	"frin", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fround f32:$frB))]>;
	}

	let hasSideEffects = 0 in {
	// frip: selected for fceil.
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
	"frip", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (fceil f64:$frB))]>;
	defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
	"frip", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fceil f32:$frB))]>;
	// friz: selected for ftrunc.
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
	"friz", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (ftrunc f64:$frB))]>;
	defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
	"friz", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (ftrunc f32:$frB))]>;
	// frim: selected for ffloor.
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
	"frim", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (ffloor f64:$frB))]>;
	defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
	"frim", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (ffloor f32:$frB))]>;

	// Square root. The f64 form uses primary opcode 63 and the f32 form 59;
	// they have distinct mnemonics and itinerary classes.
	defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
	"fsqrt", "$frD, $frB", IIC_FPSqrtD,
	[(set f64:$frD, (fsqrt f64:$frB))]>;
	defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
	"fsqrts", "$frD, $frB", IIC_FPSqrtS,
	[(set f32:$frD, (fsqrt f32:$frB))]>;
	}
	}
	}

	/// Note that FMR is marked as a pseudo-op for the PPC970 (PPC970_Unit_Pseudo)
	/// because such moves are often coalesced away and we don't want the
	/// dispatch group builder to think that they will fill slots (which could
	/// cause the load of a LSU reject to sneak into a d-group with a store).
	/// The pattern list is empty; the trailing comment records the pattern the
	/// move would correspond to.
	let hasSideEffects = 0 in
	defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
	"fmr", "$frD, $frB", IIC_FPGeneral,
	[]>, // (set f32:$frD, f32:$frB)
	PPC970_Unit_Pseudo;

	// Floating-point sign manipulation and reciprocal estimates.
	let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
	// These are artificially split into two different forms, for 4/8 byte FP.
	// The S record (f4rc, f32) and the D record (f8rc, f64) share opcode and
	// mnemonic; each D record is marked Interpretation64Bit and isCodeGenOnly
	// by the brace-less let directly above it.
	defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
	"fabs", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fabs f32:$frB))]>;
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FABSD : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
	"fabs", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (fabs f64:$frB))]>;
	// fnabs matches the composite (fneg (fabs x)).
	defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
	"fnabs", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fneg (fabs f32:$frB)))]>;
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
	"fnabs", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (fneg (fabs f64:$frB)))]>;
	defm FNEGS : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
	"fneg", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fneg f32:$frB))]>;
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FNEGD : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
	"fneg", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (fneg f64:$frB))]>;

	// Copy sign. Note the operand order in the pattern: the DAG node is
	// (fcopysign $frB, $frA), i.e. swapped relative to the assembly operand
	// order "$frD, $frA, $frB".
	defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
	"fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
	[(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
	"fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
	[(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;

	// Reciprocal estimates.
	// f64 forms use primary opcode 63 and f32 forms 59. Both precisions match
	// the same target nodes (PPCfre / PPCfrsqrte), distinguished by value type.
	defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
	"fre", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (PPCfre f64:$frB))]>;
	defm FRES : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
	"fres", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (PPCfre f32:$frB))]>;
	defm FRSQRTE : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
	"frsqrte", "$frD, $frB", IIC_FPGeneral,
	[(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
	defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
	"frsqrtes", "$frD, $frB", IIC_FPGeneral,
	[(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
	}

	// XL-Form instructions. condition register logical ops.
	//
	// MCRF takes a whole condition-register field (crrc) as input and writes
	// another crrc field. It has no selection pattern and is tagged
	// PPC970_DGroup_First and PPC970_Unit_CRU.
	let hasSideEffects = 0 in
	def MCRF : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
	"mcrf $BF, $BFA", IIC_BrMCR>,
	PPC970_DGroup_First, PPC970_Unit_CRU;

	// FIXME: According to the ISA (section 2.5.1 of version 2.06), the
	// condition-register logical instructions have preferred forms. Specifically,
	// it is preferred that the bit specified by the BT field be in the same
	// condition register as that specified by the bit BB. We might want to account
	// for this via hinting the register allocator and anti-dep breakers, or we
	// could constrain the register class to force this constraint and then loosen
	// it during register allocation via convertToThreeAddress or some similar
	// mechanism.

	// Commutable condition-register bit operations. Each works on single CR
	// bits (crbitrc) and is selected for the corresponding i1 logical
	// expression: and, nand, or, xor, nor, eqv (not-xor).
	let isCommutable = 1 in {
	def CRAND : XLForm_1<19, 257, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"crand $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>;

	def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"crnand $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>;

	def CROR : XLForm_1<19, 449, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"cror $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>;

	def CRXOR : XLForm_1<19, 193, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"crxor $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>;

	def CRNOR : XLForm_1<19, 33, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"crnor $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>;

	def CREQV : XLForm_1<19, 289, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"creqv $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>;
	} // isCommutable

	// Non-commutable CR bit operations: only $CRB is complemented in the
	// pattern, so the two inputs cannot be swapped.
	def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"crandc $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>;

	def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD),
	(ins crbitrc:$CRA, crbitrc:$CRB),
	"crorc $CRD, $CRA, $CRB", IIC_BrCR,
	[(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;

	// Codegen-only helpers that materialize a constant CR bit by applying a CR
	// operation to a bit and itself.
	let isCodeGenOnly = 1 in {
	// CRSET: "creqv $dst, $dst, $dst", selected for the i1 constant 1.
	def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
	"creqv $dst, $dst, $dst", IIC_BrCR,
	[(set i1:$dst, 1)]>;

	// CRUNSET: "crxor $dst, $dst, $dst", selected for the i1 constant 0.
	def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
	"crxor $dst, $dst, $dst", IIC_BrCR,
	[(set i1:$dst, 0)]>;

	// Fixed-operand variants with CRD = 6 that define CR1EQ. They have no
	// explicit operands and are selected for the PPCcr6set / PPCcr6unset nodes.
	let Defs = [CR1EQ], CRD = 6 in {
	def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
	"creqv 6, 6, 6", IIC_BrCR,
	[(PPCcr6set)]>;

	def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
	"crxor 6, 6, 6", IIC_BrCR,
	[(PPCcr6unset)]>;
	}
	}

	// XFX-Form instructions. Instructions that deal with SPRs.
	//
	// Generic forms: the register number is an i32imm operand ($SPR). The
	// "move from" forms produce a gprc result; the "move to" forms take the
	// gprc source as an input and have no outputs. None has a selection pattern.

	def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
	"mfspr $RT, $SPR", IIC_SprMFSPR>;
	def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
	"mtspr $SPR, $RT", IIC_SprMTSPR>;

	// Move from time base.
	def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
	"mftb $RT, $SPR", IIC_SprMFTB>;

	// Move from / to PMR (mnemonics "mfpmr" / "mtpmr").
	def MFPMR : XFXForm_1<31, 334, (outs gprc:$RT), (ins i32imm:$SPR),
	"mfpmr $RT, $SPR", IIC_SprMFPMR>;

	def MTPMR : XFXForm_1<31, 462, (outs), (ins i32imm:$SPR, gprc:$RT),
	"mtpmr $SPR, $RT", IIC_SprMTPMR>;


	// A pseudo-instruction used to implement the read of the 64-bit cycle counter
	// on a 32-bit target.
	// It yields two gprc results ($lo, $hi), is flagged as having side effects,
	// and is expanded by a custom inserter (usesCustomInserter = 1).
	let hasSideEffects = 1, usesCustomInserter = 1 in
	def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins),
	"#ReadTB", []>;

	// Moves from/to the count register. These reuse the MFSPR/MTSPR opcodes
	// (31, 339 / 31, 467) with a fixed third template argument of 9.
	// NOTE(review): presumably the SPR number of CTR -- compare MTVRSAVE, which
	// passes 256 and prints "mtspr 256".
	let Uses = [CTR] in {
	def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
	"mfctr $rT", IIC_SprMFSPR>,
	PPC970_DGroup_First, PPC970_Unit_FXU;
	}
	// MTCTR defines CTR and is selected for the PPCmtctr node.
	let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
	def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
	"mtctr $rS", IIC_SprMTSPR>,
	PPC970_DGroup_First, PPC970_Unit_FXU;
	}
	// MTCTRloop has the same encoding and assembly string as MTCTR but is
	// codegen-only, flagged as having side effects, and selected for the
	// int_ppc_mtctr intrinsic instead.
	let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
	let Pattern = [(int_ppc_mtctr i32:$rS)] in
	def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
	"mtctr $rS", IIC_SprMTSPR>,
	PPC970_DGroup_First, PPC970_Unit_FXU;
	}

	// Moves to/from the link register. Same opcodes as MTSPR/MFSPR with a fixed
	// third template argument of 8. MTLR defines LR; MFLR uses it. Neither has
	// a selection pattern.
	let Defs = [LR] in {
	def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
	"mtlr $rS", IIC_SprMTSPR>,
	PPC970_DGroup_First, PPC970_Unit_FXU;
	}
	let Uses = [LR] in {
	def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
	"mflr $rT", IIC_SprMFSPR>,
	PPC970_DGroup_First, PPC970_Unit_FXU;
	}

	// Codegen-only moves to/from VRSAVE, printed as mtspr/mfspr on register 256.
	let isCodeGenOnly = 1 in {
	// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
	// like a GPR on the PPC970. As such, copies in and out have the same
	// performance characteristics as an OR instruction.
	def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
	"mtspr 256, $rS", IIC_IntGeneral>,
	PPC970_DGroup_Single, PPC970_Unit_FXU;
	def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
	"mfspr $rT, 256", IIC_IntGeneral>,
	PPC970_DGroup_First, PPC970_Unit_FXU;

	// The "v" variants have the same encoding and assembly strings but model
	// VRSAVE as an explicit VRSAVERC operand: an output of MTVRSAVEv and an
	// input of MFVRSAVEv.
	def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
	(outs VRSAVERC:$reg), (ins gprc:$rS),
	"mtspr 256, $rS", IIC_IntGeneral>,
	PPC970_DGroup_Single, PPC970_Unit_FXU;
	def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT),
	(ins VRSAVERC:$reg),
	"mfspr $rT, 256", IIC_IntGeneral>,
	PPC970_DGroup_First, PPC970_Unit_FXU;
	}

	// Aliases for mtvrsave/mfvrsave to mfspr/mtspr.
	// Both aliases name their single GPR operand $rS, including the mfvrsave
	// alias whose underlying instruction calls that operand $rT.
	def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>;
	def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;

	// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
	// so we'll need to scavenge a register for it.
	// Pseudo with no outputs; takes the VRSAVERC value and a memri slot ($F)
	// and is marked mayStore.
	let mayStore = 1 in
	def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
	"#SPILL_VRSAVE", []>;

	// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
	// spilled), so we'll need to scavenge a register for it.
	// Pseudo producing a VRSAVERC value from a memri slot ($F); marked mayLoad.
	let mayLoad = 1 in
	def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
	"#RESTORE_VRSAVE", []>;

	// Moves between GPRs and the condition register. None of these has a
	// selection pattern in its definition.
	let hasSideEffects = 0 in {
	// mtocrf's input needs to be prepared by shifting by an amount dependent
	// on the cr register selected. Thus, post-ra anti-dep breaking must not
	// later change that register assignment.
	let hasExtraDefRegAllocReq = 1 in {
	// The CR field is expressed as a crbitm output operand ($FXM).
	def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
	"mtocrf $FXM, $ST", IIC_BrMCRX>,
	PPC970_DGroup_First, PPC970_Unit_CRU;

	// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that
	// is dependent on the cr fields being set.
	// Here the mask is a plain i32imm input and there are no modelled outputs.
	def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
	"mtcrf $FXM, $rS", IIC_BrMCRX>,
	PPC970_MicroCode, PPC970_Unit_CRU;
	} // hasExtraDefRegAllocReq = 1

	// mfocrf's input needs to be prepared by shifting by an amount dependent
	// on the cr register selected. Thus, post-ra anti-dep breaking must not
	// later change that register assignment.
	let hasExtraSrcRegAllocReq = 1 in {
	def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
	"mfocrf $rT, $FXM", IIC_SprMFCRF>,
	PPC970_DGroup_First, PPC970_Unit_CRU;

	// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that
	// is dependent on the cr fields being copied.
	// MFCR has no explicit inputs.
	def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
	"mfcr $rT", IIC_SprMFCR>,
	PPC970_MicroCode, PPC970_Unit_CRU;
	} // hasExtraSrcRegAllocReq = 1

	// MCRXRX requires IsISA3_0; it writes a crrc field and has no explicit
	// inputs.
	def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
	"mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
	} // hasSideEffects = 0

	// Pseudo instruction to perform FADD in round-to-zero mode.
	// Selected for the PPCfaddrtz node on f64 operands. It has an empty
	// assembly string, reads RM, and is expanded by a custom inserter.
	let usesCustomInserter = 1, Uses = [RM] in {
	def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
	[(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
	}

	// The above pseudo gets expanded to make use of the following instructions
	// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level.
	// All three both use and define RM, carry no selection pattern, and share
	// the IIC_IntMTFSB0 itinerary class.
	let Uses = [RM], Defs = [RM] in {
	// mtfsb0 / mtfsb1 take a 5-bit immediate ($FM).
	def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
	"mtfsb0 $FM", IIC_IntMTFSB0, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;
	def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
	"mtfsb1 $FM", IIC_IntMTFSB0, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;
	// Codegen-only form of mtfsf taking an i32imm mask and an f8rc source (the
	// let applies to this def only).
	let isCodeGenOnly = 1 in
	def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
	"mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;
	}
	// "Move from FPSCR" family. Every record uses primary opcode 63 and extended
	// opcode 583 and produces an f8rc result; all read RM. The
	// X_FRT5_XO2_XO3_* variants pass two extra small constants as their second
	// and third template arguments.
	// NOTE(review): presumably sub-opcode fields that select the variant --
	// confirm against the format classes.
	let Uses = [RM] in {
	// Only plain MFFS has a selection pattern (PPCmffs).
	def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins),
	"mffs $rT", IIC_IntMFFS,
	[(set f64:$rT, (PPCmffs))]>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;

	// Record form: marked isDOT and additionally defines CR1.
	let Defs = [CR1] in
	def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins),
	"mffs. $rT", IIC_IntMFFS, []>, isDOT;

	def MFFSCE : X_FRT5_XO2_XO3_XO10<63, 0, 1, 583, (outs f8rc:$rT), (ins),
	"mffsce $rT", IIC_IntMFFS, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;

	// Register-operand form (f8rc:$FRB).
	def MFFSCDRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 4, 583, (outs f8rc:$rT),
	(ins f8rc:$FRB), "mffscdrn $rT, $FRB",
	IIC_IntMFFS, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;

	// Immediate form taking a 3-bit immediate ($DRM).
	def MFFSCDRNI : X_FRT5_XO2_XO3_DRM3_XO10<63, 2, 5, 583, (outs f8rc:$rT),
	(ins u3imm:$DRM),
	"mffscdrni $rT, $DRM",
	IIC_IntMFFS, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;

	// Register-operand form (f8rc:$FRB).
	def MFFSCRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 6, 583, (outs f8rc:$rT),
	(ins f8rc:$FRB), "mffscrn $rT, $FRB",
	IIC_IntMFFS, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;

	// Immediate form taking a 2-bit immediate ($RM).
	def MFFSCRNI : X_FRT5_XO2_XO3_RM2_X10<63, 2, 7, 583, (outs f8rc:$rT),
	(ins u2imm:$RM), "mffscrni $rT, $RM",
	IIC_IntMFFS, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;

	def MFFSL : X_FRT5_XO2_XO3_XO10<63, 3, 0, 583, (outs f8rc:$rT), (ins),
	"mffsl $rT", IIC_IntMFFS, []>,
	PPC970_DGroup_Single, PPC970_Unit_FPU;
	}

	// 32-bit modulo instructions, available only under the IsISA3_0 predicate.
	// MODSW is selected for signed remainder (srem) and MODUW for unsigned
	// remainder (urem); both use the IIC_IntDivW itinerary class.
	let Predicates = [IsISA3_0] in {
	def MODSW : XForm_8<31, 779, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"modsw $rT, $rA, $rB", IIC_IntDivW,
	[(set i32:$rT, (srem i32:$rA, i32:$rB))]>;
	def MODUW : XForm_8<31, 267, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"moduw $rT, $rA, $rB", IIC_IntDivW,
	[(set i32:$rT, (urem i32:$rA, i32:$rB))]>;
	}

	// XO-form integer arithmetic (primary opcode 31). A brace-less
	// "let isCommutable = 1 in" applies only to the single def/defm after it.
	let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
	// XO-Form instructions. Arithmetic instructions that can set overflow bit
	let isCommutable = 1 in
	defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"add", "$rT, $rA, $rB", IIC_IntSimple,
	[(set i32:$rT, (add i32:$rA, i32:$rB))]>;
	// Codegen-only add whose second operand is a tlsreg32 matched against
	// tglobaltlsaddr; same opcodes as ADD4.
	let isCodeGenOnly = 1 in
	def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB),
	"add $rT, $rA, $rB", IIC_IntSimple,
	[(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>;
	let isCommutable = 1 in
	defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"addc", "$rT, $rA, $rB", IIC_IntGeneral,
	[(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
	PPC970_DGroup_Cracked;

	// Signed / unsigned word division.
	defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"divw", "$rT, $rA, $rB", IIC_IntDivW,
	[(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>;
	defm DIVWU : XOForm_1rcr<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"divwu", "$rT, $rA, $rB", IIC_IntDivW,
	[(set i32:$rT, (udiv i32:$rA, i32:$rB))]>;
	// Extended divides, gated on HasExtDiv and selected only through the
	// int_ppc_divwe / int_ppc_divweu intrinsics. Their record forms are written
	// out by hand: they define CR0, are marked isDOT, and have no pattern.
	def DIVWE : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"divwe $rT, $rA, $rB", IIC_IntDivW,
	[(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>,
	Requires<[HasExtDiv]>;
	let Defs = [CR0] in
	def DIVWEo : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"divwe. $rT, $rA, $rB", IIC_IntDivW,
	[]>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
	Requires<[HasExtDiv]>;
	def DIVWEU : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"divweu $rT, $rA, $rB", IIC_IntDivW,
	[(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>,
	Requires<[HasExtDiv]>;
	let Defs = [CR0] in
	def DIVWEUo : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"divweu. $rT, $rA, $rB", IIC_IntDivW,
	[]>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
	Requires<[HasExtDiv]>;
	// Multiplies: high word signed (mulhs), high word unsigned (mulhu), and low
	// word (mul).
	let isCommutable = 1 in {
	defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"mulhw", "$rT, $rA, $rB", IIC_IntMulHW,
	[(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
	defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
	[(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
	defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"mullw", "$rT, $rA, $rB", IIC_IntMulHW,
	[(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
	} // isCommutable
	// "Subtract from": the patterns compute $rB - $rA, so the DAG operands are
	// reversed relative to the assembly operand order.
	defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"subf", "$rT, $rA, $rB", IIC_IntGeneral,
	[(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
	defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"subfc", "$rT, $rA, $rB", IIC_IntGeneral,
	[(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
	PPC970_DGroup_Cracked;
	defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA),
	"neg", "$rT, $rA", IIC_IntSimple,
	[(set i32:$rT, (ineg i32:$rA))]>;
	// Carry-consuming forms (Uses = [CARRY]). The "ME" and "ZE" variants match
	// adde/sube against the constants -1 and 0 respectively.
	let Uses = [CARRY] in {
	let isCommutable = 1 in
	defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"adde", "$rT, $rA, $rB", IIC_IntGeneral,
	[(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
	defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA),
	"addme", "$rT, $rA", IIC_IntGeneral,
	[(set i32:$rT, (adde i32:$rA, -1))]>;
	defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
	"addze", "$rT, $rA", IIC_IntGeneral,
	[(set i32:$rT, (adde i32:$rA, 0))]>;
	defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
	"subfe", "$rT, $rA, $rB", IIC_IntGeneral,
	[(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
	defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
	"subfme", "$rT, $rA", IIC_IntGeneral,
	[(set i32:$rT, (sube -1, i32:$rA))]>;
	defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
	"subfze", "$rT, $rA", IIC_IntGeneral,
	[(set i32:$rT, (sube 0, i32:$rA))]>;
	}
	}

	// A-Form instructions. Most of the instructions executed in the FPU are of
	// this type.
	//
	// A-form floating-point arithmetic. f64 records use primary opcode 63 with
	// f8rc operands; f32 records use primary opcode 59 with f4rc operands.
	let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations.
	let Uses = [RM] in {
	// Fused multiply-add family, all expressed through the fma node. The
	// assembly operand order is $FRA, $FRC, $FRB and the pattern is
	// fma($FRA, $FRC, $FRB), so $FRB is the addend:
	//   fmadd  =   fma(a, c,  b)     fmsub  =   fma(a, c, -b)
	//   fnmadd = -(fma(a, c,  b))    fnmsub = -(fma(a, c, -b))
	let isCommutable = 1 in {
	defm FMADD : AForm_1r<63, 29,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
	"fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
	[(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
	defm FMADDS : AForm_1r<59, 29,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
	"fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
	[(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
	defm FMSUB : AForm_1r<63, 28,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
	"fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
	[(set f64:$FRT,
	(fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
	defm FMSUBS : AForm_1r<59, 28,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
	"fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
	[(set f32:$FRT,
	(fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
	defm FNMADD : AForm_1r<63, 31,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
	"fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
	[(set f64:$FRT,
	(fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
	defm FNMADDS : AForm_1r<59, 31,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
	"fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
	[(set f32:$FRT,
	(fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
	defm FNMSUB : AForm_1r<63, 30,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
	"fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
	[(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
	(fneg f64:$FRB))))]>;
	defm FNMSUBS : AForm_1r<59, 30,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
	"fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
	[(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
	(fneg f32:$FRB))))]>;
	} // isCommutable
	}
	// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid
	// having 4 of these, force the comparison to always be an 8-byte double (code
	// should use an FMRSD if the input comparison value really wants to be a float)
	// and 4/8 byte forms for the result and operand type.
	// Both records sit outside the Uses = [RM] groups. Only FSELD is marked
	// Interpretation64Bit / isCodeGenOnly (the let applies to it alone); FSELS
	// takes an f8rc comparison input ($FRA) with f4rc data operands.
	let Interpretation64Bit = 1, isCodeGenOnly = 1 in
	defm FSELD : AForm_1r<63, 23,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
	"fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
	[(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
	defm FSELS : AForm_1r<63, 23,
	(outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
	"fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
	[(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
	// Two-operand arithmetic; all of it reads RM. fadd and fmul are commutable,
	// fdiv and fsub are not. Note that the multiplies name their second source
	// $FRC (AForm_3r) whereas add/sub/div name it $FRB (AForm_2r).
	let Uses = [RM] in {
	let isCommutable = 1 in {
	defm FADD : AForm_2r<63, 21,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
	"fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
	[(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
	defm FADDS : AForm_2r<59, 21,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
	"fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
	[(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
	} // isCommutable
	defm FDIV : AForm_2r<63, 18,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
	"fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
	[(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
	defm FDIVS : AForm_2r<59, 18,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
	"fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
	[(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
	let isCommutable = 1 in {
	defm FMUL : AForm_3r<63, 25,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
	"fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
	[(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
	defm FMULS : AForm_3r<59, 25,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
	"fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
	[(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
	} // isCommutable
	defm FSUB : AForm_2r<63, 20,
	(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
	"fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
	[(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
	defm FSUBS : AForm_2r<59, 20,
	(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
	"fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
	[(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
	}
	}

	// Integer select and the M-form rotate-and-mask instructions. None of these
	// definitions carries a selection pattern; rotates and shifts are selected
	// through separate Pat records.
	let hasSideEffects = 0 in {
	let PPC970_Unit = 1 in { // FXU Operations.
	// ISEL picks between $rA and $rB based on a CR bit ($cond). $rA is
	// restricted to gprc_nor0.
	let isSelect = 1 in
	def ISEL : AForm_4<31, 15,
	(outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
	"isel $rT, $rA, $rB, $cond", IIC_IntISEL,
	[]>;
	}

	let PPC970_Unit = 1 in { // FXU Operations.
	// M-Form instructions. rotate and mask instructions.
	//
	let isCommutable = 1 in {
	// RLWIMI can be commuted if the rotate amount is zero.
	// The extra input $rSi is tied to the output $rA (RegConstraint) and is
	// excluded from the encoding (NoEncode); it does not appear in the assembly
	// string.
	defm RLWIMI : MForm_2r<20, (outs gprc:$rA),
	(ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB,
	u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
	IIC_IntRotate, []>, PPC970_DGroup_Cracked,
	RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
	}
	// RLWINM and its record form are written out by hand (not via defm) and
	// linked through BaseName "rlwinm" and RecFormRel. The record form defines
	// CR0 and is additionally tagged PPC970_DGroup_Cracked.
	let BaseName = "rlwinm" in {
	def RLWINM : MForm_2<21,
	(outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
	"rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
	[]>, RecFormRel;
	let Defs = [CR0] in
	def RLWINMo : MForm_2<21,
	(outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
	"rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
	[]>, isDOT, RecFormRel, PPC970_DGroup_Cracked;
	}
	// RLWNM takes the rotate amount from a register ($rB) instead of an
	// immediate.
	defm RLWNM : MForm_2r<23, (outs gprc:$rA),
	(ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME),
	"rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
	[]>;
	}
	} // hasSideEffects = 0

	//===----------------------------------------------------------------------===//
	// PowerPC Instruction Patterns
	//

	// Arbitrary immediate support: load the high half with LIS, then OR in the
	// low half with ORI.
	def : Pat<(i32 imm:$imm), (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;

	// Implement the 'not' operation with the NOR instruction (NOR of a value
	// with itself).
	def i32not : OutPatFrag<(ops node:$in), (NOR $in, $in)>;
	def : Pat<(not i32:$in), (i32not $in)>;

	// ADD an arbitrary immediate: ADDI the low half, then ADDIS the high half.
	// Note that the high half is taken with HA16 here, not HI16.
	def : Pat<(add i32:$in, imm:$imm),
	          (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>;

	// OR an arbitrary immediate: ORI the low half, ORIS the high half.
	def : Pat<(or i32:$in, imm:$imm),
	          (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;

	// XOR an arbitrary immediate: XORI the low half, XORIS the high half.
	def : Pat<(xor i32:$in, imm:$imm),
	          (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;

	// SUBFIC: (immediate - register) for immediates matching imm32SExt16.
	def : Pat<(sub imm32SExt16:$imm, i32:$in), (SUBFIC $in, imm:$imm)>;

	// SHL/SRL by an immediate, both expressed as a single RLWINM.
	def : Pat<(shl i32:$in, (i32 imm:$imm)),
	          (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>;
	def : Pat<(srl i32:$in, (i32 imm:$imm)),
	          (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>;

	// ROTL by a register amount (RLWNM) or an immediate amount (RLWINM), keeping
	// the full 0..31 mask.
	def : Pat<(rotl i32:$in, i32:$sh), (RLWNM $in, $sh, 0, 31)>;
	def : Pat<(rotl i32:$in, (i32 imm:$imm)), (RLWINM $in, imm:$imm, 0, 31)>;

	// RLWNM: a variable rotate followed by an AND with a maskimm32 immediate
	// folds into one instruction; MB/ME extract the mask bounds.
	def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
	          (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;

	// Calls to a global address or an external symbol select BL.
	def : Pat<(PPCcall (i32 tglobaladdr:$dst)), (BL tglobaladdr:$dst)>;
	def : Pat<(PPCcall (i32 texternalsym:$dst)), (BL texternalsym:$dst)>;

	// Tail calls: direct targets select TCRETURNdi, a target held in CTRRC
	// selects TCRETURNri.
	def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
	          (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
	def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
	          (TCRETURNdi texternalsym:$dst, imm:$imm)>;
	def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
	          (TCRETURNri CTRRC:$dst, imm:$imm)>;



	// Hi and Lo for Darwin Global Addresses.
	// With a zero base, PPChi selects LIS and PPClo selects LI for each kind of
	// target address node.
	def : Pat<(PPChi tglobaladdr:$in, 0),    (LIS tglobaladdr:$in)>;
	def : Pat<(PPClo tglobaladdr:$in, 0),    (LI tglobaladdr:$in)>;
	def : Pat<(PPChi tconstpool:$in, 0),     (LIS tconstpool:$in)>;
	def : Pat<(PPClo tconstpool:$in, 0),     (LI tconstpool:$in)>;
	def : Pat<(PPChi tjumptable:$in, 0),     (LIS tjumptable:$in)>;
	def : Pat<(PPClo tjumptable:$in, 0),     (LI tjumptable:$in)>;
	def : Pat<(PPChi tblockaddress:$in, 0),  (LIS tblockaddress:$in)>;
	def : Pat<(PPClo tblockaddress:$in, 0),  (LI tblockaddress:$in)>;

	// TLS addresses carry a register base: the high part selects ADDIS and the
	// low part selects ADDI.
	def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in), (ADDIS $in, tglobaltlsaddr:$g)>;
	def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in), (ADDI $in, tglobaltlsaddr:$g)>;

	// Adding the high part of an address to a register folds into one ADDIS.
	def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)),
	          (ADDIS $in, tglobaladdr:$g)>;
	def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)),
	          (ADDIS $in, tconstpool:$g)>;
	def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
	          (ADDIS $in, tjumptable:$g)>;
	def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
	          (ADDIS $in, tblockaddress:$g)>;

	// Support for thread-local storage.
	def PPC32GOT : Pseudo<(outs gprc:$rD), (ins),
	                      "#PPC32GOT",
	                      [(set i32:$rD, (PPCppc32GOT))]>;

	// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
	// This uses two output registers, the first as the real output, the second as a
	// temporary register, used internally in code generation.
	def PPC32PICGOT : Pseudo<(outs gprc:$rD, gprc:$rT), (ins),
	                         "#PPC32PICGOT",
	                         []>,
	                  NoEncode<"$rT">;

	// Pseudo that selects PPCldGotTprelL on a TLS global address.
	def LDgotTprelL32 : Pseudo<(outs gprc:$rD),
	                           (ins s16imm:$disp, gprc_nor0:$reg),
	                           "#LDgotTprelL32",
	                           [(set i32:$rD,
	                                 (PPCldGotTprelL tglobaltlsaddr:$disp,
	                                                 i32:$reg))]>;
	def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
	          (ADD4TLS $in, tglobaltlsaddr:$g)>;

	// Pseudo that selects PPCaddiTlsgdL on a TLS global address.
	def ADDItlsgdL32 : Pseudo<(outs gprc:$rD),
	                          (ins gprc_nor0:$reg, s16imm:$disp),
	                          "#ADDItlsgdL32",
	                          [(set i32:$rD,
	                                (PPCaddiTlsgdL i32:$reg,
	                                               tglobaltlsaddr:$disp))]>;

	// LR is a true define, while the rest of the Defs are clobbers. R3 is
	// explicitly defined when this op is created, so not mentioned here.
	let hasExtraSrcRegAllocReq = 1,
	    hasExtraDefRegAllocReq = 1,
	    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
	def GETtlsADDR32 : Pseudo<(outs gprc:$rD),
	                          (ins gprc:$reg, tlsgd32:$sym),
	                          "GETtlsADDR32",
	                          [(set i32:$rD,
	                                (PPCgetTlsAddr i32:$reg,
	                                               tglobaltlsaddr:$sym))]>;

	// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded. R3 and LR
	// are true defines while the rest of the Defs are clobbers.
	let hasExtraSrcRegAllocReq = 1,
	    hasExtraDefRegAllocReq = 1,
	    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
	def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
	                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
	                              "#ADDItlsgdLADDR32",
	                              [(set i32:$rD,
	                                    (PPCaddiTlsgdLAddr i32:$reg,
	                                                       tglobaltlsaddr:$disp,
	                                                       tglobaltlsaddr:$sym))]>;
	// Pseudo that selects PPCaddiTlsldL on a TLS global address.
	def ADDItlsldL32 : Pseudo<(outs gprc:$rD),
	                          (ins gprc_nor0:$reg, s16imm:$disp),
	                          "#ADDItlsldL32",
	                          [(set i32:$rD,
	                                (PPCaddiTlsldL i32:$reg,
	                                               tglobaltlsaddr:$disp))]>;

	// LR is a true define, while the rest of the Defs are clobbers. R3 is
	// explicitly defined when this op is created, so not mentioned here.
	let hasExtraSrcRegAllocReq = 1,
	    hasExtraDefRegAllocReq = 1,
	    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
	def GETtlsldADDR32 : Pseudo<(outs gprc:$rD),
	                            (ins gprc:$reg, tlsgd32:$sym),
	                            "GETtlsldADDR32",
	                            [(set i32:$rD,
	                                  (PPCgetTlsldAddr i32:$reg,
	                                                   tglobaltlsaddr:$sym))]>;

	// Combined op for ADDItlsldL32 and GETtlsldADDR32, late expanded. R3 and LR
	// are true defines while the rest of the Defs are clobbers.
	let hasExtraSrcRegAllocReq = 1,
	    hasExtraDefRegAllocReq = 1,
	    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
	def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
	                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
	                              "#ADDItlsldLADDR32",
	                              [(set i32:$rD,
	                                    (PPCaddiTlsldLAddr i32:$reg,
	                                                       tglobaltlsaddr:$disp,
	                                                       tglobaltlsaddr:$sym))]>;
	// Pseudos that select the PPCaddiDtprelL / PPCaddisDtprelHA nodes on a TLS
	// global address.
	def ADDIdtprelL32 : Pseudo<(outs gprc:$rD),
	                           (ins gprc_nor0:$reg, s16imm:$disp),
	                           "#ADDIdtprelL32",
	                           [(set i32:$rD,
	                                 (PPCaddiDtprelL i32:$reg,
	                                                 tglobaltlsaddr:$disp))]>;
	def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD),
	                             (ins gprc_nor0:$reg, s16imm:$disp),
	                             "#ADDISdtprelHA32",
	                             [(set i32:$rD,
	                                   (PPCaddisDtprelHA i32:$reg,
	                                                     tglobaltlsaddr:$disp))]>;

	// Support for Position-independent code
	// LWZtoc selects a PPCtoc_entry for a global address relative to $reg.
	def LWZtoc : Pseudo<(outs gprc:$rD),
	                    (ins tocentry32:$disp, gprc:$reg),
	                    "#LWZtoc",
	                    [(set i32:$rD,
	                          (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;

	// Get Global (GOT) Base Register offset, from the word immediately preceding
	// the function label.
	def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT),
	                       (ins gprc:$rI),
	                       "#UpdateGBR",
	                       []>;


	// Standard shifts. These are represented separately from the real shifts above
	// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
	// amounts.
	def : Pat<(sra i32:$rS, i32:$rB), (SRAW $rS, $rB)>;
	def : Pat<(srl i32:$rS, i32:$rB), (SRW $rS, $rB)>;
	def : Pat<(shl i32:$rS, i32:$rB), (SLW $rS, $rB)>;

	// Zero- and any-extending loads of i1/i8 select the byte loads LBZ/LBZX;
	// any-extending i16 loads select LHZ/LHZX. The iaddr form uses the D-form
	// instruction and the xaddr form uses the indexed (X) one.
	def : Pat<(zextloadi1 iaddr:$src), (LBZ iaddr:$src)>;
	def : Pat<(zextloadi1 xaddr:$src), (LBZX xaddr:$src)>;
	def : Pat<(extloadi1 iaddr:$src),  (LBZ iaddr:$src)>;
	def : Pat<(extloadi1 xaddr:$src),  (LBZX xaddr:$src)>;
	def : Pat<(extloadi8 iaddr:$src),  (LBZ iaddr:$src)>;
	def : Pat<(extloadi8 xaddr:$src),  (LBZX xaddr:$src)>;
	def : Pat<(extloadi16 iaddr:$src), (LHZ iaddr:$src)>;
	def : Pat<(extloadi16 xaddr:$src), (LHZX xaddr:$src)>;

	// An f32 load extended to f64 is an LFS/LFSX whose result is copied into the
	// F8RC register class.
	def : Pat<(f64 (extloadf32 iaddr:$src)),
	          (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
	def : Pat<(f64 (extloadf32 xaddr:$src)),
	          (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;

	// Extending an f32 register value to f64 is only a register-class copy.
	def : Pat<(f64 (fpextend f32:$src)), (COPY_TO_REGCLASS $src, F8RC)>;

	// Only seq_cst fences require the heavyweight sync (SYNC 0).
	// All others can use the lightweight sync (SYNC 1).
	// source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
	// The rule for seq_cst is duplicated to work with both 64 bits and 32 bits
	// versions of Power.
	// NOTE(review): the literal 7 matched by the first two patterns is presumably
	// the numeric encoding of the seq_cst ordering -- confirm against the
	// AtomicOrdering enum.
	def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
	def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
	// Any other ordering operand selects the lightweight form when HasSYNC holds.
	def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
	// Under HasOnlyMSYNC every fence selects MSYNC regardless of ordering.
	def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;

	// Additional FNMSUB patterns: -ac + b == -(ac - b)
	// The negation may sit on either multiplicand of the fma.
	def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), (FNMSUB $A, $C, $B)>;
	def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), (FNMSUB $A, $C, $B)>;
	def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B), (FNMSUBS $A, $C, $B)>;
	def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B), (FNMSUBS $A, $C, $B)>;

	// FCOPYSIGN's operand types need not agree: the sign operand ($frA) is first
	// copied into the register class of the result type.
	def : Pat<(fcopysign f64:$frB, f32:$frA),
	          (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
	def : Pat<(fcopysign f32:$frB, f64:$frA),
	          (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;

	include "PPCInstrAltivec.td"
	include "PPCInstrSPE.td"
	include "PPCInstr64Bit.td"
	include "PPCInstrVSX.td"
	include "PPCInstrQPX.td"
	include "PPCInstrHTM.td"

	// Logical NOT of a condition-register bit: CRNOR of the bit with itself.
	def crnot : OutPatFrag<(ops node:$in), (CRNOR $in, $in)>;
	def : Pat<(not i1:$in), (crnot $in)>;

	// Patterns for arithmetic i1 operations: add and sub are both XOR, mul is
	// AND.
	def : Pat<(add i1:$a, i1:$b), (CRXOR $a, $b)>;
	def : Pat<(sub i1:$a, i1:$b), (CRXOR $a, $b)>;
	def : Pat<(mul i1:$a, i1:$b), (CRAND $a, $b)>;

	// We're sometimes asked to materialize i1 -1, which is just 1 in this case
	// (-1 is used to mean all bits set).
	def : Pat<(i1 -1), (CRSET)>;

	// i1 extensions, implemented as a select between two immediates: 1/0 for
	// zext and -1/0 for sext.
	def : Pat<(i32 (zext i1:$in)), (SELECT_I4 $in, (LI 1), (LI 0))>;
	def : Pat<(i32 (sext i1:$in)), (SELECT_I4 $in, (LI -1), (LI 0))>;

	def : Pat<(i64 (zext i1:$in)), (SELECT_I8 $in, (LI8 1), (LI8 0))>;
	def : Pat<(i64 (sext i1:$in)), (SELECT_I8 $in, (LI8 -1), (LI8 0))>;

	// FIXME: We should choose either a zext or a sext based on other constants
	// already around.
	def : Pat<(i32 (anyext i1:$in)), (SELECT_I4 $in, (LI 1), (LI 0))>;
	def : Pat<(i64 (anyext i1:$in)), (SELECT_I8 $in, (LI8 1), (LI8 0))>;

	// match setcc on i1 variables.
	//
	// Each table row below reads "$s1 $s2 : result". The signed predicates treat
	// a set bit as -1, the unsigned predicates treat it as 1, so a signed
	// predicate and its unsigned counterpart use the same CR instruction with
	// the operands swapped.
	//
	// CRANDC is:
	// 1 1 : F
	// 1 0 : T
	// 0 1 : F
	// 0 0 : F
	//
	// LT is:
	// -1 -1 : F
	// -1 0 : T
	// 0 -1 : F
	// 0 0 : F
	//
	// ULT is:
	// 1 1 : F
	// 1 0 : F
	// 0 1 : T
	// 0 0 : F
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)),  (CRANDC $s1, $s2)>;
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)), (CRANDC $s2, $s1)>;

	// CRORC is:
	// 1 1 : T
	// 1 0 : T
	// 0 1 : F
	// 0 0 : T
	//
	// LE is:
	// -1 -1 : T
	// -1 0 : T
	// 0 -1 : F
	// 0 0 : T
	//
	// ULE is:
	// 1 1 : T
	// 1 0 : F
	// 0 1 : T
	// 0 0 : T
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)),  (CRORC $s1, $s2)>;
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)), (CRORC $s2, $s1)>;

	// Equality of two bits is CREQV.
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)), (CREQV $s1, $s2)>;

	// GE is:
	// -1 -1 : T
	// -1 0 : F
	// 0 -1 : T
	// 0 0 : T
	//
	// UGE is:
	// 1 1 : T
	// 1 0 : T
	// 0 1 : F
	// 0 0 : T
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)),  (CRORC $s2, $s1)>;
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)), (CRORC $s1, $s2)>;

	// GT is:
	// -1 -1 : F
	// -1 0 : F
	// 0 -1 : T
	// 0 0 : F
	//
	// UGT is:
	// 1 1 : F
	// 1 0 : T
	// 0 1 : F
	// 0 0 : F
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)),  (CRANDC $s2, $s1)>;
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)), (CRANDC $s1, $s2)>;

	// Inequality of two bits is CRXOR.
	def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)), (CRXOR $s1, $s2)>;

	// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE,
	// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for
	// floating-point types.

	// CRNotPat: emits the patterns for a condition `pattern` that is computed as
	// the logical negation of `result`.
	//   pattern - the i1-producing DAG to match.
	//   result  - output DAG producing the *inverse* of `pattern`.
	// Matching `pattern` directly wraps `result` in crnot; matching
	// (not pattern) uses `result` as is. When `pattern` is extended to i32/i64
	// the negation is folded away by swapping the two select arms.
	multiclass CRNotPat<dag pattern, dag result> {
	def : Pat<pattern, (crnot result)>;
	def : Pat<(not pattern), result>;

	// We can also fold the crnot into an extension:
	// (32-bit results; note the select arms are swapped relative to `result`.)
	def : Pat<(i32 (zext pattern)),
	(SELECT_I4 result, (LI 0), (LI 1))>;
	def : Pat<(i32 (sext pattern)),
	(SELECT_I4 result, (LI 0), (LI -1))>;

	// We can also fold the crnot into an extension:
	// (64-bit results, same arm swap.)
	def : Pat<(i64 (zext pattern)),
	(SELECT_I8 result, (LI8 0), (LI8 1))>;
	def : Pat<(i64 (sext pattern)),
	(SELECT_I8 result, (LI8 0), (LI8 -1))>;

	// FIXME: We should choose either a zext or a sext based on other constants
	// already around.
	def : Pat<(i32 (anyext pattern)),
	(SELECT_I4 result, (LI 0), (LI 1))>;

	def : Pat<(i64 (anyext pattern)),
	(SELECT_I8 result, (LI8 0), (LI8 1))>;
	}

	// FIXME: Because of what seems like a bug in TableGen's type-inference code,
	// we need to write imm:$imm in the output patterns below, not just $imm, or
	// else the resulting matcher will not correctly add the immediate operand
	// (making it a register operand instead).

	// extended SETCC.
	//   cc     - condition code to match.
	//   pfrag  - input fragment taking (value, cc) and producing the i1 compare.
	//   rfrag  - output fragment computing the extended result from an i32 value.
	//   rfrag8 - output fragment computing the extended result from an i64 value.
	// Patterns are emitted for zext and anyext to both i32 and i64. When the
	// compared width differs from the result width, the result is moved with
	// INSERT_SUBREG / EXTRACT_SUBREG on sub_32.
	multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag,
	                       OutPatFrag rfrag, OutPatFrag rfrag8> {
	  def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))),
	            (rfrag $s1)>;
	  def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))),
	            (rfrag8 $s1)>;
	  def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))),
	            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
	  def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))),
	            (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;

	  def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))),
	            (rfrag $s1)>;
	  def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))),
	            (rfrag8 $s1)>;
	  def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))),
	            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
	  def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))),
	            (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
	}

	// Note that we do all inversions below with i(32|64)not, instead of using
	// (xori x, 1) because on the A2 nor has single-cycle latency while xori
	// has 2-cycle latency.

	// Extended comparisons against 0. Each instantiation passes the input
	// fragment, then the 32-bit and the 64-bit output fragments.

	// x == 0: count leading zeros, then rotate-and-mask the count down to one bit.
	defm : ExtSetCCPat<SETEQ,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, 0, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (CNTLZW $in), 27, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (CNTLZD $in), 58, 63)> >;

	// x != 0: as above, with the leading-zero count inverted first.
	defm : ExtSetCCPat<SETNE,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, 0, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (i64not (CNTLZD $in)), 58, 63)> >;

	// x < 0: rotate left by one and keep only the lowest bit.
	defm : ExtSetCCPat<SETLT,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, 0, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM $in, 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL $in, 1, 63)> >;

	// x >= 0: the same extraction applied to the inverted input.
	defm : ExtSetCCPat<SETGE,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, 0, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (i32not $in), 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (i64not $in), 1, 63)> >;

	// x > 0: the same extraction applied to ANDC(NEG x, x).
	defm : ExtSetCCPat<SETGT,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, 0, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >;

	// x <= 0: the same extraction applied to ORC(x, NEG x).
	defm : ExtSetCCPat<SETLE,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, 0, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >;

	// Extended comparisons against -1. As with the compare-with-0 forms, each
	// instantiation passes the input fragment, then the 32-bit and the 64-bit
	// output fragments.

	// x < -1: rotate-and-mask extraction applied to AND(x, x + 1).
	defm : ExtSetCCPat<SETLT,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, -1, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >;

	// x >= -1: the same extraction applied to NAND(x, x + 1).
	defm : ExtSetCCPat<SETGE,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, -1, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >;

	// x > -1: the same extraction applied to the inverted input.
	defm : ExtSetCCPat<SETGT,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, -1, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM (i32not $in), 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL (i64not $in), 1, 63)> >;

	// x <= -1: the same extraction applied to the input itself.
	defm : ExtSetCCPat<SETLE,
	    PatFrag<(ops node:$in, node:$cc), (setcc $in, -1, $cc)>,
	    OutPatFrag<(ops node:$in), (RLWINM $in, 1, 31, 31)>,
	    OutPatFrag<(ops node:$in), (RLDICL $in, 1, 63)> >;

	// An extended SETCC with shift amount.
	//   cc     - condition code to match.
	//   pfrag  - input fragment taking (value, shift amount, cc).
	//   rfrag  - output fragment for an i32 value, taking (value, shift amount).
	//   rfrag8 - output fragment for an i64 value, taking (value, shift amount).
	// The shift amount is always i32. Patterns are emitted for zext and anyext to
	// both i32 and i64, using INSERT_SUBREG / EXTRACT_SUBREG on sub_32 when the
	// compared width differs from the result width.
	multiclass ExtSetCCShiftPat<CondCode cc, PatFrag pfrag,
	                            OutPatFrag rfrag, OutPatFrag rfrag8> {
	  def : Pat<(i32 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
	            (rfrag $s1, $sa)>;
	  def : Pat<(i64 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
	            (rfrag8 $s1, $sa)>;
	  def : Pat<(i64 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
	            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
	  def : Pat<(i32 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
	            (EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;

	  def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
	            (rfrag $s1, $sa)>;
	  def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
	            (rfrag8 $s1, $sa)>;
	  def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
	            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
	  def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
	            (EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
	}

	// Single-bit test, (x & (1 << sa)) != 0: rotate x left by (32 - sa), or
	// (64 - sa) for i64, and keep only the lowest bit.
	defm : ExtSetCCShiftPat<SETNE,
	    PatFrag<(ops node:$in, node:$sa, node:$cc),
	            (setcc (and $in, (shl 1, $sa)), 0, $cc)>,
	    OutPatFrag<(ops node:$in, node:$sa),
	               (RLWNM $in, (SUBFIC $sa, 32), 31, 31)>,
	    OutPatFrag<(ops node:$in, node:$sa),
	               (RLDCL $in, (SUBFIC $sa, 64), 63)> >;

	// (x & (1 << sa)) == 0: the same rotate applied to the inverted input.
	defm : ExtSetCCShiftPat<SETEQ,
	    PatFrag<(ops node:$in, node:$sa, node:$cc),
	            (setcc (and $in, (shl 1, $sa)), 0, $cc)>,
	    OutPatFrag<(ops node:$in, node:$sa),
	               (RLWNM (i32not $in), (SUBFIC $sa, 32), 31, 31)>,
	    OutPatFrag<(ops node:$in, node:$sa),
	               (RLDCL (i64not $in), (SUBFIC $sa, 64), 63)> >;

	// SETCC for i32.
	// Compare against a 16-bit immediate: the unsigned predicates use CMPLWI
	// with immZExt16, the signed ones CMPWI with imm32SExt16. The i1 result is
	// the lt/gt/eq bit extracted from the compare result.
	def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
	          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
	def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
	          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
	def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
	          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
	def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
	          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
	def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
	          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
	def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
	          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;

	// For a comparison against an arbitrary (non-16-bit) immediate, the default
	// code would materialize the constant, then compare against it, like this:
	// lis r2, 4660
	// ori r2, r2, 22136
	// cmpw cr0, r3, r2
	// beq cr0,L6
	// Since we are just comparing for equality, we can emit this instead:
	// xoris r0,r3,0x1234
	// cmplwi cr0,r0,0x5678
	// beq cr0,L6

	def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
	          (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
	                                  (LO16 imm:$imm)),
	                          sub_eq)>;

	// The remaining predicates are the negation of one of the CR bits above, so
	// they go through CRNotPat.
	defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
	                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
	                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
	                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
	                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
	                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
	                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;

	// Inequality against an arbitrary immediate reuses the xoris/cmplwi form.
	defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
	                (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
	                                        (LO16 imm:$imm)),
	                                sub_eq)>;

	// i32 register-register compares: CMPLW for the unsigned predicates, CMPW
	// for the signed ones and for equality.
	def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
	          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
	          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
	          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
	          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
	          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;

	// Negated predicates: the listed CR bit is the inverse of the condition.
	defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
	                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
	                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
	                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
	                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
	                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;

	// SETCC for i64.
	// Compare against a 16-bit immediate: the unsigned predicates use CMPLDI
	// with immZExt16, the signed ones CMPDI with imm64SExt16. The i1 result is
	// the lt/gt/eq bit extracted from the compare result.
	def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
	          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
	def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
	          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
	def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
	          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
	def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
	          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
	def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
	          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
	def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
	          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;

	// For a comparison against a wider (imm64ZExt32) immediate, the default code
	// would materialize the constant, then compare against it, like this:
	// lis r2, 4660
	// ori r2, r2, 22136
	// cmpd cr0, r3, r2
	// beq cr0,L6
	// Since we are just comparing for equality, we can emit this instead:
	// xoris r0,r3,0x1234
	// cmpldi cr0,r0,0x5678
	// beq cr0,L6

	def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
	          (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
	                                  (LO16 imm:$imm)),
	                          sub_eq)>;

	// The remaining predicates are the negation of one of the CR bits above, so
	// they go through CRNotPat.
	defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
	                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
	                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
	                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
	                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
	                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
	                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;

	// Inequality against an imm64ZExt32 immediate reuses the xoris/cmpldi form.
	defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
	                (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
	                                        (LO16 imm:$imm)),
	                                sub_eq)>;

	// i64 register-register compares: CMPLD for the unsigned predicates, CMPD
	// for the signed ones and for equality.
	def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
	          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
	          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
	          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
	          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
	          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;

	// Negated predicates: the listed CR bit is the inverse of the condition.
	defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
	                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
	                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
	                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
	                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
	                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;

	// SETCC for f32.
	// Every predicate goes through FCMPUS. The ordered and "don't care" forms of
	// LT/GT/EQ read the lt/gt/eq bit directly, and SETUO reads the un bit.
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
	def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
	          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;

	// Predicates expressed as the negation of a single CR bit.
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
	defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
	                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;

	// SETCC for f64.
	// Every predicate goes through FCMPUD. The ordered and "don't care" forms of
	// LT/GT/EQ read the lt/gt/eq bit directly, and SETUO reads the un bit.
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
	def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
	          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;

	// Predicates expressed as the negation of a single CR bit.
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
	defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
	                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;

	// match select on i1 variables:
	// (cond & tval) | (!cond & fval)
	def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
	          (CROR (CRAND $cond , $tval),
	                (CRAND (crnot $cond), $fval))>;

	// match selectcc on i1 variables:
	// select (lhs == rhs), tval, fval is:
	// ((lhs == rhs) & tval) | (!(lhs == rhs) & fval)
	// For each predicate, the first CRAND gates $tval with the comparison and
	// the second gates $fval with its complement; SETNE instead reuses the SETEQ
	// terms with $tval and $fval exchanged.
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)),
	          (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
	                (CRAND (CRORC $rhs, $lhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULT)),
	          (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
	                (CRAND (CRORC $lhs, $rhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)),
	          (CROR (CRAND (CRORC $lhs, $rhs), $tval),
	                (CRAND (CRANDC $rhs, $lhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULE)),
	          (CROR (CRAND (CRORC $rhs, $lhs), $tval),
	                (CRAND (CRANDC $lhs, $rhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)),
	          (CROR (CRAND (CREQV $lhs, $rhs), $tval),
	                (CRAND (CRXOR $lhs, $rhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)),
	          (CROR (CRAND (CRORC $rhs, $lhs), $tval),
	                (CRAND (CRANDC $lhs, $rhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGE)),
	          (CROR (CRAND (CRORC $lhs, $rhs), $tval),
	                (CRAND (CRANDC $rhs, $lhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)),
	          (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
	                (CRAND (CRORC $lhs, $rhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGT)),
	          (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
	                (CRAND (CRORC $rhs, $lhs), $fval))>;
	def : Pat<(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)),
	          (CROR (CRAND (CREQV $lhs, $rhs), $fval),
	                (CRAND (CRXOR $lhs, $rhs), $tval))>;

	// match selectcc on i1 variables with non-i1 output.
	//
	// The comparison of the two i1 operands is computed with one CR-logical
	// instruction and fed to the select pseudo for the result type.
	//   vt  - value type of $tval, $fval and of the result.
	//   sel - select pseudo-instruction used for that type.
	// Signed predicates and their unsigned counterparts use the same CR
	// instruction with $lhs and $rhs exchanged.
	multiclass SelectCCOnI1Pat<ValueType vt, Instruction sel> {
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETLT)),
	            (sel (CRANDC $lhs, $rhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETULT)),
	            (sel (CRANDC $rhs, $lhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETLE)),
	            (sel (CRORC $lhs, $rhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETULE)),
	            (sel (CRORC $rhs, $lhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETEQ)),
	            (sel (CREQV $lhs, $rhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETGE)),
	            (sel (CRORC $rhs, $lhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETUGE)),
	            (sel (CRORC $lhs, $rhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETGT)),
	            (sel (CRANDC $rhs, $lhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETUGT)),
	            (sel (CRANDC $lhs, $rhs), $tval, $fval)>;
	  def : Pat<(vt (selectcc i1:$lhs, i1:$rhs, vt:$tval, vt:$fval, SETNE)),
	            (sel (CRXOR $lhs, $rhs), $tval, $fval)>;
	}

	// Instantiated in the same order as the hand-expanded tables these replace,
	// so the patterns are emitted in the same sequence.
	defm : SelectCCOnI1Pat<i32, SELECT_I4>;
	defm : SelectCCOnI1Pat<i64, SELECT_I8>;
	defm : SelectCCOnI1Pat<f32, SELECT_F4>;
	defm : SelectCCOnI1Pat<f64, SELECT_F8>;

	// v4i32 selectcc on two i1 operands. Same condition-code to CR-instruction
	// mapping as the scalar patterns, but the select is SELECT_VRRC.
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
	(SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULT)),
	(SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)),
	(SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULE)),
	(SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)),
	(SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)),
	(SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGE)),
	(SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)),
	(SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)),
	(SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
	def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
	(SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;

	// Pseudos that produce a CR bit (crbitrc) from the truncation of a GPR to
	// i1. The *_EQ_BIT forms match trunc(not x); the *_GT_BIT forms match
	// trunc(x). The "8" variants take a 64-bit register class (g8rc) instead of
	// gprc. usesCustomInserter = 1 marks them for expansion by the target's
	// custom-inserter hook; their asm strings are '#'-prefixed placeholders.
	let usesCustomInserter = 1 in {
	def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
	"#ANDIo_1_EQ_BIT",
	[(set i1:$dst, (trunc (not i32:$in)))]>;
	def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
	"#ANDIo_1_GT_BIT",
	[(set i1:$dst, (trunc i32:$in))]>;

	def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
	"#ANDIo_1_EQ_BIT8",
	[(set i1:$dst, (trunc (not i64:$in)))]>;
	def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
	"#ANDIo_1_GT_BIT8",
	[(set i1:$dst, (trunc i64:$in))]>;
	}

	// not(trunc x) is selected to the same *_EQ_BIT pseudos whose own patterns
	// match trunc(not x), so both operand orders end up on one instruction.
	def : Pat<(i1 (not (trunc i32:$in))),
	(ANDIo_1_EQ_BIT $in)>;
	def : Pat<(i1 (not (trunc i64:$in))),
	(ANDIo_1_EQ_BIT8 $in)>;

	//===----------------------------------------------------------------------===//
	// PowerPC Instructions used for assembler/disassembler only
	//

	// FIXME: For B=0 or B > 8, the registers following RT are used.
	// WARNING: Do not add patterns for this instruction without fixing this.
	def LSWI : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
	"lswi $RT, $A, $B", IIC_LdStLoad, []>;

	// FIXME: For B=0 or B > 8, the registers following RT are used.
	// WARNING: Do not add patterns for this instruction without fixing this.
	// NOTE(review): this store is tagged with the IIC_LdStLoad itinerary, same
	// as LSWI above - confirm that is intentional.
	def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
	"stswi $RT, $A, $B", IIC_LdStLoad, []>;

	// Operand-less isync; no selection pattern.
	def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
	"isync", IIC_SprISYNC, []>;

	// icbi takes a register+register memory operand (memrr).
	def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
	"icbi $src", IIC_LdStICBI, []>;

	// We used to have EIEIO as value but E[0-9A-Z] is a reserved name
	def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
	"eieio", IIC_LdStLoad, []>;

	// wait with an explicit $L operand; operand-less spellings are provided as
	// InstAliases elsewhere in this file.
	def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
	"wait $L", IIC_LdStLoad, []>;

	// MBAR uses the same opcode pair (31, 854) as EnforceIEIO above but carries
	// an $MO operand and is only available under IsBookE.
	def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
	"mbar $MO", IIC_LdStLoad>, Requires<[IsBookE]>;

	// Segment-register moves. MTSR/MFSR name the segment register with a 4-bit
	// immediate operand ($SR); MTSRIN/MFSRIN take a second GPR ($RB) instead.
	// Note that mtsr prints $SR before $RS while mfsr prints $RS first.
	def MTSR: XForm_sr<31, 210, (outs), (ins gprc:$RS, u4imm:$SR),
	"mtsr $SR, $RS", IIC_SprMTSR>;

	def MFSR: XForm_sr<31, 595, (outs gprc:$RS), (ins u4imm:$SR),
	"mfsr $RS, $SR", IIC_SprMFSR>;

	def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB),
	"mtsrin $RS, $RB", IIC_SprMTSR>;

	def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB),
	"mfsrin $RS, $RB", IIC_SprMFSR>;

	// mtmsr with an explicit $L operand.
	def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
	"mtmsr $RS, $L", IIC_SprMTMSR>;

	// wrtee reuses the XForm_mtmsr format with the L field forced to 0;
	// BookE only.
	def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS),
	"wrtee $RS", IIC_SprMTMSR>, Requires<[IsBookE]> {
	let L = 0;
	}

	// wrteei is encoded by hand on top of the bare I<31> format: the 1-bit
	// immediate E is placed in Inst{16} and the value 163 in Inst{21-30}.
	// BookE only.
	def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
	Requires<[IsBookE]> {
	bits<1> E;

	let Inst{16} = E;
	let Inst{21-30} = 163;
	}

	// PPC4xx-only dccci/iccci with two GPR operands, followed by the
	// operand-less spellings ("dci 0", "dccci", "ici 0", "iccci") that
	// hard-code both register operands to R0.
	def DCCCI : XForm_tlb<454, (outs), (ins gprc:$A, gprc:$B),
	"dccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
	def ICCCI : XForm_tlb<966, (outs), (ins gprc:$A, gprc:$B),
	"iccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;

	def : InstAlias<"dci 0", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"dccci", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"ici 0", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;

	// mfmsr reads into $RT; mtmsrd takes an explicit $L operand like MTMSR.
	def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
	"mfmsr $RT", IIC_SprMFMSR, []>;

	def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
	"mtmsrd $RS, $L", IIC_SprMTMSRD>;

	// mcrfs: CR field destination ($BF) and CR field source operand ($BFA).
	def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
	"mcrfs $BF, $BFA", IIC_BrMCR>;

	// mtfsfi and its record ("." suffixed, isDOT) form share opcodes 63/134.
	def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
	"mtfsfi $BF, $U, $W", IIC_IntMFFS>;

	def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
	"mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT;

	// Two-operand spellings default the trailing $W operand to 0.
	def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
	def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;

	// mtfsf and its record form share opcodes 63/711 and have no outputs.
	def MTFSF : XFLForm_1<63, 711, (outs),
	(ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
	"mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
	def MTFSFo : XFLForm_1<63, 711, (outs),
	(ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
	"mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT;

	// Two-operand spellings default both $L and $W to 0.
	def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
	def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;

	// SLB management instructions (slbie, slbmte, slbmfee, slbmfev, slbia).
	// None of them carries a selection pattern.
	def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
	"slbie $RB", IIC_SprSLBIE, []>;

	def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
	"slbmte $RS, $RB", IIC_SprSLBMTE, []>;

	def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
	"slbmfee $RT, $RB", IIC_SprSLBMFEE, []>;

	def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB),
	"slbmfev $RT, $RB", IIC_SprSLBMFEV, []>;

	def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;

	// TLB management instructions. Several mnemonics have more than one
	// definition, gated on different subtarget predicates:
	//  - tlbre/tlbwe: operand-less IsBookE forms (TLBRE/TLBWE) and
	//    three-operand IsPPC4xx forms (TLBRE2/TLBWE2) with the same opcodes.
	//  - tlbsx: two-operand IsBookE form (TLBSX) and three-operand IsPPC4xx
	//    forms (TLBSX2, plus TLBSX2D as the "." variant).
	def TLBIA : XForm_0<31, 370, (outs), (ins),
	"tlbia", IIC_SprTLBIA, []>;

	def TLBSYNC : XForm_0<31, 566, (outs), (ins),
	"tlbsync", IIC_SprTLBSYNC, []>;

	def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
	"tlbiel $RB", IIC_SprTLBIEL, []>;

	// 6xx-only tlbld/tlbli.
	def TLBLD : XForm_16b<31, 978, (outs), (ins gprc:$RB),
	"tlbld $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
	def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB),
	"tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;

	// The operand list is ($RS, $RB) but the assembly string prints $RB first.
	def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
	"tlbie $RB,$RS", IIC_SprTLBIE, []>;

	def TLBSX : XForm_tlb<914, (outs), (ins gprc:$A, gprc:$B), "tlbsx $A, $B",
	IIC_LdStLoad>, Requires<[IsBookE]>;

	def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$A, gprc:$B), "tlbivax $A, $B",
	IIC_LdStLoad>, Requires<[IsBookE]>;

	def TLBRE : XForm_24_eieio<31, 946, (outs), (ins),
	"tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>;

	def TLBWE : XForm_24_eieio<31, 978, (outs), (ins),
	"tlbwe", IIC_LdStLoad, []>, Requires<[IsBookE]>;

	// PPC4xx forms: $WS is a 1-bit immediate operand.
	def TLBRE2 : XForm_tlbws<31, 946, (outs gprc:$RS), (ins gprc:$A, i1imm:$WS),
	"tlbre $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;

	def TLBWE2 : XForm_tlbws<31, 978, (outs), (ins gprc:$RS, gprc:$A, i1imm:$WS),
	"tlbwe $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;

	def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
	"tlbsx $RST, $A, $B", IIC_LdStLoad, []>,
	Requires<[IsPPC4xx]>;
	def TLBSX2D : XForm_base_r3xo<31, 914, (outs),
	(ins gprc:$RST, gprc:$A, gprc:$B),
	"tlbsx. $RST, $A, $B", IIC_LdStLoad, []>,
	Requires<[IsPPC4xx]>, isDOT;

	// Operand-less rf* instructions (primary opcode 19). rfid has no subtarget
	// predicate; rfi/rfci require IsBookE and rfdi/rfmci require IsE500.
	def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>;

	def RFI : XForm_0<19, 50, (outs), (ins), "rfi", IIC_SprRFI, []>,
	Requires<[IsBookE]>;
	def RFCI : XForm_0<19, 51, (outs), (ins), "rfci", IIC_BrB, []>,
	Requires<[IsBookE]>;

	def RFDI : XForm_0<19, 39, (outs), (ins), "rfdi", IIC_BrB, []>,
	Requires<[IsE500]>;
	def RFMCI : XForm_0<19, 38, (outs), (ins), "rfmci", IIC_BrB, []>,
	Requires<[IsE500]>;

	// PPC4xx-only mfdcr/mtdcr. The register-number operand is declared as $SPR;
	// mtdcr prints it before $RT, mfdcr after.
	def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR),
	"mfdcr $RT, $SPR", IIC_SprMFSPR>, Requires<[IsPPC4xx]>;
	def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR),
	"mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>;

	// Operand-less hrfid and nap, both on primary opcode 19.
	def HRFID : XLForm_1_np<19, 274, (outs), (ins), "hrfid", IIC_BrB, []>;
	def NAP : XLForm_1_np<19, 434, (outs), (ins), "nap", IIC_BrB, []>;

	// attn uses primary opcode 0.
	def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;

	// The *CIX loads and stores. All use XForm_base_r3xo with primary opcode 31
	// and three GPR operands, and carry no selection patterns. The loads define
	// $RST; the stores take it as an input. Every entry, stores included, is
	// tagged with the IIC_LdStLoad itinerary.
	def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
	"lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
	def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
	"lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
	def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
	"lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
	def LDCIX : XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
	"ldcix $RST, $A, $B", IIC_LdStLoad, []>;

	def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
	"stbcix $RST, $A, $B", IIC_LdStLoad, []>;
	def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
	"sthcix $RST, $A, $B", IIC_LdStLoad, []>;
	def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
	"stwcix $RST, $A, $B", IIC_LdStLoad, []>;
	def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
	"stdcix $RST, $A, $B", IIC_LdStLoad, []>;

	// External PID Load Store Instructions
	//
	// Every definition in this group requires IsE500 and has an empty pattern
	// list, so these are reachable from assembly only. Memory operands are
	// register+register (memrr). The group also contains the *EP cache
	// management instructions (dcb*ep, icbiep).

	def LBEPX : XForm_1<31, 95, (outs gprc:$rD), (ins memrr:$src),
	"lbepx $rD, $src", IIC_LdStLoad, []>,
	Requires<[IsE500]>;

	def LFDEPX : XForm_25<31, 607, (outs f8rc:$frD), (ins memrr:$src),
	"lfdepx $frD, $src", IIC_LdStLFD, []>,
	Requires<[IsE500]>;

	def LHEPX : XForm_1<31, 287, (outs gprc:$rD), (ins memrr:$src),
	"lhepx $rD, $src", IIC_LdStLoad, []>,
	Requires<[IsE500]>;

	def LWEPX : XForm_1<31, 31, (outs gprc:$rD), (ins memrr:$src),
	"lwepx $rD, $src", IIC_LdStLoad, []>,
	Requires<[IsE500]>;

	def STBEPX : XForm_8<31, 223, (outs), (ins gprc:$rS, memrr:$dst),
	"stbepx $rS, $dst", IIC_LdStStore, []>,
	Requires<[IsE500]>;

	def STFDEPX : XForm_28<31, 735, (outs), (ins f8rc:$frS, memrr:$dst),
	"stfdepx $frS, $dst", IIC_LdStSTFD, []>,
	Requires<[IsE500]>;

	def STHEPX : XForm_8<31, 415, (outs), (ins gprc:$rS, memrr:$dst),
	"sthepx $rS, $dst", IIC_LdStStore, []>,
	Requires<[IsE500]>;

	def STWEPX : XForm_8<31, 159, (outs), (ins gprc:$rS, memrr:$dst),
	"stwepx $rS, $dst", IIC_LdStStore, []>,
	Requires<[IsE500]>;

	// Cache management: all use the IIC_LdStDCBF itinerary except icbiep.
	def DCBFEP : DCB_Form<127, 0, (outs), (ins memrr:$dst), "dcbfep $dst",
	IIC_LdStDCBF, []>, Requires<[IsE500]>;

	def DCBSTEP : DCB_Form<63, 0, (outs), (ins memrr:$dst), "dcbstep $dst",
	IIC_LdStDCBF, []>, Requires<[IsE500]>;

	// The hint forms print $TH before the memory operand although $dst comes
	// first in the operand list.
	def DCBTEP : DCB_Form_hint<319, (outs), (ins memrr:$dst, u5imm:$TH),
	"dcbtep $TH, $dst", IIC_LdStDCBF, []>,
	Requires<[IsE500]>;

	def DCBTSTEP : DCB_Form_hint<255, (outs), (ins memrr:$dst, u5imm:$TH),
	"dcbtstep $TH, $dst", IIC_LdStDCBF, []>,
	Requires<[IsE500]>;

	// dcbzep and dcbzlep share the first DCB_Form argument (1023) and differ
	// only in the second (0 vs 1).
	def DCBZEP : DCB_Form<1023, 0, (outs), (ins memrr:$dst), "dcbzep $dst",
	IIC_LdStDCBF, []>, Requires<[IsE500]>;

	def DCBZLEP : DCB_Form<1023, 1, (outs), (ins memrr:$dst), "dcbzlep $dst",
	IIC_LdStDCBF, []>, Requires<[IsE500]>;

	def ICBIEP : XForm_1a<31, 991, (outs), (ins memrr:$src), "icbiep $src",
	IIC_LdStICBI, []>, Requires<[IsE500]>;

	//===----------------------------------------------------------------------===//
	// PowerPC Assembler Instruction Aliases
	//

	// Assembler-only pseudo-instruction used for alternate assembly syntax; it
	// is never produced by codegen. An InstAlias can be expanded by tblgen on
	// its own, whereas a PPCAsmPseudo needs C++ handling to be converted to the
	// target instruction.
	//   asm  - assembly string to match.
	//   iops - input operand list (the output list is always empty).
	class PPCAsmPseudo<string asm, dag iops>
	: Instruction {
	bit PPC64 = 0; // Default value, override with isPPC64
	let Namespace = "PPC";

	// Syntax and operands: inputs only, no selection pattern.
	let AsmString = asm;
	let InOperandList = iops;
	let OutOperandList = (outs);
	let Pattern = [];

	// Parse-only pseudo with no scheduling information.
	let isPseudo = 1;
	let isAsmParserOnly = 1;
	let hasNoSchedulingInfo = 1;
	}

	// Operand-less spellings that fix the immediate operand of SC, SYNC, WAIT
	// and MBAR.
	def : InstAlias<"sc", (SC 0)>;

	// The sync family requires HasSYNC. "msync" maps to the same SYNC 0 as
	// "sync" and passes a trailing 0 as InstAlias's third argument (presumably
	// the Emit flag, making it parse-only so "sync" is what gets printed -
	// confirm against Target.td).
	def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>;
	def : InstAlias<"msync", (SYNC 0), 0>, Requires<[HasSYNC]>;
	def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>;
	def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>;

	// wait / waitrsv / waitimpl select WAIT operand values 0 / 1 / 2.
	def : InstAlias<"wait", (WAIT 0)>;
	def : InstAlias<"waitrsv", (WAIT 1)>;
	def : InstAlias<"waitimpl", (WAIT 2)>;

	def : InstAlias<"mbar", (MBAR 0)>, Requires<[IsBookE]>;

	// Parse-only pseudos (PPCAsmPseudo) for the dcbt, dcbtst and dcbf mnemonic
	// families. Forms without a $TH operand take only the memory operand; the
	// *CT and *DS forms take $dst followed by a u5imm $TH.
	def DCBTx : PPCAsmPseudo<"dcbt $dst", (ins memrr:$dst)>;
	def DCBTSTx : PPCAsmPseudo<"dcbtst $dst", (ins memrr:$dst)>;

	def DCBTCT : PPCAsmPseudo<"dcbtct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
	def DCBTDS : PPCAsmPseudo<"dcbtds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
	def DCBTT : PPCAsmPseudo<"dcbtt $dst", (ins memrr:$dst)>;

	def DCBTSTCT : PPCAsmPseudo<"dcbtstct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
	def DCBTSTDS : PPCAsmPseudo<"dcbtstds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
	def DCBTSTT : PPCAsmPseudo<"dcbtstt $dst", (ins memrr:$dst)>;

	def DCBFx : PPCAsmPseudo<"dcbf $dst", (ins memrr:$dst)>;
	def DCBFL : PPCAsmPseudo<"dcbfl $dst", (ins memrr:$dst)>;
	def DCBFLP : PPCAsmPseudo<"dcbflp $dst", (ins memrr:$dst)>;

	// Simplified CR-bit mnemonics built from the CR logical instructions by
	// repeating an operand: crset/crclr pass $bx as all three operands of
	// CREQV/CRXOR; crmove/crnot pass $by as both sources of CROR/CRNOR.
	def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
	def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
	def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
	def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;

	// mtXXX/mfXXX spellings for individual special-purpose registers. Each
	// alias expands to MTSPR/MFSPR with the SPR number fixed (MFTB for the
	// mftb* reads). Aliases with a Requires<> clause are only accepted on that
	// subtarget.
	def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
	def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;

	def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
	def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;

	def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>;
	def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>;

	def : InstAlias<"mtdsisr $Rx", (MTSPR 18, gprc:$Rx)>;
	def : InstAlias<"mfdsisr $Rx", (MFSPR gprc:$Rx, 18)>;

	def : InstAlias<"mtdar $Rx", (MTSPR 19, gprc:$Rx)>;
	def : InstAlias<"mfdar $Rx", (MFSPR gprc:$Rx, 19)>;

	def : InstAlias<"mtdec $Rx", (MTSPR 22, gprc:$Rx)>;
	def : InstAlias<"mfdec $Rx", (MFSPR gprc:$Rx, 22)>;

	def : InstAlias<"mtsdr1 $Rx", (MTSPR 25, gprc:$Rx)>;
	def : InstAlias<"mfsdr1 $Rx", (MFSPR gprc:$Rx, 25)>;

	def : InstAlias<"mtsrr0 $Rx", (MTSPR 26, gprc:$Rx)>;
	def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>;

	def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>;
	def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>;

	def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>;

	def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>;

	def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>;
	def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>;

	def : InstAlias<"mtamr $Rx", (MTSPR 29, gprc:$Rx)>;
	def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;

	def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
	def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;

	// Time-base reads go through MFTB; "mftb" and "mftbl" are the same alias
	// target (268), "mftbu" is 269. Time-base writes use MTSPR 284/285.
	def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
	def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
	def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;

	def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
	def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;

	// PPC4xx spellings use SPRs 988 (hi) and 989 (lo) for both directions.
	def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>;

	// xnop is XORI with R0 as both registers and a zero immediate.
	def : InstAlias<"xnop", (XORI R0, R0, 0)>;

	// mr/not (and their record forms) are OR8/NOR8 with the source register
	// repeated as both inputs; operands use the 64-bit register class g8rc.
	def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
	def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;

	def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
	def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;

	// mtcr is MTCRF8 with its first operand fixed to 255.
	def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;

	// 6xx-only BAT register aliases. For index n in 0..3 the SPR number is
	// base + 2*n, spelled !add(!add(n, n), base). The bases are 536/537 for
	// dbatu/dbatl and 528/529 for ibatu/ibatl. The mt* forms print the index
	// before $Rx, the mf* forms print it after.
	foreach BatIdx = 0-3 in {
	def : InstAlias<"mtdbatu "#BatIdx#", $Rx",
	(MTSPR !add(!add(BatIdx, BatIdx), 536), gprc:$Rx)>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mfdbatu $Rx, "#BatIdx,
	(MFSPR gprc:$Rx, !add(!add(BatIdx, BatIdx), 536))>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mtdbatl "#BatIdx#", $Rx",
	(MTSPR !add(!add(BatIdx, BatIdx), 537), gprc:$Rx)>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mfdbatl $Rx, "#BatIdx,
	(MFSPR gprc:$Rx, !add(!add(BatIdx, BatIdx), 537))>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mtibatu "#BatIdx#", $Rx",
	(MTSPR !add(!add(BatIdx, BatIdx), 528), gprc:$Rx)>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mfibatu $Rx, "#BatIdx,
	(MFSPR gprc:$Rx, !add(!add(BatIdx, BatIdx), 528))>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mtibatl "#BatIdx#", $Rx",
	(MTSPR !add(!add(BatIdx, BatIdx), 529), gprc:$Rx)>,
	Requires<[IsPPC6xx]>;
	def : InstAlias<"mfibatl $Rx, "#BatIdx,
	(MFSPR gprc:$Rx, !add(!add(BatIdx, BatIdx), 529))>,
	Requires<[IsPPC6xx]>;
	}

	// PPC4xx-only mfbrN/mtbrN for N in 0..7: each expands to MFDCR/MTDCR with
	// the register-number operand fixed to 0x80 + N.
	foreach BrIdx = 0-7 in {
	def : InstAlias<"mfbr"#BrIdx#" $Rx",
	(MFDCR gprc:$Rx, !add(0x80, BrIdx))>,
	Requires<[IsPPC4xx]>;
	def : InstAlias<"mtbr"#BrIdx#" $Rx",
	(MTDCR gprc:$Rx, !add(0x80, BrIdx))>,
	Requires<[IsPPC4xx]>;
	}

	// More PPC4xx-only SPR aliases (SPR numbers 980, 981, 986, 1018, 1019),
	// plus mfspefscr/mtspefscr (SPR 512), which carry no subtarget predicate.
	def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>;

	def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;

	def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>;

	def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>, Requires<[IsPPC4xx]>;

	def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
	def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;

	def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>, Requires<[IsPPC4xx]>;
	def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>, Requires<[IsPPC4xx]>;

	// "la" as a parse-only pseudo taking a register and a memri address.
	def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;

	// Subtract-immediate mnemonics as parse-only pseudos. All four share the
	// operand list (gprc, gprc, s16imm); SUBICo is the "." form of SUBIC.
	def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
	(ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
	def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
	(ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
	def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
	(ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
	def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm",
	(ins gprc:$rA, gprc:$rB, s16imm:$imm)>;

	// sub/subc (and record forms) map onto SUBF8/SUBFC8 with the two source
	// operands swapped: "sub rA, rB, rC" becomes SUBF8 rA, rC, rB.
	def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
	def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
	def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
	def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;

	// One-operand mtmsrd/mtmsr default the $L operand to 0.
	def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
	def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;

	// mfasr/mtasr use SPR 280.
	def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
	def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;

	// SPRG aliases. Each register is reachable through two spellings per
	// direction: "mfsprg RT, n" / "mfsprgN RT" and "mtsprg n, RT" / "mtsprgN RT".
	// SPRG0-3 map to SPR 272 + n and carry no subtarget predicate.
	foreach SPRG = 0-3 in {
	def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>;
	def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>;
	def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
	def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
	}
	// SPRG4-7 map to SPR 256 + n (i.e. 260-263) and require IsBookE.
	foreach SPRG = 4-7 in {
	def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
	Requires<[IsBookE]>;
	def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>,
	Requires<[IsBookE]>;
	def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
	Requires<[IsBookE]>;
	def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
	Requires<[IsBookE]>;
	}

	// NOTE(review): apart from mfpvr (SPR 287), every alias in this group
	// repeats a mnemonic/SPR pair that is already declared earlier in this
	// file (mtasr 280, mfdec/mtdec 22, mfsdr1/mtsdr1 25, mfsrr0/mtsrr0 26,
	// mfsrr1/mtsrr1 27), differing only in the operand name. Presumably these
	// are harmless duplicates - confirm before removing any of them.
	def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;

	def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
	def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;

	def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;

	def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
	def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;

	def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
	def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
	def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
	def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;

	// One-operand tlbie passes R0 as the $RS operand of TLBIE.
	def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;

	// PPC4xx hi/lo spellings fix the WS operand of TLBRE2/TLBWE2:
	// 0 for the "hi" mnemonics, 1 for the "lo" mnemonics.
	def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>,
	Requires<[IsPPC4xx]>;
	def : InstAlias<"tlbrelo $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 1)>,
	Requires<[IsPPC4xx]>;
	def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>,
	Requires<[IsPPC4xx]>;
	def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>,
	Requires<[IsPPC4xx]>;

	// 32-bit extended rotate/shift/clear mnemonics, declared as parse-only
	// pseudos (PPCAsmPseudo). Each mnemonic has a plain form and a record
	// ("." suffixed, name ending in 'o') form with the same operand list.
	// Registers are gprc and immediates are u5imm. Note that the clrlslwi forms
	// take $b before $n, unlike the ext*/ins* forms.
	def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b",
	(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
	def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$n)>;
	def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
	def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
	(ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;

	// Mnemonics that map onto RLWINM/RLWNM by fixing operands, so they can be
	// plain InstAliases: rotlwi/rotlw fix the last two operands to 0 and 31;
	// clrlwi fixes the third operand to 0 and the last to 31, passing $n as
	// the fourth.
	def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
	def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
	def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
	def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
	def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
	def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;

	// cntlzw spelled with the 32-bit register class on both operands.
	def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>;
	def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>;
	// The POWER variant
	def : MnemonicAlias<"cntlz", "cntlzw">;
	def : MnemonicAlias<"cntlz.", "cntlzw.">;

	// 64-bit extended rotate/shift/clear mnemonics, declared as parse-only
	// pseudos (PPCAsmPseudo). Registers are g8rc and immediates are u6imm.
	// Each mnemonic has a plain form and a record ("." suffixed) form with the
	// same operand list. The clrlsldi forms take $b before $n.
	def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
	def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
	def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
	def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
	def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
	def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
	def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
	def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
	def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
	(ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
	// subpcis: parse-only pseudo with a g8rc destination and an s16imm operand.
	def SUBPCIS : PPCAsmPseudo<"subpcis $RT, $D", (ins g8rc:$RT, s16imm:$D)>;

	// rotldi/rotld fix the last operand of RLDICL/RLDCL to 0; clrldi fixes the
	// third operand of RLDICL to 0 and passes $n as the last one.
	def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
	def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
	def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
	def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
	def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
	// Second clrldi alias for RLDICL_32_64, whose source operand is a gprc.
	def : InstAlias<"clrldi $rA, $rS, $n",
	(RLDICL_32_64 g8rc:$rA, gprc:$rS, 0, u6imm:$n)>;
	def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
	// lnia is ADDPCIS with the immediate fixed to 0.
	def : InstAlias<"lnia $RT", (ADDPCIS g8rc:$RT, 0)>;

	// Four-operand spellings of rlwinm/rlwimi/rlwnm (plain and "." forms) as
	// parse-only pseudos. The last operand $b is a single i32imm; going by the
	// "bm" suffix it is presumably a bit mask standing in for the two separate
	// mask-bound operands - confirm against the asm parser's handling.
	// NOTE(review): the rlwnm forms declare $n as u5imm like the immediate
	// forms; verify that is intended.
	def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
	def RLWINMobm : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
	def RLWIMIbm : PPCAsmPseudo<"rlwimi $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
	def RLWIMIobm : PPCAsmPseudo<"rlwimi. $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
	def RLWNMbm : PPCAsmPseudo<"rlwnm $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
	def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b",
	(ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;

	// These generic branch instruction forms are used for the assembler parser only.
	// Defs and Uses are conservative, since we don't know the BO value.
	let PPC970_Unit = 7, isBranch = 1 in {
	// Non-linking bc/bca: CTR is both used and defined, LR is untouched.
	let Defs = [CTR], Uses = [CTR, RM] in {
	def gBC : BForm_3<16, 0, 0, (outs),
	(ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
	"bc $bo, $bi, $dst">;
	def gBCA : BForm_3<16, 1, 0, (outs),
	(ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
	"bca $bo, $bi, $dst">;
	// The "at" variants carry an extra atimm:$at operand that is printed
	// directly after the mnemonic (bc$at / bca$at).
	let isAsmParserOnly = 1 in {
	def gBCat : BForm_3_at<16, 0, 0, (outs),
	(ins u5imm:$bo, atimm:$at, crbitrc:$bi,
	condbrtarget:$dst),
	"bc$at $bo, $bi, $dst">;
	def gBCAat : BForm_3_at<16, 1, 0, (outs),
	(ins u5imm:$bo, atimm:$at, crbitrc:$bi,
	abscondbrtarget:$dst),
	"bca$at $bo, $bi, $dst">;
	} // isAsmParserOnly = 1
	}
	// Linking bcl/bcla: same as above but LR is additionally defined.
	let Defs = [LR, CTR], Uses = [CTR, RM] in {
	def gBCL : BForm_3<16, 0, 1, (outs),
	(ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
	"bcl $bo, $bi, $dst">;
	def gBCLA : BForm_3<16, 1, 1, (outs),
	(ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
	"bcla $bo, $bi, $dst">;
	let isAsmParserOnly = 1 in {
	def gBCLat : BForm_3_at<16, 0, 1, (outs),
	(ins u5imm:$bo, atimm:$at, crbitrc:$bi,
	condbrtarget:$dst),
	"bcl$at $bo, $bi, $dst">;
	def gBCLAat : BForm_3_at<16, 1, 1, (outs),
	(ins u5imm:$bo, atimm:$at, crbitrc:$bi,
	abscondbrtarget:$dst),
	"bcla$at $bo, $bi, $dst">;
	} // isAsmParserOnly = 1
	}
	// bclr/bclrl and bcctr/bcctrl take an explicit $bh operand instead of a
	// branch target. All four list LR among their Uses; the linking ("l")
	// forms also define LR.
	let Defs = [CTR], Uses = [CTR, LR, RM] in
	def gBCLR : XLForm_2<19, 16, 0, (outs),
	(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
	"bclr $bo, $bi, $bh", IIC_BrB, []>;
	let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
	def gBCLRL : XLForm_2<19, 16, 1, (outs),
	(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
	"bclrl $bo, $bi, $bh", IIC_BrB, []>;
	let Defs = [CTR], Uses = [CTR, LR, RM] in
	def gBCCTR : XLForm_2<19, 528, 0, (outs),
	(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
	"bcctr $bo, $bi, $bh", IIC_BrB, []>;
	let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
	def gBCCTRL : XLForm_2<19, 528, 1, (outs),
	(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
	"bcctrl $bo, $bi, $bh", IIC_BrB, []>;
	}

	// Aliases for the generic conditional branches with a "+" or "-" suffix
	// on the mnemonic.
	//   pm - suffix appended to the mnemonic ("+" or "-").
	//   at - value passed as the $at operand of the g*at instruction.
	// The absolute forms (bca/bcla) use abscondbrtarget for $dst so that the
	// alias operand matches the operand declared on gBCAat/gBCLAat, in line
	// with how the gBCA/gBCLA aliases in BranchSimpleMnemonic1 are written.
	multiclass BranchSimpleMnemonicAT<string pm, int at> {
	def : InstAlias<"bc"#pm#" $bo, $bi, $dst", (gBCat u5imm:$bo, at, crbitrc:$bi,
	condbrtarget:$dst)>;
	def : InstAlias<"bca"#pm#" $bo, $bi, $dst", (gBCAat u5imm:$bo, at, crbitrc:$bi,
	abscondbrtarget:$dst)>;
	def : InstAlias<"bcl"#pm#" $bo, $bi, $dst", (gBCLat u5imm:$bo, at, crbitrc:$bi,
	condbrtarget:$dst)>;
	def : InstAlias<"bcla"#pm#" $bo, $bi, $dst", (gBCLAat u5imm:$bo, at, crbitrc:$bi,
	abscondbrtarget:$dst)>;
	}
	// "+" passes at = 3, "-" passes at = 2.
	defm : BranchSimpleMnemonicAT<"+", 3>;
	defm : BranchSimpleMnemonicAT<"-", 2>;

	// Two-operand bclr/bclrl/bcctr/bcctrl default the $bh operand to 0.
	def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
	def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
	def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>;
	def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>;

	// Simple branch mnemonics of the form b<name>[a|lr|l|la|lrl]<pm> that fix
	// the $bo operand of the generic branch instructions to `bo` and leave the
	// CR bit ($bi) to the user. The lr forms also fix $bh to 0.
	//   name - condition part of the mnemonic (e.g. "t", "f", "dnzt").
	//   pm   - optional suffix ("", "+" or "-").
	//   bo   - value for the $bo operand.
	multiclass BranchSimpleMnemonic1<string name, string pm, int bo> {
	def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>;
	def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
	def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>;
	def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>;
	def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
	def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>;
	}
	// Everything from BranchSimpleMnemonic1 plus the ctr/ctrl forms.
	multiclass BranchSimpleMnemonic2<string name, string pm, int bo>
	: BranchSimpleMnemonic1<name, pm, bo> {
	def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>;
	def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>;
	}
	// bt/bf: the "-" and "+" variants use bo + 2 and bo + 3 respectively
	// (12 -> 14/15, 4 -> 6/7).
	defm : BranchSimpleMnemonic2<"t", "", 12>;
	defm : BranchSimpleMnemonic2<"f", "", 4>;
	defm : BranchSimpleMnemonic2<"t", "-", 14>;
	defm : BranchSimpleMnemonic2<"f", "-", 6>;
	defm : BranchSimpleMnemonic2<"t", "+", 15>;
	defm : BranchSimpleMnemonic2<"f", "+", 7>;
	// The dnz*/dz* mnemonics instantiate only BranchSimpleMnemonic1, so they
	// get no ctr/ctrl forms and no "+"/"-" variants here.
	defm : BranchSimpleMnemonic1<"dnzt", "", 8>;
	defm : BranchSimpleMnemonic1<"dnzf", "", 0>;
	defm : BranchSimpleMnemonic1<"dzt", "", 10>;
	defm : BranchSimpleMnemonic1<"dzf", "", 2>;

	// Defines the extended-mnemonic aliases "b<name>[a|lr|ctr|l|la|lrl|ctrl]<pm>"
	// on top of the BCC* instructions. Every branch form gets two aliases: one
	// taking an explicit CR field ($cc) and one that omits it and uses CR0.
	//   name - condition part of the mnemonic (e.g. "lt", "ne").
	//   pm   - suffix appended after the branch form ("", "-" or "+").
	//   bibo - constant passed as the first operand of the BCC* instruction.
	multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
	// Relative branch.
	def : InstAlias<"b"#name#pm#" $cc, $dst",
	(BCC bibo, crrc:$cc, condbrtarget:$dst)>;
	def : InstAlias<"b"#name#pm#" $dst",
	(BCC bibo, CR0, condbrtarget:$dst)>;

	// Absolute branch.
	def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
	(BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
	def : InstAlias<"b"#name#"a"#pm#" $dst",
	(BCCA bibo, CR0, abscondbrtarget:$dst)>;

	// Branch to LR (no target operand).
	def : InstAlias<"b"#name#"lr"#pm#" $cc",
	(BCCLR bibo, crrc:$cc)>;
	def : InstAlias<"b"#name#"lr"#pm,
	(BCCLR bibo, CR0)>;

	// Branch to CTR (no target operand).
	def : InstAlias<"b"#name#"ctr"#pm#" $cc",
	(BCCCTR bibo, crrc:$cc)>;
	def : InstAlias<"b"#name#"ctr"#pm,
	(BCCCTR bibo, CR0)>;

	// The "l" variants of the four forms above.
	def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
	(BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
	def : InstAlias<"b"#name#"l"#pm#" $dst",
	(BCCL bibo, CR0, condbrtarget:$dst)>;

	def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
	(BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
	def : InstAlias<"b"#name#"la"#pm#" $dst",
	(BCCLA bibo, CR0, abscondbrtarget:$dst)>;

	def : InstAlias<"b"#name#"lrl"#pm#" $cc",
	(BCCLRL bibo, crrc:$cc)>;
	def : InstAlias<"b"#name#"lrl"#pm,
	(BCCLRL bibo, CR0)>;

	def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
	(BCCCTRL bibo, crrc:$cc)>;
	def : InstAlias<"b"#name#"ctrl"#pm,
	(BCCCTRL bibo, CR0)>;
	}
	// Expands one condition into its three suffix variants: the plain mnemonic
	// uses `bibo` as given, the "-" variant uses bibo+2 and the "+" variant uses
	// bibo+3.
	multiclass BranchExtendedMnemonic<string name, int bibo> {
	defm : BranchExtendedMnemonicPM<name, "", bibo>;
	defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>;
	defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>;
	}
	// Condition name -> bibo value. Several names are synonyms that share one
	// value: un/so (108), ge/nl (4), le/ng (36) and nu/ns (100).
	defm : BranchExtendedMnemonic<"lt", 12>;
	defm : BranchExtendedMnemonic<"gt", 44>;
	defm : BranchExtendedMnemonic<"eq", 76>;
	defm : BranchExtendedMnemonic<"un", 108>;
	defm : BranchExtendedMnemonic<"so", 108>;
	defm : BranchExtendedMnemonic<"ge", 4>;
	defm : BranchExtendedMnemonic<"nl", 4>;
	defm : BranchExtendedMnemonic<"le", 36>;
	defm : BranchExtendedMnemonic<"ng", 36>;
	defm : BranchExtendedMnemonic<"ne", 68>;
	defm : BranchExtendedMnemonic<"nu", 100>;
	defm : BranchExtendedMnemonic<"ns", 100>;

	// Compare aliases that omit the CR field: the result field defaults to CR0.
	def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
	def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
	def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
	def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
	def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>;
	def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
	def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>;
	def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;

	// Four-operand spellings "cmp[l][i] $bf, L, ...": a literal 0 in the second
	// position selects the word instruction (CMPW/CMPWI/CMPLW/CMPLWI on gprc)
	// and a literal 1 selects the doubleword one (CMPD/CMPDI/CMPLD/CMPLDI on
	// g8rc).
	def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
	def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
	def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
	def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
	def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>;
	def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
	def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
	def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;

	// Defines the four trap aliases for one condition: "td<name>i" / "td<name>"
	// on 64-bit registers (TDI / TD, g8rc) and "tw<name>i" / "tw<name>" on 32-bit
	// registers (TWI / TW, gprc). `to` is passed as the first operand of the
	// underlying instruction.
	multiclass TrapExtendedMnemonic<string name, int to> {
	def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>;
	def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
	def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>;
	def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>;
	}
	// Condition name -> `to` value. Synonyms share a value: ge/nl (12),
	// le/ng (20), lge/lnl (5) and lle/lng (6). "u" uses 31, i.e. all five low
	// bits set.
	defm : TrapExtendedMnemonic<"lt", 16>;
	defm : TrapExtendedMnemonic<"le", 20>;
	defm : TrapExtendedMnemonic<"eq", 4>;
	defm : TrapExtendedMnemonic<"ge", 12>;
	defm : TrapExtendedMnemonic<"gt", 8>;
	defm : TrapExtendedMnemonic<"nl", 12>;
	defm : TrapExtendedMnemonic<"ne", 24>;
	defm : TrapExtendedMnemonic<"ng", 20>;
	defm : TrapExtendedMnemonic<"llt", 2>;
	defm : TrapExtendedMnemonic<"lle", 6>;
	defm : TrapExtendedMnemonic<"lge", 5>;
	defm : TrapExtendedMnemonic<"lgt", 1>;
	defm : TrapExtendedMnemonic<"lnl", 5>;
	defm : TrapExtendedMnemonic<"lng", 6>;
	defm : TrapExtendedMnemonic<"u", 31>;

	// Atomic loads
	// 8/16/32-bit atomic loads select the plain zero-extending loads: the iaddr
	// form maps to LBZ/LHZ/LWZ (memri) and the xaddr form to LBZX/LHZX/LWZX
	// (memrr).
	def : Pat<(atomic_load_8 iaddr:$src), (LBZ memri:$src)>;
	def : Pat<(atomic_load_16 iaddr:$src), (LHZ memri:$src)>;
	def : Pat<(atomic_load_32 iaddr:$src), (LWZ memri:$src)>;
	def : Pat<(atomic_load_8 xaddr:$src), (LBZX memrr:$src)>;
	def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>;
	def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>;

	// Atomic stores
	// Likewise, 8/16/32-bit atomic stores of an i32 value select the plain
	// stores STB/STH/STW (iaddr) and STBX/STHX/STWX (xaddr).
	def : Pat<(atomic_store_8 iaddr:$ptr, i32:$val), (STB gprc:$val, memri:$ptr)>;
	def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH gprc:$val, memri:$ptr)>;
	def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>;
	def : Pat<(atomic_store_8 xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
	def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
	def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;

	// Instructions in this group are only available when the IsISA3_0 predicate
	// holds. None of them carries a selection pattern (the pattern lists are
	// empty).
	let Predicates = [IsISA3_0] in {

	// Copy-Paste Facility
	// We prefix 'CP' to COPY due to name conflict in Target.td. We also prefix to
	// PASTE for naming consistency.
	let mayLoad = 1 in
	def CP_COPY : X_L1_RA5_RB5<31, 774, "copy" , gprc, IIC_LdStCOPY, []>;

	let mayStore = 1 in
	def CP_PASTE : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>;

	// Record form of paste: same opcode numbers as CP_PASTE, marked isDOT, and
	// additionally defines CR0.
	let mayStore = 1, Defs = [CR0] in
	def CP_PASTEo : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isDOT;

	// Assembler-only pseudos for the two-register spellings.
	def CP_COPYx : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>;
	def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>;
	def CP_COPY_FIRST : PPCAsmPseudo<"copy_first $rA, $rB",
	(ins gprc:$rA, gprc:$rB)>;
	def CP_PASTE_LAST : PPCAsmPseudo<"paste_last $rA, $rB",
	(ins gprc:$rA, gprc:$rB)>;
	// Operand-less instruction (empty outs and ins).
	def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cp_abort", IIC_SprABORT, []>;

	// Message Synchronize
	def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;

	// Power-Saving Mode Instruction:
	def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;

	} // IsISA3_0

	// Fast 32-bit reverse bits algorithm:
	// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
	// n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA);
	// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit):
	// n = ((n >> 2) & 0x33333333) | ((n << 2) & 0xCCCCCCCC);
	// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
	// n = ((n >> 4) & 0x0F0F0F0F) | ((n << 4) & 0xF0F0F0F0);
	// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4]):
	// Step 4.1: Put B4,B2 in the right position (rotate left 3 bytes):
	// n' = (n rotl 24); After which n' = [B4, B1, B2, B3]
	// Step 4.2: Insert B3 to the right position:
	// n' = rlwimi n', n, 8, 8, 15; After which n' = [B4, B3, B2, B3]
	// Step 4.3: Insert B1 to the right position:
	// n' = rlwimi n', n, 8, 24, 31; After which n' = [B4, B3, B2, B1]
	// The records below are not instructions; each one only names dag fragments
	// so that the final pattern can be assembled step by step. $A is the input
	// operand bound by the Pat at the end.

	// 32-bit mask constants, each built as ORI(LIS hi16, lo16).
	def MaskValues {
	dag Lo1 = (ORI (LIS 0x5555), 0x5555);
	dag Hi1 = (ORI (LIS 0xAAAA), 0xAAAA);
	dag Lo2 = (ORI (LIS 0x3333), 0x3333);
	dag Hi2 = (ORI (LIS 0xCCCC), 0xCCCC);
	dag Lo4 = (ORI (LIS 0x0F0F), 0x0F0F);
	dag Hi4 = (ORI (LIS 0xF0F0), 0xF0F0);
	}

	// $A shifted right / left by one bit, expressed as rotate-and-mask.
	def Shift1 {
	dag Right = (RLWINM $A, 31, 1, 31);
	dag Left = (RLWINM $A, 1, 0, 30);
	}

	// Step 1: adjacent single bits swapped.
	def Swap1 {
	dag Bit = (OR (AND Shift1.Right, MaskValues.Lo1),
	(AND Shift1.Left, MaskValues.Hi1));
	}

	// Step-1 result shifted right / left by two bits.
	def Shift2 {
	dag Right = (RLWINM Swap1.Bit, 30, 2, 31);
	dag Left = (RLWINM Swap1.Bit, 2, 0, 29);
	}

	// Step 2: adjacent 2-bit groups swapped.
	def Swap2 {
	dag Bits = (OR (AND Shift2.Right, MaskValues.Lo2),
	(AND Shift2.Left, MaskValues.Hi2));
	}

	// Step-2 result shifted right / left by four bits.
	def Shift4 {
	dag Right = (RLWINM Swap2.Bits, 28, 4, 31);
	dag Left = (RLWINM Swap2.Bits, 4, 0, 27);
	}

	// Step 3: the two nibbles of every byte swapped.
	def Swap4 {
	dag Bits = (OR (AND Shift4.Right, MaskValues.Lo4),
	(AND Shift4.Left, MaskValues.Hi4));
	}

	// Step 4.1: rotate the word left by 24 bits.
	def Rotate {
	dag Left3Bytes = (RLWINM Swap4.Bits, 24, 0, 31);
	}

	// Step 4.2: insert bits 8..15 of (Swap4.Bits rotated left by 8).
	def RotateInsertByte3 {
	dag Left = (RLWIMI Rotate.Left3Bytes, Swap4.Bits, 8, 8, 15);
	}

	// Step 4.3: insert bits 24..31 of (Swap4.Bits rotated left by 8).
	def RotateInsertByte1 {
	dag Left = (RLWIMI RotateInsertByte3.Left, Swap4.Bits, 8, 24, 31);
	}

	// i32 bitreverse selects the whole chain above, wrapped in a final
	// RLDICL_32 with operands (0, 32).
	def : Pat<(i32 (bitreverse i32:$A)),
	(RLDICL_32 RotateInsertByte1.Left, 0, 32)>;

	// Fast 64-bit reverse bits algorithm:
	// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
	// n = ((n >> 1) & 0x5555555555555555) | ((n << 1) & 0xAAAAAAAAAAAAAAAA);
	// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit):
	// n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC);
	// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
	// n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0);
	// Step 4: byte reverse (Suppose n = [B0,B1,B2,B3,B4,B5,B6,B7]):
	// Apply the same byte reverse algorithm mentioned above for the fast 32-bit
	// reverse to both the high 32 bit and low 32 bit of the 64 bit value. And
	// then OR them together to get the final result.
	// As in the 32-bit version, these records only name dag fragments that the
	// final pattern stitches together. $A is the i64 input bound by the Pat.

	// The 32-bit masks from MaskValues placed in the sub_32 part of an otherwise
	// undefined 64-bit register.
	def MaskValues64 {
	dag Lo1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo1, sub_32));
	dag Hi1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi1, sub_32));
	dag Lo2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo2, sub_32));
	dag Hi2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi2, sub_32));
	dag Lo4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo4, sub_32));
	dag Hi4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi4, sub_32));
	}

	// Full 64-bit masks: RLDICR (32, 31) on the 32-bit mask, then ORIS8/ORI8
	// fill in the same 16-bit halves again.
	def DWMaskValues {
	dag Lo1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo1, 32, 31), 0x5555), 0x5555);
	dag Hi1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi1, 32, 31), 0xAAAA), 0xAAAA);
	dag Lo2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo2, 32, 31), 0x3333), 0x3333);
	dag Hi2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi2, 32, 31), 0xCCCC), 0xCCCC);
	dag Lo4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo4, 32, 31), 0x0F0F), 0x0F0F);
	dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0);
	}

	// Steps 1-3 on the doubleword: each SwapN feeds the next, with the shifts
	// expressed as RLDICL (right) and RLDICR (left).
	def DWSwapInByte {
	dag Swap1 = (OR8 (AND8 (RLDICL $A, 63, 1), DWMaskValues.Lo1),
	(AND8 (RLDICR $A, 1, 62), DWMaskValues.Hi1));
	dag Swap2 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap1, 62, 2), DWMaskValues.Lo2),
	(AND8 (RLDICR DWSwapInByte.Swap1, 2, 61), DWMaskValues.Hi2));
	dag Swap4 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap2, 60, 4), DWMaskValues.Lo4),
	(AND8 (RLDICR DWSwapInByte.Swap2, 4, 59), DWMaskValues.Hi4));
	}

	// Intra-byte swap is done, now start inter-byte swap.
	// The sub_32 word of the step-3 result.
	def DWBytes4567 {
	dag Word = (i32 (EXTRACT_SUBREG DWSwapInByte.Swap4, sub_32));
	}

	// Same three-instruction byte reversal as the 32-bit algorithm: rotate left
	// by 24, then two RLWIMI inserts.
	def DWBytes7456 {
	dag Word = (RLWINM DWBytes4567.Word, 24, 0, 31);
	}

	def DWBytes7656 {
	dag Word = (RLWIMI DWBytes7456.Word, DWBytes4567.Word, 8, 8, 15);
	}

	// B7 B6 B5 B4 in the right order
	def DWBytes7654 {
	dag Word = (RLWIMI DWBytes7656.Word, DWBytes4567.Word, 8, 24, 31);
	dag DWord =
	(i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes7654.Word, sub_32));
	}

	// The other word of the step-3 result, obtained by RLDICL (32, 32) before
	// extracting sub_32.
	def DWBytes0123 {
	dag Word = (i32 (EXTRACT_SUBREG (RLDICL DWSwapInByte.Swap4, 32, 32), sub_32));
	}

	def DWBytes3012 {
	dag Word = (RLWINM DWBytes0123.Word, 24, 0, 31);
	}

	def DWBytes3212 {
	dag Word = (RLWIMI DWBytes3012.Word, DWBytes0123.Word, 8, 8, 15);
	}

	// B3 B2 B1 B0 in the right order
	def DWBytes3210 {
	dag Word = (RLWIMI DWBytes3212.Word, DWBytes0123.Word, 8, 24, 31);
	dag DWord =
	(i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes3210.Word, sub_32));
	}

	// Now both high word and low word are reversed, next
	// swap the high word and low word.
	// DWBytes7654 is moved into the other half with RLDICR (32, 31) and OR'ed
	// with DWBytes3210.
	def : Pat<(i64 (bitreverse i64:$A)),
	(OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>;
	Index: vendor/llvm/dist-release_60/lib/Target/X86/AsmParser/X86AsmParser.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/X86/AsmParser/X86AsmParser.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/X86/AsmParser/X86AsmParser.cpp (revision 328362)
	@@ -1,3363 +1,3370 @@
	//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "InstPrinter/X86IntelInstPrinter.h"
	#include "MCTargetDesc/X86BaseInfo.h"
	#include "MCTargetDesc/X86TargetStreamer.h"
	#include "X86AsmInstrumentation.h"
	#include "X86AsmParserCommon.h"
	#include "X86Operand.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/MC/MCParser/MCAsmLexer.h"
	#include "llvm/MC/MCParser/MCAsmParser.h"
	#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <memory>

	using namespace llvm;

	/// Validate an address scale factor.
	///
	/// \param Scale  the candidate scale.
	/// \param ErrMsg set to a diagnostic string when the scale is rejected;
	///               left untouched otherwise.
	/// \returns true on error (Scale is not 1, 2, 4 or 8), false when valid.
	static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
	  switch (Scale) {
	  case 1:
	  case 2:
	  case 4:
	  case 8:
	    return false;
	  default:
	    ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
	    return true;
	  }
	}

	namespace {

	// Operator precedence table for the infix calculator. It is indexed by an
	// InfixCalculatorTok value, so the entries must stay in the same order as
	// that enum (each entry is tagged with its enumerator). A larger number means
	// the operator binds more tightly; operators with equal numbers share a
	// precedence level.
	static const char OpPrecedence[] = {
	0, // IC_OR
	1, // IC_XOR
	2, // IC_AND
	3, // IC_LSHIFT
	3, // IC_RSHIFT
	4, // IC_PLUS
	4, // IC_MINUS
	5, // IC_MULTIPLY
	5, // IC_DIVIDE
	5, // IC_MOD
	6, // IC_NOT
	7, // IC_NEG
	8, // IC_RPAREN
	9, // IC_LPAREN
	0, // IC_IMM
	0 // IC_REGISTER
	};

	class X86AsmParser : public MCTargetAsmParser {
	ParseInstructionInfo *InstInfo;
	std::unique_ptr<X86AsmInstrumentation> Instrumentation;
	bool Code16GCC;

	private:
	/// Lex past the current token and return the source location that token
	/// started at.
	SMLoc consumeToken() {
	  const SMLoc TokLoc = getParser().getTok().getLoc();
	  getParser().Lex();
	  return TokLoc;
	}

	/// Return the streamer's target streamer, downcast to X86TargetStreamer.
	/// A target streamer must be present; this is only checked by assertion.
	X86TargetStreamer &getTargetStreamer() {
	  MCTargetStreamer *Streamer = getParser().getStreamer().getTargetStreamer();
	  assert(Streamer && "do not have a target streamer");
	  return static_cast<X86TargetStreamer &>(*Streamer);
	}

	/// Run the generated matcher (MatchInstructionImpl) on \p Operands.
	///
	/// When Code16GCC is set, the parser is switched to 32-bit mode for the
	/// duration of the match and switched to 16-bit mode afterwards.
	///
	/// \returns whatever MatchInstructionImpl returned.
	unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
	                          uint64_t &ErrorInfo, bool matchingInlineAsm,
	                          unsigned VariantID = 0) {
	  // In Code16GCC mode, match as 32-bit.
	  if (Code16GCC) {
	    SwitchMode(X86::Mode32Bit);
	  }

	  const unsigned MatchResult = MatchInstructionImpl(
	      Operands, Inst, ErrorInfo, matchingInlineAsm, VariantID);

	  // Go back to 16-bit mode once matching is done.
	  if (Code16GCC) {
	    SwitchMode(X86::Mode16Bit);
	  }
	  return MatchResult;
	}

	// Token kinds understood by InfixCalculator: the binary and unary operators,
	// the two parentheses, and the two operand kinds (IC_IMM, IC_REGISTER).
	// The enumerator values are used as indices into OpPrecedence, so the order
	// here must match that table.
	enum InfixCalculatorTok {
	IC_OR = 0,
	IC_XOR,
	IC_AND,
	IC_LSHIFT,
	IC_RSHIFT,
	IC_PLUS,
	IC_MINUS,
	IC_MULTIPLY,
	IC_DIVIDE,
	IC_MOD,
	IC_NOT,
	IC_NEG,
	IC_RPAREN,
	IC_LPAREN,
	IC_IMM,
	IC_REGISTER
	};

	// Kinds of Intel-syntax operators; IOK_INVALID (0) is the "none" value.
	// NOTE(review): presumably the values returned by
	// IdentifyIntelInlineAsmOperator and consumed by ParseIntelInlineAsmOperator
	// (both declared below as taking/returning unsigned) — confirm against their
	// definitions.
	enum IntelOperatorKind {
	IOK_INVALID = 0,
	IOK_LENGTH,
	IOK_SIZE,
	IOK_TYPE,
	IOK_OFFSET
	};

	class InfixCalculator {
	typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
	SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
	SmallVector<ICToken, 4> PostfixStack;

	bool isUnaryOperator(const InfixCalculatorTok Op) {
	return Op == IC_NEG \|\| Op == IC_NOT;
	}

	public:
	int64_t popOperand() {
	assert (!PostfixStack.empty() && "Poped an empty stack!");
	ICToken Op = PostfixStack.pop_back_val();
	if (!(Op.first == IC_IMM \|\| Op.first == IC_REGISTER))
	return -1; // The invalid Scale value will be caught later by checkScale
	return Op.second;
	}
	void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
	assert ((Op == IC_IMM \|\| Op == IC_REGISTER) &&
	"Unexpected operand!");
	PostfixStack.push_back(std::make_pair(Op, Val));
	}

	void popOperator() { InfixOperatorStack.pop_back(); }
	void pushOperator(InfixCalculatorTok Op) {
	// Push the new operator if the stack is empty.
	if (InfixOperatorStack.empty()) {
	InfixOperatorStack.push_back(Op);
	return;
	}

	// Push the new operator if it has a higher precedence than the operator
	// on the top of the stack or the operator on the top of the stack is a
	// left parentheses.
	unsigned Idx = InfixOperatorStack.size() - 1;
	InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
	if (OpPrecedence[Op] > OpPrecedence[StackOp] \|\| StackOp == IC_LPAREN) {
	InfixOperatorStack.push_back(Op);
	return;
	}

	// The operator on the top of the stack has higher precedence than the
	// new operator.
	unsigned ParenCount = 0;
	while (1) {
	// Nothing to process.
	if (InfixOperatorStack.empty())
	break;

	Idx = InfixOperatorStack.size() - 1;
	StackOp = InfixOperatorStack[Idx];
	if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] \|\| ParenCount))
	break;

	// If we have an even parentheses count and we see a left parentheses,
	// then stop processing.
	if (!ParenCount && StackOp == IC_LPAREN)
	break;

	if (StackOp == IC_RPAREN) {
	++ParenCount;
	InfixOperatorStack.pop_back();
	} else if (StackOp == IC_LPAREN) {
	--ParenCount;
	InfixOperatorStack.pop_back();
	} else {
	InfixOperatorStack.pop_back();
	PostfixStack.push_back(std::make_pair(StackOp, 0));
	}
	}
	// Push the new operator.
	InfixOperatorStack.push_back(Op);
	}

	int64_t execute() {
	// Push any remaining operators onto the postfix stack.
	while (!InfixOperatorStack.empty()) {
	InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
	if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
	PostfixStack.push_back(std::make_pair(StackOp, 0));
	}

	if (PostfixStack.empty())
	return 0;

	SmallVector<ICToken, 16> OperandStack;
	for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
	ICToken Op = PostfixStack[i];
	if (Op.first == IC_IMM \|\| Op.first == IC_REGISTER) {
	OperandStack.push_back(Op);
	} else if (isUnaryOperator(Op.first)) {
	assert (OperandStack.size() > 0 && "Too few operands.");
	ICToken Operand = OperandStack.pop_back_val();
	assert (Operand.first == IC_IMM &&
	"Unary operation with a register!");
	switch (Op.first) {
	default:
	report_fatal_error("Unexpected operator!");
	break;
	case IC_NEG:
	OperandStack.push_back(std::make_pair(IC_IMM, -Operand.second));
	break;
	case IC_NOT:
	OperandStack.push_back(std::make_pair(IC_IMM, ~Operand.second));
	break;
	}
	} else {
	assert (OperandStack.size() > 1 && "Too few operands.");
	int64_t Val;
	ICToken Op2 = OperandStack.pop_back_val();
	ICToken Op1 = OperandStack.pop_back_val();
	switch (Op.first) {
	default:
	report_fatal_error("Unexpected operator!");
	break;
	case IC_PLUS:
	Val = Op1.second + Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_MINUS:
	Val = Op1.second - Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_MULTIPLY:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Multiply operation with an immediate and a register!");
	Val = Op1.second * Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_DIVIDE:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Divide operation with an immediate and a register!");
	assert (Op2.second != 0 && "Division by zero!");
	Val = Op1.second / Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_MOD:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Modulo operation with an immediate and a register!");
	Val = Op1.second % Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_OR:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Or operation with an immediate and a register!");
	Val = Op1.second \| Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_XOR:
	assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Xor operation with an immediate and a register!");
	Val = Op1.second ^ Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_AND:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"And operation with an immediate and a register!");
	Val = Op1.second & Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_LSHIFT:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Left shift operation with an immediate and a register!");
	Val = Op1.second << Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	case IC_RSHIFT:
	assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
	"Right shift operation with an immediate and a register!");
	Val = Op1.second >> Op2.second;
	OperandStack.push_back(std::make_pair(IC_IMM, Val));
	break;
	}
	}
	}
	assert (OperandStack.size() == 1 && "Expected a single result.");
	return OperandStack.pop_back_val().second;
	}
	};

	// States of IntelExprStateMachine. Apart from IES_INIT (the start state) and
	// IES_ERROR (entered on any token that is not accepted in the current state),
	// each state is named after the kind of token that was consumed last.
	enum IntelExprState {
	IES_INIT,
	IES_OR,
	IES_XOR,
	IES_AND,
	IES_LSHIFT,
	IES_RSHIFT,
	IES_PLUS,
	IES_MINUS,
	IES_NOT,
	IES_MULTIPLY,
	IES_DIVIDE,
	IES_MOD,
	IES_LBRAC,
	IES_RBRAC,
	IES_LPAREN,
	IES_RPAREN,
	IES_REGISTER,
	IES_INTEGER,
	IES_IDENTIFIER,
	IES_ERROR
	};

	class IntelExprStateMachine {
	IntelExprState State, PrevState;
	unsigned BaseReg, IndexReg, TmpReg, Scale;
	int64_t Imm;
	const MCExpr *Sym;
	StringRef SymName;
	InfixCalculator IC;
	InlineAsmIdentifierInfo Info;
	short BracCount;
	bool MemExpr;

	public:
	IntelExprStateMachine()
	: State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
	TmpReg(0), Scale(1), Imm(0), Sym(nullptr), BracCount(0),
	MemExpr(false) {}

	void addImm(int64_t imm) { Imm += imm; }
	short getBracCount() { return BracCount; }
	bool isMemExpr() { return MemExpr; }
	unsigned getBaseReg() { return BaseReg; }
	unsigned getIndexReg() { return IndexReg; }
	unsigned getScale() { return Scale; }
	const MCExpr *getSym() { return Sym; }
	StringRef getSymName() { return SymName; }
	int64_t getImm() { return Imm + IC.execute(); }
	bool isValidEndState() {
	return State == IES_RBRAC \|\| State == IES_INTEGER;
	}
	bool hadError() { return State == IES_ERROR; }
	InlineAsmIdentifierInfo &getIdentifierInfo() { return Info; }

	void onOr() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	case IES_REGISTER:
	State = IES_OR;
	IC.pushOperator(IC_OR);
	break;
	}
	PrevState = CurrState;
	}
	void onXor() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	case IES_REGISTER:
	State = IES_XOR;
	IC.pushOperator(IC_XOR);
	break;
	}
	PrevState = CurrState;
	}
	void onAnd() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	case IES_REGISTER:
	State = IES_AND;
	IC.pushOperator(IC_AND);
	break;
	}
	PrevState = CurrState;
	}
	void onLShift() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	case IES_REGISTER:
	State = IES_LSHIFT;
	IC.pushOperator(IC_LSHIFT);
	break;
	}
	PrevState = CurrState;
	}
	void onRShift() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	case IES_REGISTER:
	State = IES_RSHIFT;
	IC.pushOperator(IC_RSHIFT);
	break;
	}
	PrevState = CurrState;
	}
	bool onPlus(StringRef &ErrMsg) {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	case IES_REGISTER:
	State = IES_PLUS;
	IC.pushOperator(IC_PLUS);
	if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
	// If we already have a BaseReg, then assume this is the IndexReg with
	// a scale of 1.
	if (!BaseReg) {
	BaseReg = TmpReg;
	} else {
	if (IndexReg) {
	ErrMsg = "BaseReg/IndexReg already set!";
	return true;
	}
	IndexReg = TmpReg;
	Scale = 1;
	}
	}
	break;
	}
	PrevState = CurrState;
	return false;
	}
	bool onMinus(StringRef &ErrMsg) {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_OR:
	case IES_XOR:
	case IES_AND:
	case IES_LSHIFT:
	case IES_RSHIFT:
	case IES_PLUS:
	case IES_NOT:
	case IES_MULTIPLY:
	case IES_DIVIDE:
	case IES_MOD:
	case IES_LPAREN:
	case IES_RPAREN:
	case IES_LBRAC:
	case IES_RBRAC:
	case IES_INTEGER:
	case IES_REGISTER:
	case IES_INIT:
	State = IES_MINUS;
	// push minus operator if it is not a negate operator
	if (CurrState == IES_REGISTER \|\| CurrState == IES_RPAREN \|\|
	CurrState == IES_INTEGER \|\| CurrState == IES_RBRAC)
	IC.pushOperator(IC_MINUS);
	else if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
	// We have negate operator for Scale: it's illegal
	ErrMsg = "Scale can't be negative";
	return true;
	} else
	IC.pushOperator(IC_NEG);
	if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
	// If we already have a BaseReg, then assume this is the IndexReg with
	// a scale of 1.
	if (!BaseReg) {
	BaseReg = TmpReg;
	} else {
	if (IndexReg) {
	ErrMsg = "BaseReg/IndexReg already set!";
	return true;
	}
	IndexReg = TmpReg;
	Scale = 1;
	}
	}
	break;
	}
	PrevState = CurrState;
	return false;
	}
	void onNot() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_OR:
	case IES_XOR:
	case IES_AND:
	case IES_LSHIFT:
	case IES_RSHIFT:
	case IES_PLUS:
	case IES_MINUS:
	case IES_NOT:
	case IES_MULTIPLY:
	case IES_DIVIDE:
	case IES_MOD:
	case IES_LPAREN:
	case IES_LBRAC:
	case IES_INIT:
	State = IES_NOT;
	IC.pushOperator(IC_NOT);
	break;
	}
	PrevState = CurrState;
	}

	bool onRegister(unsigned Reg, StringRef &ErrMsg) {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_PLUS:
	case IES_LPAREN:
	case IES_LBRAC:
	State = IES_REGISTER;
	TmpReg = Reg;
	IC.pushOperand(IC_REGISTER);
	break;
	case IES_MULTIPLY:
	// Index Register - Scale * Register
	if (PrevState == IES_INTEGER) {
	if (IndexReg) {
	ErrMsg = "BaseReg/IndexReg already set!";
	return true;
	}
	State = IES_REGISTER;
	IndexReg = Reg;
	// Get the scale and replace the 'Scale * Register' with '0'.
	Scale = IC.popOperand();
	if (checkScale(Scale, ErrMsg))
	return true;
	IC.pushOperand(IC_IMM);
	IC.popOperator();
	} else {
	State = IES_ERROR;
	}
	break;
	}
	PrevState = CurrState;
	return false;
	}
	bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
	const InlineAsmIdentifierInfo &IDInfo,
	bool ParsingInlineAsm, StringRef &ErrMsg) {
	// InlineAsm: Treat an enum value as an integer
	if (ParsingInlineAsm)
	if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
	return onInteger(IDInfo.Enum.EnumVal, ErrMsg);
	// Treat a symbolic constant like an integer
	if (auto *CE = dyn_cast<MCConstantExpr>(SymRef))
	return onInteger(CE->getValue(), ErrMsg);
	PrevState = State;
	bool HasSymbol = Sym != nullptr;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_PLUS:
	case IES_MINUS:
	case IES_NOT:
	case IES_INIT:
	case IES_LBRAC:
	MemExpr = true;
	State = IES_INTEGER;
	Sym = SymRef;
	SymName = SymRefName;
	IC.pushOperand(IC_IMM);
	if (ParsingInlineAsm)
	Info = IDInfo;
	break;
	}
	if (HasSymbol)
	ErrMsg = "cannot use more than one symbol in memory operand";
	return HasSymbol;
	}
	bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_PLUS:
	case IES_MINUS:
	case IES_NOT:
	case IES_OR:
	case IES_XOR:
	case IES_AND:
	case IES_LSHIFT:
	case IES_RSHIFT:
	case IES_DIVIDE:
	case IES_MOD:
	case IES_MULTIPLY:
	case IES_LPAREN:
	case IES_INIT:
	case IES_LBRAC:
	State = IES_INTEGER;
	if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
	// Index Register - Register * Scale
	if (IndexReg) {
	ErrMsg = "BaseReg/IndexReg already set!";
	return true;
	}
	IndexReg = TmpReg;
	Scale = TmpInt;
	if (checkScale(Scale, ErrMsg))
	return true;
	// Get the scale and replace the 'Register * Scale' with '0'.
	IC.popOperator();
	} else {
	IC.pushOperand(IC_IMM, TmpInt);
	}
	break;
	}
	PrevState = CurrState;
	return false;
	}
	void onStar() {
	PrevState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_REGISTER:
	case IES_RPAREN:
	State = IES_MULTIPLY;
	IC.pushOperator(IC_MULTIPLY);
	break;
	}
	}
	void onDivide() {
	PrevState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	State = IES_DIVIDE;
	IC.pushOperator(IC_DIVIDE);
	break;
	}
	}
	void onMod() {
	PrevState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_RPAREN:
	State = IES_MOD;
	IC.pushOperator(IC_MOD);
	break;
	}
	}
	bool onLBrac() {
	if (BracCount)
	return true;
	PrevState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_RBRAC:
	case IES_INTEGER:
	case IES_RPAREN:
	State = IES_PLUS;
	IC.pushOperator(IC_PLUS);
	break;
	case IES_INIT:
	assert(!BracCount && "BracCount should be zero on parsing's start");
	State = IES_LBRAC;
	break;
	}
	MemExpr = true;
	BracCount++;
	return false;
	}
	bool onRBrac() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_REGISTER:
	case IES_RPAREN:
	if (BracCount-- != 1)
	return true;
	State = IES_RBRAC;
	if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
	// If we already have a BaseReg, then assume this is the IndexReg with
	// a scale of 1.
	if (!BaseReg) {
	BaseReg = TmpReg;
	} else {
	assert (!IndexReg && "BaseReg/IndexReg already set!");
	IndexReg = TmpReg;
	Scale = 1;
	}
	}
	break;
	}
	PrevState = CurrState;
	return false;
	}
	void onLParen() {
	IntelExprState CurrState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_PLUS:
	case IES_MINUS:
	case IES_NOT:
	case IES_OR:
	case IES_XOR:
	case IES_AND:
	case IES_LSHIFT:
	case IES_RSHIFT:
	case IES_MULTIPLY:
	case IES_DIVIDE:
	case IES_MOD:
	case IES_LPAREN:
	case IES_INIT:
	case IES_LBRAC:
	State = IES_LPAREN;
	IC.pushOperator(IC_LPAREN);
	break;
	}
	PrevState = CurrState;
	}
	void onRParen() {
	PrevState = State;
	switch (State) {
	default:
	State = IES_ERROR;
	break;
	case IES_INTEGER:
	case IES_REGISTER:
	case IES_RPAREN:
	State = IES_RPAREN;
	IC.pushOperator(IC_RPAREN);
	break;
	}
	}
	};

	/// Report a parse error at \p L.
	///
	/// Outside inline-asm matching this forwards to the generic parser's Error()
	/// and returns its result. While matching inline asm no diagnostic is
	/// emitted: the rest of the statement is skipped (unless the lexer is already
	/// at the start of a statement) and false is returned.
	bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
	           bool MatchingInlineAsm = false) {
	  MCAsmParser &Parser = getParser();
	  if (!MatchingInlineAsm)
	    return Parser.Error(L, Msg, Range);

	  const bool AtStatementStart = getLexer().isAtStartOfStatement();
	  if (!AtStatementStart)
	    Parser.eatToEndOfStatement();
	  return false;
	}

	// Report an error via Error() (its return value is ignored) and return
	// nullptr. The std::nullptr_t return type lets callers write
	// `return ErrorOperand(...)` from functions that return a smart pointer.
	std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
	Error(Loc, Msg);
	return nullptr;
	}

	std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
	std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
	bool IsSIReg(unsigned Reg);
	unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg);
	void
	AddDefaultSrcDestOperands(OperandVector &Operands,
	std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
	std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
	bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
	OperandVector &FinalOperands);
	std::unique_ptr<X86Operand> ParseOperand();
	std::unique_ptr<X86Operand> ParseATTOperand();
	std::unique_ptr<X86Operand> ParseIntelOperand();
	std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
	bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
	unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
	unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
	std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
	bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
	void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
	SMLoc End);
	bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
	bool ParseIntelInlineAsmIdentifier(const MCExpr *&Val, StringRef &Identifier,
	InlineAsmIdentifierInfo &Info,
	bool IsUnevaluatedOperand, SMLoc &End);

	std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);

	bool ParseIntelMemoryOperandSize(unsigned &Size);
	std::unique_ptr<X86Operand>
	CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
	unsigned IndexReg, unsigned Scale, SMLoc Start,
	SMLoc End, unsigned Size, StringRef Identifier,
	const InlineAsmIdentifierInfo &Info);

	bool parseDirectiveEven(SMLoc L);
	bool ParseDirectiveWord(unsigned Size, SMLoc L);
	bool ParseDirectiveCode(StringRef IDVal, SMLoc L);

	/// CodeView FPO data directives.
	bool parseDirectiveFPOProc(SMLoc L);
	bool parseDirectiveFPOSetFrame(SMLoc L);
	bool parseDirectiveFPOPushReg(SMLoc L);
	bool parseDirectiveFPOStackAlloc(SMLoc L);
	bool parseDirectiveFPOEndPrologue(SMLoc L);
	bool parseDirectiveFPOEndProc(SMLoc L);
	bool parseDirectiveFPOData(SMLoc L);

	bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
	bool processInstruction(MCInst &Inst, const OperandVector &Ops);

	/// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
	/// instrumentation around Inst.
	void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);

	bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
	OperandVector &Operands, MCStreamer &Out,
	uint64_t &ErrorInfo,
	bool MatchingInlineAsm) override;

	void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
	MCStreamer &Out, bool MatchingInlineAsm);

	bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
	bool MatchingInlineAsm);

	bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
	OperandVector &Operands, MCStreamer &Out,
	uint64_t &ErrorInfo,
	bool MatchingInlineAsm);

	bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
	OperandVector &Operands, MCStreamer &Out,
	uint64_t &ErrorInfo,
	bool MatchingInlineAsm);

	bool OmitRegisterFromClobberLists(unsigned RegNo) override;

	/// Parses AVX512-specific operand primitives: masked registers ({%k<NUM>},
	/// {z}) and memory broadcasting ({1to<NUM>}), updating the Operands vector
	/// if required.
	/// Returns false if no parsing errors occurred, true otherwise.
	bool HandleAVX512Operand(OperandVector &Operands,
	const MCParsedAsmOperand &Op);

	bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);

	/// True when the subtarget feature bits contain X86::Mode64Bit.
	bool is64BitMode() const {
	  // FIXME: Can tablegen auto-generate this?
	  const auto &Features = getSTI().getFeatureBits();
	  return Features[X86::Mode64Bit];
	}
	/// True when the subtarget feature bits contain X86::Mode32Bit.
	bool is32BitMode() const {
	  // FIXME: Can tablegen auto-generate this?
	  const auto &Features = getSTI().getFeatureBits();
	  return Features[X86::Mode32Bit];
	}
	/// True when the subtarget feature bits contain X86::Mode16Bit.
	bool is16BitMode() const {
	  // FIXME: Can tablegen auto-generate this?
	  const auto &Features = getSTI().getFeatureBits();
	  return Features[X86::Mode16Bit];
	}
	void SwitchMode(unsigned mode) {
	MCSubtargetInfo &STI = copySTI();
	FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
	FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
	uint64_t FB = ComputeAvailableFeatures(
	STI.ToggleFeature(OldMode.flip(mode)));
	setAvailableFeatures(FB);

	assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes));
	}

	/// Return 16, 32 or 64 according to the mode predicate that holds; the
	/// predicates are tried in that order.
	unsigned getPointerWidth() {
	  unsigned Width = 0;
	  if (is16BitMode())
	    Width = 16;
	  else if (is32BitMode())
	    Width = 32;
	  else if (is64BitMode())
	    Width = 64;
	  else
	    llvm_unreachable("invalid mode");
	  return Width;
	}

	/// True when the parser's assembler dialect is non-zero.
	bool isParsingIntelSyntax() {
	  const unsigned Dialect = getParser().getAssemblerDialect();
	  return Dialect != 0;
	}

	/// @name Auto-generated Matcher Functions
	/// {

	#define GET_ASSEMBLER_HEADER
	#include "X86GenAsmMatcher.inc"

	/// }

	public:

	/// Construct the parser: forward the options, subtarget info and instruction
	/// info to MCTargetAsmParser, compute the initial available-features mask
	/// and create the instrumentation object.
	X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
	             const MCInstrInfo &mii, const MCTargetOptions &Options)
	    : MCTargetAsmParser(Options, sti, mii), InstInfo(nullptr),
	      Code16GCC(false) {
	  // Initialize the set of available features.
	  const auto &InitialFeatures = getSTI().getFeatureBits();
	  setAvailableFeatures(ComputeAvailableFeatures(InitialFeatures));

	  auto &Ctx = Parser.getContext();
	  Instrumentation.reset(CreateX86AsmInstrumentation(Options, Ctx, STI));
	}

	bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;

	void SetFrameRegister(unsigned RegNo) override;

	bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
	SMLoc NameLoc, OperandVector &Operands) override;

	bool ParseDirective(AsmToken DirectiveID) override;
	};
	} // end anonymous namespace

	/// @name Auto-generated Match Functions
	/// {

	static unsigned MatchRegisterName(StringRef Name);

	/// }

	static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
	unsigned Scale, StringRef &ErrMsg) {
	// If we have both a base register and an index register make sure they are
	// both 64-bit or 32-bit registers.
	// To support VSIB, IndexReg can be 128-bit or 256-bit registers.

	if ((BaseReg == X86::RIP && IndexReg != 0) \|\| (IndexReg == X86::RIP)) {
	ErrMsg = "invalid base+index expression";
	return true;
	}
	if (BaseReg != 0 && IndexReg != 0) {
	if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
	(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) \|\|
	X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
	IndexReg != X86::RIZ) {
	ErrMsg = "base register is 64-bit, but index register is not";
	return true;
	}
	if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
	(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) \|\|
	X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
	IndexReg != X86::EIZ){
	ErrMsg = "base register is 32-bit, but index register is not";
	return true;
	}
	if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) {
	if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) \|\|
	X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) {
	ErrMsg = "base register is 16-bit, but index register is not";
	return true;
	}
	if (((BaseReg == X86::BX \|\| BaseReg == X86::BP) &&
	IndexReg != X86::SI && IndexReg != X86::DI) \|\|
	((BaseReg == X86::SI \|\| BaseReg == X86::DI) &&
	IndexReg != X86::BX && IndexReg != X86::BP)) {
	ErrMsg = "invalid 16-bit base/index register combination";
	return true;
	}
	}
	}
	return checkScale(Scale, ErrMsg);
	}

	bool X86AsmParser::ParseRegister(unsigned &RegNo,
	SMLoc &StartLoc, SMLoc &EndLoc) {
	MCAsmParser &Parser = getParser();
	RegNo = 0;
	const AsmToken &PercentTok = Parser.getTok();
	StartLoc = PercentTok.getLoc();

	// If we encounter a %, ignore it. This code handles registers with and
	// without the prefix, unprefixed registers can occur in cfi directives.
	if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
	Parser.Lex(); // Eat percent token.

	const AsmToken &Tok = Parser.getTok();
	EndLoc = Tok.getEndLoc();

	if (Tok.isNot(AsmToken::Identifier)) {
	if (isParsingIntelSyntax()) return true;
	return Error(StartLoc, "invalid register name",
	SMRange(StartLoc, EndLoc));
	}

	RegNo = MatchRegisterName(Tok.getString());

	// If the match failed, try the register name as lowercase.
	if (RegNo == 0)
	RegNo = MatchRegisterName(Tok.getString().lower());

	// The "flags" register cannot be referenced directly.
	// Treat it as an identifier instead.
	if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS)
	RegNo = 0;

	if (!is64BitMode()) {
	// FIXME: This should be done using Requires<Not64BitMode> and
	// Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
	// checked.
	// FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
	// REX prefix.
	if (RegNo == X86::RIZ \|\|
	X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) \|\|
	X86II::isX86_64NonExtLowByteReg(RegNo) \|\|
	X86II::isX86_64ExtendedReg(RegNo))
	return Error(StartLoc, "register %"
	+ Tok.getString() + " is only available in 64-bit mode",
	SMRange(StartLoc, EndLoc));
	} else if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) {
	if (X86II::is32ExtendedReg(RegNo))
	return Error(StartLoc, "register %"
	+ Tok.getString() + " is only available with AVX512",
	SMRange(StartLoc, EndLoc));
	}

	// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
	if (RegNo == 0 && (Tok.getString() == "st" \|\| Tok.getString() == "ST")) {
	RegNo = X86::ST0;
	Parser.Lex(); // Eat 'st'

	// Check to see if we have '(4)' after %st.
	if (getLexer().isNot(AsmToken::LParen))
	return false;
	// Lex the paren.
	getParser().Lex();

	const AsmToken &IntTok = Parser.getTok();
	if (IntTok.isNot(AsmToken::Integer))
	return Error(IntTok.getLoc(), "expected stack index");
	switch (IntTok.getIntVal()) {
	case 0: RegNo = X86::ST0; break;
	case 1: RegNo = X86::ST1; break;
	case 2: RegNo = X86::ST2; break;
	case 3: RegNo = X86::ST3; break;
	case 4: RegNo = X86::ST4; break;
	case 5: RegNo = X86::ST5; break;
	case 6: RegNo = X86::ST6; break;
	case 7: RegNo = X86::ST7; break;
	default: return Error(IntTok.getLoc(), "invalid stack index");
	}

	if (getParser().Lex().isNot(AsmToken::RParen))
	return Error(Parser.getTok().getLoc(), "expected ')'");

	EndLoc = Parser.getTok().getEndLoc();
	Parser.Lex(); // Eat ')'
	return false;
	}

	EndLoc = Parser.getTok().getEndLoc();

	// If this is "db[0-15]", match it as an alias
	// for dr[0-15].
	if (RegNo == 0 && Tok.getString().startswith("db")) {
	if (Tok.getString().size() == 3) {
	switch (Tok.getString()[2]) {
	case '0': RegNo = X86::DR0; break;
	case '1': RegNo = X86::DR1; break;
	case '2': RegNo = X86::DR2; break;
	case '3': RegNo = X86::DR3; break;
	case '4': RegNo = X86::DR4; break;
	case '5': RegNo = X86::DR5; break;
	case '6': RegNo = X86::DR6; break;
	case '7': RegNo = X86::DR7; break;
	case '8': RegNo = X86::DR8; break;
	case '9': RegNo = X86::DR9; break;
	}
	} else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') {
	switch (Tok.getString()[3]) {
	case '0': RegNo = X86::DR10; break;
	case '1': RegNo = X86::DR11; break;
	case '2': RegNo = X86::DR12; break;
	case '3': RegNo = X86::DR13; break;
	case '4': RegNo = X86::DR14; break;
	case '5': RegNo = X86::DR15; break;
	}
	}

	if (RegNo != 0) {
	EndLoc = Parser.getTok().getEndLoc();
	Parser.Lex(); // Eat it.
	return false;
	}
	}

	if (RegNo == 0) {
	if (isParsingIntelSyntax()) return true;
	return Error(StartLoc, "invalid register name",
	SMRange(StartLoc, EndLoc));
	}

	Parser.Lex(); // Eat identifier token.
	return false;
	}

	/// Forward \p RegNo to the instrumentation object as its initial frame
	/// register.
	void X86AsmParser::SetFrameRegister(unsigned RegNo) {
	  auto &Instr = *Instrumentation;
	  Instr.SetInitialFrameRegister(RegNo);
	}

	/// Create a memory operand at \p Loc with a zero displacement, no segment or
	/// index register, scale 1, and the mode's SI register as base: RSI in
	/// 64-bit mode, ESI in 32-bit mode or when Code16GCC is set, SI otherwise.
	std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
	  unsigned Basereg = X86::SI;
	  if (is64BitMode())
	    Basereg = X86::RSI;
	  else if (is32BitMode() || Code16GCC)
	    Basereg = X86::ESI;

	  const MCExpr *Disp = MCConstantExpr::create(0, getContext());
	  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
	                               /*BaseReg=*/Basereg, /*IndexReg=*/0,
	                               /*Scale=*/1, Loc, Loc, 0);
	}

	/// Create a memory operand at \p Loc with a zero displacement, no segment or
	/// index register, scale 1, and the mode's DI register as base: RDI in
	/// 64-bit mode, EDI in 32-bit mode or when Code16GCC is set, DI otherwise.
	std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
	  unsigned Basereg = X86::DI;
	  if (is64BitMode())
	    Basereg = X86::RDI;
	  else if (is32BitMode() || Code16GCC)
	    Basereg = X86::EDI;

	  const MCExpr *Disp = MCConstantExpr::create(0, getContext());
	  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
	                               /*BaseReg=*/Basereg, /*IndexReg=*/0,
	                               /*Scale=*/1, Loc, Loc, 0);
	}

	/// Classify \p Reg as a source-index register (RSI/ESI/SI, returns true) or
	/// a destination-index register (RDI/EDI/DI, returns false). Any other
	/// register is a programming error.
	bool X86AsmParser::IsSIReg(unsigned Reg) {
	  switch (Reg) {
	  default: llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!");
	  case X86::RSI:
	  case X86::ESI:
	  case X86::SI:
	    return true;
	  case X86::RDI:
	  case X86::EDI:
	  case X86::DI:
	    return false;
	  }
	}

	/// Return the SI register (when \p IsSIReg is true) or the DI register of
	/// the GR64, GR32 or GR16 class given by \p RegClassID. \p Reg is not used.
	/// Any other register class is a programming error.
	unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg,
	                                          bool IsSIReg) {
	  if (RegClassID == X86::GR64RegClassID) {
	    if (IsSIReg)
	      return X86::RSI;
	    return X86::RDI;
	  }
	  if (RegClassID == X86::GR32RegClassID) {
	    if (IsSIReg)
	      return X86::ESI;
	    return X86::EDI;
	  }
	  if (RegClassID == X86::GR16RegClassID) {
	    if (IsSIReg)
	      return X86::SI;
	    return X86::DI;
	  }
	  llvm_unreachable("Unexpected register class");
	}

	/// Append \p Src and \p Dst to \p Operands in the order the active syntax
	/// expects: destination first for Intel syntax, source first otherwise.
	void X86AsmParser::AddDefaultSrcDestOperands(
	    OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
	    std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
	  const bool DstFirst = isParsingIntelSyntax();
	  auto &First = DstFirst ? Dst : Src;
	  auto &Second = DstFirst ? Src : Dst;
	  Operands.push_back(std::move(First));
	  Operands.push_back(std::move(Second));
	}

	/// Reconcile the operands the user wrote with the default operands.
	///
	/// \p OrigOperands holds the instruction name followed by the parsed
	/// operands; \p FinalOperands holds the default operands. When
	/// \p OrigOperands contains explicit operands, each one is checked against
	/// its default: register operands must name the same register, and memory
	/// operands must all use base registers from one GR64/GR32/GR16 class. The
	/// size and segment register of each explicit memory operand are copied
	/// onto the default, whose base becomes the SI/DI register of that class.
	/// A warning is queued when that base differs from the one the user wrote.
	/// Finally the explicit operands are replaced by the adjusted defaults.
	///
	/// \returns true only when a diagnostic was emitted for mismatching base
	/// register classes. Returns false otherwise, including the early exits
	/// that leave \p OrigOperands untouched so that the normal matcher can
	/// complain about bogus operands.
	bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
	                                           OperandVector &FinalOperands) {

	  if (OrigOperands.size() > 1) {
	    // Check if sizes match, OrigOperands also contains the instruction name
	    assert(OrigOperands.size() == FinalOperands.size() + 1 &&
	           "Operand size mismatch");

	    SmallVector<std::pair<SMLoc, std::string>, 2> Warnings;
	    // Verify types match
	    int RegClassID = -1;
	    for (unsigned int i = 0; i < FinalOperands.size(); ++i) {
	      X86Operand &OrigOp = static_cast<X86Operand &>(*OrigOperands[i + 1]);
	      X86Operand &FinalOp = static_cast<X86Operand &>(*FinalOperands[i]);

	      if (FinalOp.isReg() &&
	          (!OrigOp.isReg() || FinalOp.getReg() != OrigOp.getReg()))
	        // Return false and let a normal complaint about bogus operands happen
	        return false;

	      if (!FinalOp.isMem())
	        continue;

	      if (!OrigOp.isMem())
	        // Return false and let a normal complaint about bogus operands happen
	        return false;

	      unsigned OrigReg = OrigOp.Mem.BaseReg;
	      unsigned FinalReg = FinalOp.Mem.BaseReg;

	      // If we've already encountered a register class, make sure all register
	      // bases are of the same register class
	      if (RegClassID != -1 &&
	          !X86MCRegisterClasses[RegClassID].contains(OrigReg)) {
	        return Error(OrigOp.getStartLoc(),
	                     "mismatching source and destination index registers");
	      }

	      if (X86MCRegisterClasses[X86::GR64RegClassID].contains(OrigReg))
	        RegClassID = X86::GR64RegClassID;
	      else if (X86MCRegisterClasses[X86::GR32RegClassID].contains(OrigReg))
	        RegClassID = X86::GR32RegClassID;
	      else if (X86MCRegisterClasses[X86::GR16RegClassID].contains(OrigReg))
	        RegClassID = X86::GR16RegClassID;
	      else
	        // Unexpected register class type
	        // Return false and let a normal complaint about bogus operands happen
	        return false;

	      bool IsSI = IsSIReg(FinalReg);
	      FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI);

	      if (FinalReg != OrigReg) {
	        std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI";
	        Warnings.push_back(std::make_pair(
	            OrigOp.getStartLoc(),
	            "memory operand is only for determining the size, " + RegName +
	                " will be used for the location"));
	      }

	      FinalOp.Mem.Size = OrigOp.Mem.Size;
	      FinalOp.Mem.SegReg = OrigOp.Mem.SegReg;
	      FinalOp.Mem.BaseReg = FinalReg;
	    }

	    // Produce warnings only if all the operands passed the adjustment - prevent
	    // legal cases like "movsd (%rax), %xmm0" mistakenly produce warnings
	    for (auto &WarningMsg : Warnings) {
	      Warning(WarningMsg.first, WarningMsg.second);
	    }

	    // Remove old operands
	    for (unsigned int i = 0; i < FinalOperands.size(); ++i)
	      OrigOperands.pop_back();
	  }

	  // Hand ownership of the (possibly adjusted) default operands to the caller's
	  // operand list.
	  for (unsigned int i = 0; i < FinalOperands.size(); ++i)
	    OrigOperands.push_back(std::move(FinalOperands[i]));

	  return false;
	}

	/// Parse a single operand with the parser that matches the active syntax.
	std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
	  return isParsingIntelSyntax() ? ParseIntelOperand() : ParseATTOperand();
	}

	/// Create the memory operand for an identifier referenced from MS inline
	/// assembly.
	///
	/// Labels become absolute memory references. If \p Size is 0 for a label,
	/// the pointer width is used and a size-directive rewrite is recorded.
	/// Global variables combined with a base or index register become plain
	/// displacement references. Everything else becomes a full memory operand
	/// whose base register is forced to a non-zero placeholder when none was
	/// given.
	std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
	    unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
	    unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
	    const InlineAsmIdentifierInfo &Info) {
	  // If we found a decl other than a VarDecl, then assume it is a FuncDecl or
	  // some other label reference.
	  if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) {
	    // Insert an explicit size if the user didn't have one.
	    if (!Size) {
	      Size = getPointerWidth();
	      InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
	                                          /*Len=*/0, Size);
	    }
	    // Create an absolute memory reference in order to match against
	    // instructions taking a PC relative operand.
	    return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size,
	                                 Identifier, Info.Label.Decl);
	  }
	  // We either have a direct symbol reference, or an offset from a symbol. The
	  // parser always puts the symbol on the LHS, so look there for size
	  // calculation purposes.
	  unsigned FrontendSize = 0;
	  void *Decl = nullptr;
	  bool IsGlobalLV = false;
	  if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
	    // Size is in terms of bits in this context.
	    FrontendSize = Info.Var.Type * 8;
	    Decl = Info.Var.Decl;
	    IsGlobalLV = Info.Var.IsGlobalLV;
	  }
	  // It is widely common for MS InlineAsm to use a global variable and one/two
	  // registers in a memory expression, and though unaccessible via rip/eip.
	  if (IsGlobalLV && (BaseReg || IndexReg))
	    return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End);

	  // Otherwise, we set the base register to a non-zero value
	  // if we don't know the actual value at this time. This is necessary to
	  // get the matching correct in some cases.
	  BaseReg = BaseReg ? BaseReg : 1;
	  return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
	                               IndexReg, Scale, Start, End, Size, Identifier,
	                               Decl, FrontendSize);
	}

	// Some operators also have a spelled-out synonym ("not", "or", "shl", "shr",
	// "xor", "and", "mod"). If Name is one of them, written entirely in lower
	// case or entirely in upper case, notify the state machine and return true;
	// otherwise return false without touching it.
	bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM) {
	  const std::string Lower = Name.lower();

	  // A named operator should be either lower or upper case, but not a mix
	  const bool IsAllLower = Name.compare(Lower) == 0;
	  const bool IsAllUpper = Name.compare(Name.upper()) == 0;
	  if (!IsAllLower && !IsAllUpper)
	    return false;

	  if (Lower == "not") {
	    SM.onNot();
	    return true;
	  }
	  if (Lower == "or") {
	    SM.onOr();
	    return true;
	  }
	  if (Lower == "shl") {
	    SM.onLShift();
	    return true;
	  }
	  if (Lower == "shr") {
	    SM.onRShift();
	    return true;
	  }
	  if (Lower == "xor") {
	    SM.onXor();
	    return true;
	  }
	  if (Lower == "and") {
	    SM.onAnd();
	    return true;
	  }
	  if (Lower == "mod") {
	    SM.onMod();
	    return true;
	  }
	  return false;
	}

	/// Drive the Intel expression state machine \p SM over the token stream.
	///
	/// Tokens are consumed until the end of the statement, or until a token that
	/// is not part of an expression is reached while \p SM is in a valid end
	/// state. \p End is updated as tokens are consumed.
	///
	/// \returns true if an error occurred, false otherwise.
	bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();
	  StringRef ErrMsg;

	  AsmToken::TokenKind PrevTK = AsmToken::Error;
	  bool Done = false;
	  while (!Done) {
	    // Cases that consume their own tokens clear this flag.
	    bool UpdateLocLex = true;
	    AsmToken::TokenKind TK = getLexer().getKind();

	    switch (TK) {
	    default:
	      if ((Done = SM.isValidEndState()))
	        break;
	      return Error(Tok.getLoc(), "unknown token in expression");
	    case AsmToken::EndOfStatement:
	      Done = true;
	      break;
	    case AsmToken::Real:
	      // DotOperator: [ebx].0
	      UpdateLocLex = false;
	      if (ParseIntelDotOperator(SM, End))
	        return true;
	      break;
	    case AsmToken::String:
	    case AsmToken::Identifier: {
	      SMLoc IdentLoc = Tok.getLoc();
	      StringRef Identifier = Tok.getString();
	      UpdateLocLex = false;
	      // Register
	      unsigned Reg;
	      if (Tok.isNot(AsmToken::String) && !ParseRegister(Reg, IdentLoc, End)) {
	        if (SM.onRegister(Reg, ErrMsg))
	          return Error(Tok.getLoc(), ErrMsg);
	        break;
	      }
	      // Operator synonymous ("not", "or" etc.)
	      if ((UpdateLocLex = ParseIntelNamedOperator(Identifier, SM)))
	        break;
	      // Symbol reference, when parsing assembly content
	      InlineAsmIdentifierInfo Info;
	      const MCExpr *Val;
	      if (!isParsingInlineAsm()) {
	        if (getParser().parsePrimaryExpr(Val, End)) {
	          return Error(Tok.getLoc(), "Unexpected identifier!");
	        } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
	          return Error(IdentLoc, ErrMsg);
	        } else
	          break;
	      }
	      // MS InlineAsm operators (TYPE/LENGTH/SIZE)
	      if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
	        if (OpKind == IOK_OFFSET)
	          return Error(IdentLoc, "Dealing OFFSET operator as part of "
	            "a compound immediate expression is yet to be supported");
	        // A zero result is treated as failure; the callee has already
	        // reported the diagnostic when the lookup failed.
	        if (int64_t OpVal = ParseIntelInlineAsmOperator(OpKind)) {
	          if (SM.onInteger(OpVal, ErrMsg))
	            return Error(IdentLoc, ErrMsg);
	        } else
	          return true;
	        break;
	      }
	      // MS Dot Operator expression
	      if (Identifier.count('.') && PrevTK == AsmToken::RBrac) {
	        if (ParseIntelDotOperator(SM, End))
	          return true;
	        break;
	      }
	      // MS InlineAsm identifier
	      if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
	        return true;
	      else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
	        return Error(IdentLoc, ErrMsg);
	      break;
	    }
	    case AsmToken::Integer: {
	      // Look for 'b' or 'f' following an Integer as a directional label
	      SMLoc Loc = getTok().getLoc();
	      int64_t IntVal = getTok().getIntVal();
	      End = consumeToken();
	      UpdateLocLex = false;
	      if (getLexer().getKind() == AsmToken::Identifier) {
	        StringRef IDVal = getTok().getString();
	        if (IDVal == "f" || IDVal == "b") {
	          MCSymbol *Sym =
	              getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
	          MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
	          const MCExpr *Val =
	              MCSymbolRefExpr::create(Sym, Variant, getContext());
	          if (IDVal == "b" && Sym->isUndefined())
	            return Error(Loc, "invalid reference to undefined symbol");
	          StringRef Identifier = Sym->getName();
	          InlineAsmIdentifierInfo Info;
	          if (SM.onIdentifierExpr(Val, Identifier, Info,
	                                  isParsingInlineAsm(), ErrMsg))
	            return Error(Loc, ErrMsg);
	          End = consumeToken();
	        } else {
	          if (SM.onInteger(IntVal, ErrMsg))
	            return Error(Loc, ErrMsg);
	        }
	      } else {
	        if (SM.onInteger(IntVal, ErrMsg))
	          return Error(Loc, ErrMsg);
	      }
	      break;
	    }
	    case AsmToken::Plus:
	      if (SM.onPlus(ErrMsg))
	        return Error(getTok().getLoc(), ErrMsg);
	      break;
	    case AsmToken::Minus:
	      if (SM.onMinus(ErrMsg))
	        return Error(getTok().getLoc(), ErrMsg);
	      break;
	    case AsmToken::Tilde:   SM.onNot(); break;
	    case AsmToken::Star:    SM.onStar(); break;
	    case AsmToken::Slash:   SM.onDivide(); break;
	    case AsmToken::Percent: SM.onMod(); break;
	    case AsmToken::Pipe:    SM.onOr(); break;
	    case AsmToken::Caret:   SM.onXor(); break;
	    case AsmToken::Amp:     SM.onAnd(); break;
	    case AsmToken::LessLess:
	                            SM.onLShift(); break;
	    case AsmToken::GreaterGreater:
	                            SM.onRShift(); break;
	    case AsmToken::LBrac:
	      if (SM.onLBrac())
	        return Error(Tok.getLoc(), "unexpected bracket encountered");
	      break;
	    case AsmToken::RBrac:
	      if (SM.onRBrac())
	        return Error(Tok.getLoc(), "unexpected bracket encountered");
	      break;
	    case AsmToken::LParen:  SM.onLParen(); break;
	    case AsmToken::RParen:  SM.onRParen(); break;
	    }
	    if (SM.hadError())
	      return Error(Tok.getLoc(), "unknown token in expression");

	    if (!Done && UpdateLocLex)
	      End = consumeToken();

	    // Remembered so the dot-operator check can see that ']' came just before.
	    PrevTK = TK;
	  }
	  return false;
	}

	/// Record the inline-assembly rewrites for the Intel expression spanning
	/// [\p Start, \p End).
	///
	/// If the expression contains a symbol, the text before it is skipped. If
	/// the symbol is all there is (no base register, index register or
	/// immediate), the text after it is skipped too and nothing else is
	/// emitted. Otherwise an IntelExpr rewrite built from the state machine's
	/// registers, scale and immediate covers the remaining text.
	void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM,
	                                          SMLoc Start, SMLoc End) {
	  SMLoc Loc = Start;
	  unsigned ExprLen = End.getPointer() - Start.getPointer();
	  // Skip everything before a symbol displacement (if we have one)
	  if (SM.getSym()) {
	    StringRef SymName = SM.getSymName();
	    if (unsigned Len = SymName.data() - Start.getPointer())
	      InstInfo->AsmRewrites->emplace_back(AOK_Skip, Start, Len);
	    Loc = SMLoc::getFromPointer(SymName.data() + SymName.size());
	    ExprLen = End.getPointer() - (SymName.data() + SymName.size());
	    // If we have only a symbol than there's no need for complex rewrite,
	    // simply skip everything after it
	    if (!(SM.getBaseReg() || SM.getIndexReg() || SM.getImm())) {
	      if (ExprLen)
	        InstInfo->AsmRewrites->emplace_back(AOK_Skip, Loc, ExprLen);
	      return;
	    }
	  }
	  // Build an Intel Expression rewrite
	  StringRef BaseRegStr;
	  StringRef IndexRegStr;
	  if (SM.getBaseReg())
	    BaseRegStr = X86IntelInstPrinter::getRegisterName(SM.getBaseReg());
	  if (SM.getIndexReg())
	    IndexRegStr = X86IntelInstPrinter::getRegisterName(SM.getIndexReg());
	  // Emit it
	  IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), SM.getImm(),
	                 SM.isMemExpr());
	  InstInfo->AsmRewrites->emplace_back(Loc, ExprLen, Expr);
	}

	// Inline assembly may use variable names with namespace alias qualifiers.
	//
	// Asks the frontend (SemaCallback) to resolve the identifier starting at
	// Identifier, consumes every token the frontend claimed, and updates
	// Identifier to the claimed text and End to the end of the last consumed
	// token. If the lookup fails, the name is treated as a label and a rewrite
	// to its internal name is recorded. Val receives a symbol reference for the
	// identifier, except for enum values, where it is left null. Only valid
	// while parsing inline assembly. Always returns false.
	bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val,
	                                                 StringRef &Identifier,
	                                                 InlineAsmIdentifierInfo &Info,
	                                                 bool IsUnevaluatedOperand,
	                                                 SMLoc &End) {
	  MCAsmParser &Parser = getParser();
	  assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
	  Val = nullptr;

	  StringRef LineBuf(Identifier.data());
	  SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);

	  const AsmToken &Tok = Parser.getTok();
	  SMLoc Loc = Tok.getLoc();

	  // Advance the token stream until the end of the current token is
	  // after the end of what the frontend claimed.
	  const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
	  do {
	    End = Tok.getEndLoc();
	    getLexer().Lex();
	  } while (End.getPointer() < EndPtr);
	  Identifier = LineBuf;

	  // The frontend should end parsing on an assembler token boundary, unless it
	  // failed parsing.
	  assert((End.getPointer() == EndPtr ||
	          Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) &&
	          "frontend claimed part of a token?");

	  // If the identifier lookup was unsuccessful, assume that we are dealing with
	  // a label.
	  if (Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) {
	    StringRef InternalName =
	      SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
	                                         Loc, false);
	    assert(InternalName.size() && "We should have an internal name here.");
	    // Push a rewrite for replacing the identifier name with the internal name.
	    InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
	                                        InternalName);
	  } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
	    return false;
	  // Create the symbol reference.
	  MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
	  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
	  Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext());
	  return false;
	}

	/// ParseRoundingModeOp - Parse AVX-512 rounding mode operand.
	///
	/// Expects the current token to be '{'. Accepts "{rn-...}", "{rd-...}",
	/// "{ru-...}" and "{rz-...}" (the token after '-' is consumed without being
	/// inspected), producing an immediate operand that holds the rounding mode,
	/// and "{sae}", producing a "{sae}" token operand. Anything else yields an
	/// error operand.
	std::unique_ptr<X86Operand>
	X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();
	  // Eat "{" and mark the current place.
	  const SMLoc consumedToken = consumeToken();
	  // getIdentifier() below is only meaningful for identifier tokens, so reject
	  // anything else (e.g. "{1}") with a diagnostic first.
	  if (Tok.isNot(AsmToken::Identifier))
	    return ErrorOperand(Tok.getLoc(), "Expected an identifier after {");
	  if (Tok.getIdentifier().startswith("r")){
	    int rndMode = StringSwitch<int>(Tok.getIdentifier())
	      .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
	      .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF)
	      .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF)
	      .Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
	      .Default(-1);
	    if (-1 == rndMode)
	      return ErrorOperand(Tok.getLoc(), "Invalid rounding mode.");
	    Parser.Lex();  // Eat "r*" of r*-sae
	    if (!getLexer().is(AsmToken::Minus))
	      return ErrorOperand(Tok.getLoc(), "Expected - at this point");
	    Parser.Lex();  // Eat "-"
	    Parser.Lex();  // Eat the sae
	    if (!getLexer().is(AsmToken::RCurly))
	      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
	    Parser.Lex();  // Eat "}"
	    const MCExpr *RndModeOp =
	      MCConstantExpr::create(rndMode, Parser.getContext());
	    return X86Operand::CreateImm(RndModeOp, Start, End);
	  }
	  if(Tok.getIdentifier().equals("sae")){
	    Parser.Lex();  // Eat the sae
	    if (!getLexer().is(AsmToken::RCurly))
	      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
	    Parser.Lex();  // Eat "}"
	    return X86Operand::CreateToken("{sae}", consumedToken);
	  }
	  return ErrorOperand(Tok.getLoc(), "unknown token in expression");
	}

	/// Parse the '.' operator.
	///
	/// The current token is either a real literal such as ".4" (a numeric
	/// offset) or, in inline assembly, an identifier of the form "base.member"
	/// that is resolved through the frontend callback. The resulting offset is
	/// added to the immediate of \p SM and the tokens making up the dot
	/// expression are consumed.
	///
	/// \returns true if an error was reported, false otherwise.
	bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) {
	  const AsmToken &Tok = getTok();
	  unsigned Offset;

	  // Drop the optional '.'.
	  StringRef DotDispStr = Tok.getString();
	  if (DotDispStr.startswith("."))
	    DotDispStr = DotDispStr.drop_front(1);

	  // .Imm gets lexed as a real.
	  if (Tok.is(AsmToken::Real)) {
	    APInt DotDisp;
	    // Reject text that is not a plain decimal integer (or does not fit in 64
	    // bits) instead of continuing with whatever value was left in DotDisp.
	    if (DotDispStr.getAsInteger(10, DotDisp) || !DotDisp.isIntN(64))
	      return Error(Tok.getLoc(), "Unable to parse dot operator offset!");
	    Offset = DotDisp.getZExtValue();
	  } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
	    std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
	    if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
	                                           Offset))
	      return Error(Tok.getLoc(), "Unable to lookup field reference!");
	  } else
	    return Error(Tok.getLoc(), "Unexpected token type!");

	  // Eat the DotExpression and update End
	  End = SMLoc::getFromPointer(DotDispStr.data());
	  const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size();
	  while (Tok.getLoc().getPointer() < DotExprEndLoc)
	    Lex();
	  SM.addImm(Offset);
	  return false;
	}

	/// Parse the 'offset' operator. This operator is used to specify the
	/// location rather than the content of a variable.
	///
	/// Consumes the operator and the identifier after it, records a rewrite that
	/// skips the operator text, and returns a register operand that carries the
	/// identifier and its declaration. Enum constants are rejected with an error
	/// operand. Returns nullptr if the identifier could not be parsed.
	std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();
	  SMLoc OffsetOfLoc = Tok.getLoc();
	  Parser.Lex(); // Eat offset.

	  const MCExpr *Val;
	  InlineAsmIdentifierInfo Info;
	  SMLoc Start = Tok.getLoc(), End;
	  StringRef Identifier = Tok.getString();
	  if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
	                                    /*Unevaluated=*/false, End))
	    return nullptr;

	  void *Decl = nullptr;
	  // FIXME: MS evaluates "offset <Constant>" to the underlying integral
	  if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
	    return ErrorOperand(Start, "offset operator cannot yet handle constants");
	  else if (Info.isKind(InlineAsmIdentifierInfo::IK_Var))
	    Decl = Info.Var.Decl;
	  // Don't emit the offset operator.
	  InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7);

	  // The offset operator will have an 'r' constraint, thus we need to create
	  // register operand to ensure proper matching.  Just pick a GPR based on
	  // the size of a pointer.
	  bool Parse32 = is32BitMode() || Code16GCC;
	  unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX);

	  return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
	                               OffsetOfLoc, Identifier, Decl);
	}

	// Classify Name as one of the Intel inline-assembly operators TYPE, SIZE,
	// LENGTH or OFFSET. Each is recognised spelled entirely in upper case or
	// entirely in lower case. Returns the matching IOK_* kind, or IOK_INVALID
	// when Name is not a known operator.
	unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
	  return StringSwitch<unsigned>(Name)
	    .Case("TYPE", IOK_TYPE)
	    .Case("type", IOK_TYPE)
	    .Case("SIZE", IOK_SIZE)
	    .Case("size", IOK_SIZE)
	    .Case("LENGTH", IOK_LENGTH)
	    .Case("length", IOK_LENGTH)
	    .Case("OFFSET", IOK_OFFSET)
	    .Case("offset", IOK_OFFSET)
	    .Default(IOK_INVALID);
	}

	/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.  The LENGTH operator
	/// returns the number of elements in an array.  It returns the value 1 for
	/// non-array variables.  The SIZE operator returns the size of a C or C++
	/// variable.  A variable's size is the product of its LENGTH and TYPE.  The
	/// TYPE operator returns the size of a C or C++ type or variable. If the
	/// variable is an array, TYPE returns the size of a single element.
	///
	/// \param OpKind one of IOK_LENGTH, IOK_SIZE or IOK_TYPE.
	/// \returns the operator's value, or 0 if the identifier could not be
	/// parsed or does not refer to a variable. In the latter case an error is
	/// reported.
	unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();
	  Parser.Lex(); // Eat operator.

	  const MCExpr *Val = nullptr;
	  InlineAsmIdentifierInfo Info;
	  SMLoc Start = Tok.getLoc(), End;
	  StringRef Identifier = Tok.getString();
	  if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
	                                    /*Unevaluated=*/true, End))
	    return 0;

	  if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
	    Error(Start, "unable to lookup expression");
	    return 0;
	  }

	  unsigned CVal = 0;
	  switch(OpKind) {
	  default: llvm_unreachable("Unexpected operand kind!");
	  case IOK_LENGTH: CVal = Info.Var.Length; break;
	  case IOK_SIZE: CVal = Info.Var.Size; break;
	  case IOK_TYPE: CVal = Info.Var.Type; break;
	  }

	  return CVal;
	}

	/// Parse an optional Intel size directive such as "dword ptr".
	///
	/// If the current token is a size keyword (all upper or all lower case),
	/// \p Size is set to the value listed for it below, and the keyword and the
	/// mandatory "PTR"/"ptr" after it are consumed. Otherwise \p Size is set to
	/// 0 and nothing is consumed.
	///
	/// \returns true if a size keyword was not followed by "PTR"/"ptr", false
	/// otherwise.
	bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
	  Size = StringSwitch<unsigned>(getTok().getString())
	    .Cases("BYTE", "byte", 8)
	    .Cases("WORD", "word", 16)
	    .Cases("DWORD", "dword", 32)
	    .Cases("FLOAT", "float", 32)
	    .Cases("LONG", "long", 32)
	    .Cases("FWORD", "fword", 48)
	    .Cases("DOUBLE", "double", 64)
	    .Cases("QWORD", "qword", 64)
	    .Cases("MMWORD","mmword", 64)
	    .Cases("XWORD", "xword", 80)
	    .Cases("TBYTE", "tbyte", 80)
	    .Cases("XMMWORD", "xmmword", 128)
	    .Cases("YMMWORD", "ymmword", 256)
	    .Cases("ZMMWORD", "zmmword", 512)
	    .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
	    .Default(0);
	  if (Size) {
	    const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
	    if (!(Tok.getString().equals("PTR") || Tok.getString().equals("ptr")))
	      return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
	    Lex(); // Eat ptr.
	  }
	  return false;
	}

	/// Parse a single Intel-syntax operand and return it, or return nullptr when
	/// one of the sub-parsers called here reports failure.
	///
	/// The steps are tried in this order: the stand-alone offset operator (inline
	/// asm only), an optional size directive, an AVX-512 rounding-mode operand, a
	/// register or segment-override prefix, and finally an immediate or memory
	/// expression.
	std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
	MCAsmParser &Parser = getParser();
	const AsmToken &Tok = Parser.getTok();
	SMLoc Start, End;

	// FIXME: Offset operator
	// Should be handled as part of immediate expression, as other operators
	// Currently, only supported as a stand-alone operand
	if (isParsingInlineAsm())
	if (IdentifyIntelInlineAsmOperator(Tok.getString()) == IOK_OFFSET)
	return ParseIntelOffsetOfOperator();

	// Parse optional Size directive.
	unsigned Size;
	if (ParseIntelMemoryOperandSize(Size))
	return nullptr;
	// A non-zero Size means a "<size> ptr" directive preceded the operand.
	bool PtrInOperand = bool(Size);

	Start = Tok.getLoc();

	// Rounding mode operand.
	if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
	getLexer().is(AsmToken::LCurly))
	return ParseRoundingModeOp(Start, End);

	// Register operand.
	unsigned RegNo = 0;
	if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) {
	if (RegNo == X86::RIP)
	return ErrorOperand(Start, "rip can only be used as a base register");
	// A Register followed by ':' is considered a segment override
	if (Tok.isNot(AsmToken::Colon))
	// A plain register is only accepted when no "<size> ptr" directive was
	// given; otherwise a memory operand was expected.
	return !PtrInOperand ? X86Operand::CreateReg(RegNo, Start, End) :
	ErrorOperand(Start, "expected memory operand after 'ptr', "
	"found register operand instead");
	// An alleged segment override. check if we have a valid segment register
	if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
	return ErrorOperand(Start, "invalid segment register");
	// Eat ':' and update Start location
	Start = Lex().getLoc();
	}

	// Immediates and Memory
	IntelExprStateMachine SM;
	if (ParseIntelExpression(SM, End))
	return nullptr;

	if (isParsingInlineAsm())
	RewriteIntelExpression(SM, Start, Tok.getLoc());

	// Combine the symbolic and the constant part of the expression into one
	// displacement: symbol + constant when both are present, the symbol alone
	// when the constant is zero, and the constant alone when there is no symbol.
	int64_t Imm = SM.getImm();
	const MCExpr *Disp = SM.getSym();
	const MCExpr *ImmDisp = MCConstantExpr::create(Imm, getContext());
	if (Disp && Imm)
	Disp = MCBinaryExpr::createAdd(Disp, ImmDisp, getContext());
	if (!Disp)
	Disp = ImmDisp;

	// RegNo != 0 specifies a valid segment register,
	// and we are parsing a segment override
	if (!SM.isMemExpr() && !RegNo)
	return X86Operand::CreateImm(Disp, Start, End);

	StringRef ErrMsg;
	unsigned BaseReg = SM.getBaseReg();
	unsigned IndexReg = SM.getIndexReg();
	unsigned Scale = SM.getScale();

	// The base/index/scale combination is only validated when at least one of
	// the two registers is present.
	if ((BaseReg \|\| IndexReg) &&
	CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg))
	return ErrorOperand(Start, ErrMsg);
	if (isParsingInlineAsm())
	return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg,
	Scale, Start, End, Size, SM.getSymName(),
	SM.getIdentifierInfo());
	// No base, index or segment register: build the displacement-only form.
	if (!(BaseReg \|\| IndexReg \|\| RegNo))
	return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
	return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
	BaseReg, IndexReg, Scale, Start, End, Size);
	}

	std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
	MCAsmParser &Parser = getParser();
	switch (getLexer().getKind()) {
	default:
	// Parse a memory operand with no segment register.
	return ParseMemOperand(0, Parser.getTok().getLoc());
	case AsmToken::Percent: {
	// Read the register.
	unsigned RegNo;
	SMLoc Start, End;
	if (ParseRegister(RegNo, Start, End)) return nullptr;
	if (RegNo == X86::EIZ \|\| RegNo == X86::RIZ) {
	Error(Start, "%eiz and %riz can only be used as index registers",
	SMRange(Start, End));
	return nullptr;
	}
	if (RegNo == X86::RIP) {
	Error(Start, "%rip can only be used as a base register",
	SMRange(Start, End));
	return nullptr;
	}

	// If this is a segment register followed by a ':', then this is the start
	// of a memory reference, otherwise this is a normal register reference.
	if (getLexer().isNot(AsmToken::Colon))
	return X86Operand::CreateReg(RegNo, Start, End);

	if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
	return ErrorOperand(Start, "invalid segment register");

	getParser().Lex(); // Eat the colon.
	return ParseMemOperand(RegNo, Start);
	}
	case AsmToken::Dollar: {
	// $42 -> immediate.
	SMLoc Start = Parser.getTok().getLoc(), End;
	Parser.Lex();
	const MCExpr *Val;
	if (getParser().parseExpression(Val, End))
	return nullptr;
	return X86Operand::CreateImm(Val, Start, End);
	}
	case AsmToken::LCurly:{
	SMLoc Start = Parser.getTok().getLoc(), End;
	if (getSTI().getFeatureBits()[X86::FeatureAVX512])
	return ParseRoundingModeOp(Start, End);
	return ErrorOperand(Start, "Unexpected '{' in expression");
	}
	}
	}

	/// Try to parse the "z}" remainder of a "{z}" mark; the caller is expected
	/// to have consumed the opening '{' already.
	///
	/// Returns true on failure and false otherwise. When the next token is not
	/// the identifier "z", the parser does not advance, \p Z is left untouched
	/// and false is returned. On success \p Z holds a "{z}" token created at
	/// \p StartLoc.
	bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
	                          const SMLoc &StartLoc) {
	  MCAsmParser &Parser = getParser();

	  // Anything other than the identifier "z" simply means that no {z} mark is
	  // present here; that is not a parse error.
	  const bool SawZ = getLexer().is(AsmToken::Identifier) &&
	                    getLexer().getTok().getIdentifier() == "z";
	  if (!SawZ)
	    return false;
	  Parser.Lex(); // Consume 'z'.

	  // The mark has to be closed by '}'.
	  if (getLexer().isNot(AsmToken::RCurly))
	    return Error(getLexer().getLoc(), "Expected } at this point");
	  Parser.Lex(); // Consume '}'.

	  // Hand the {z} mark operand back to the caller.
	  Z = X86Operand::CreateToken("{z}", StartLoc);
	  return false;
	}

	/// Parse the AVX-512 decorations that may follow an operand: a memory
	/// broadcast "{1toN}", an op-mask register "{%k<NUM>}" and/or a "{z}" mark.
	/// The resulting tokens (and the mask register) are appended to \p Operands.
	/// Nothing is parsed unless the AVX512 feature bit is set and the current
	/// token is '{'. \p Op is not referenced by this implementation.
	///
	/// Returns true on failure, false otherwise.
	bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
	const MCParsedAsmOperand &Op) {
	MCAsmParser &Parser = getParser();
	if(getSTI().getFeatureBits()[X86::FeatureAVX512]) {
	if (getLexer().is(AsmToken::LCurly)) {
	// Eat "{" and mark the current place.
	const SMLoc consumedToken = consumeToken();
	// Distinguish {1to<NUM>} from {%k<NUM>}.
	if(getLexer().is(AsmToken::Integer)) {
	// Parse memory broadcasting ({1to<NUM>}).
	if (getLexer().getTok().getIntVal() != 1)
	return TokError("Expected 1to<NUM> at this point");
	Parser.Lex(); // Eat "1" of 1to8
	if (!getLexer().is(AsmToken::Identifier) \|\|
	!getLexer().getTok().getIdentifier().startswith("to"))
	return TokError("Expected 1to<NUM> at this point");
	// Recognize only reasonable suffixes.
	const char *BroadcastPrimitive =
	StringSwitch<const char*>(getLexer().getTok().getIdentifier())
	.Case("to2", "{1to2}")
	.Case("to4", "{1to4}")
	.Case("to8", "{1to8}")
	.Case("to16", "{1to16}")
	.Default(nullptr);
	if (!BroadcastPrimitive)
	return TokError("Invalid memory broadcast primitive.");
	Parser.Lex(); // Eat "toN" of 1toN
	if (!getLexer().is(AsmToken::RCurly))
	return TokError("Expected } at this point");
	Parser.Lex(); // Eat "}"
	Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
	consumedToken));
	// No AVX512 specific primitives can pass
	// after memory broadcasting, so return.
	return false;
	} else {
	// Parse either {k}{z}, {z}{k}, {k} or {z}.
	// The last one has no meaning, but GCC accepts it.
	// At this point we are just past a '{' mark.
	std::unique_ptr<X86Operand> Z;
	if (ParseZ(Z, consumedToken))
	return true;
	// Reaching here means that parsing of the alleged '{z}' mark yielded
	// no errors.
	// Check whether a {%k<NUM>} mark still has to be parsed: either no {z}
	// was found, or {z} is followed by another '{'.
	if (!Z \|\| getLexer().is(AsmToken::LCurly)) {
	SMLoc StartLoc = Z ? consumeToken() : consumedToken;
	// Parse an op-mask register mark ({%k<NUM>}), which is now to be
	// expected
	unsigned RegNo;
	SMLoc RegLoc;
	// NOTE(review): StartLoc is passed in the third (end-location) position of
	// ParseRegister, so it may be overwritten before it is used for the tokens
	// below -- confirm this is intended.
	if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
	X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
	if (RegNo == X86::K0)
	return Error(RegLoc, "Register k0 can't be used as write mask");
	if (!getLexer().is(AsmToken::RCurly))
	return Error(getLexer().getLoc(), "Expected } at this point");
	Operands.push_back(X86Operand::CreateToken("{", StartLoc));
	Operands.push_back(
	X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
	Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
	} else
	return Error(getLexer().getLoc(),
	"Expected an op-mask register at this point");
	// {%k<NUM>} mark is found, inquire for {z}
	if (getLexer().is(AsmToken::LCurly) && !Z) {
	// Report an error if parsing failed, or if the expected {z} mark was
	// not found.
	if (ParseZ(Z, consumeToken()) \|\| !Z)
	return Error(getLexer().getLoc(),
	"Expected a {z} mark at this point");

	}
	// '{z}' on its own is meaningless, hence should be ignored.
	// When it is accompanied by a K register, on the other hand,
	// allow it.
	if (Z)
	Operands.push_back(std::move(Z));
	}
	}
	}
	}
	return false;
	}

	/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
	/// has already been parsed if present.
	///
	/// \p SegReg is the already-parsed segment register (0 when there is none)
	/// and \p MemStart the start location recorded for the resulting operand.
	/// Returns the memory operand, or nullptr after a parse or validation
	/// failure.
	std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
	SMLoc MemStart) {

	MCAsmParser &Parser = getParser();
	// We have to disambiguate a parenthesized expression "(4+5)" from the start
	// of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
	// only way to do this without lookahead is to eat the '(' and see what is
	// after it.
	// The displacement defaults to the constant 0 when none is written.
	const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext());
	if (getLexer().isNot(AsmToken::LParen)) {
	SMLoc ExprEnd;
	if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;

	// After parsing the base expression we could either have a parenthesized
	// memory address or not. If not, return now. If so, eat the (.
	if (getLexer().isNot(AsmToken::LParen)) {
	// Without a segment register build the displacement-only memory operand;
	// otherwise include the segment register.
	if (SegReg == 0)
	return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd);
	return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
	MemStart, ExprEnd);
	}

	// Eat the '('.
	Parser.Lex();
	} else {
	// Okay, we have a '('. We don't know if this is an expression or not, so
	// we have to eat the ( to see beyond it.
	SMLoc LParenLoc = Parser.getTok().getLoc();
	Parser.Lex(); // Eat the '('.

	if (getLexer().is(AsmToken::Percent) \|\| getLexer().is(AsmToken::Comma)) {
	// Nothing to do here, fall into the code below with the '(' part of the
	// memory operand consumed.
	} else {
	SMLoc ExprEnd;
	// Put the '(' back so that it is parsed as part of the expression.
	getLexer().UnLex(AsmToken(AsmToken::LParen, "("));

	// It must be either a parenthesized expression, or an expression that
	// begins with a parenthesized expression, parse it now. Example: (1+2) or
	// (1+2)+3
	if (getParser().parseExpression(Disp, ExprEnd))
	return nullptr;

	// After parsing the base expression we could either have a parenthesized
	// memory address or not. If not, return now. If so, eat the (.
	if (getLexer().isNot(AsmToken::LParen)) {
	// Without a segment register build the displacement-only memory operand
	// (starting at the '('); otherwise include the segment register.
	if (SegReg == 0)
	return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc,
	ExprEnd);
	return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
	MemStart, ExprEnd);
	}

	// Eat the '('.
	Parser.Lex();
	}
	}

	// If we reached here, then we just ate the ( of the memory operand. Process
	// the rest of the memory operand.
	unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
	SMLoc IndexLoc, BaseLoc;

	if (getLexer().is(AsmToken::Percent)) {
	SMLoc StartLoc, EndLoc;
	BaseLoc = Parser.getTok().getLoc();
	if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
	if (BaseReg == X86::EIZ \|\| BaseReg == X86::RIZ) {
	Error(StartLoc, "eiz and riz can only be used as index registers",
	SMRange(StartLoc, EndLoc));
	return nullptr;
	}
	}

	if (getLexer().is(AsmToken::Comma)) {
	Parser.Lex(); // Eat the comma.
	IndexLoc = Parser.getTok().getLoc();

	// Following the comma we should have either an index register, or a scale
	// value. We don't support the latter form, but we want to parse it
	// correctly.
	//
	// Note that even though it would be completely consistent to support syntax
	// like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
	if (getLexer().is(AsmToken::Percent)) {
	SMLoc L;
	if (ParseRegister(IndexReg, L, L))
	return nullptr;
	if (BaseReg == X86::RIP) {
	Error(IndexLoc, "%rip as base register can not have an index register");
	return nullptr;
	}
	if (IndexReg == X86::RIP) {
	Error(IndexLoc, "%rip is not allowed as an index register");
	return nullptr;
	}

	if (getLexer().isNot(AsmToken::RParen)) {
	// Parse the scale amount:
	// ::= ',' [scale-expression]
	if (getLexer().isNot(AsmToken::Comma)) {
	Error(Parser.getTok().getLoc(),
	"expected comma in scale expression");
	return nullptr;
	}
	Parser.Lex(); // Eat the comma.

	// The scale expression itself is optional: "(%eax,%ebx,)" keeps Scale at 1.
	if (getLexer().isNot(AsmToken::RParen)) {
	SMLoc Loc = Parser.getTok().getLoc();

	int64_t ScaleVal;
	if (getParser().parseAbsoluteExpression(ScaleVal)){
	Error(Loc, "expected scale expression");
	return nullptr;
	}

	// Validate the scale amount.
	if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
	ScaleVal != 1) {
	Error(Loc, "scale factor in 16-bit address must be 1");
	return nullptr;
	}
	if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 &&
	ScaleVal != 8) {
	Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
	return nullptr;
	}
	Scale = (unsigned)ScaleVal;
	}
	}
	} else if (getLexer().isNot(AsmToken::RParen)) {
	// A scale amount without an index register is parsed but ignored; a
	// warning is issued when it is not 1.
	SMLoc Loc = Parser.getTok().getLoc();

	int64_t Value;
	if (getParser().parseAbsoluteExpression(Value))
	return nullptr;

	if (Value != 1)
	Warning(Loc, "scale factor without index register is ignored");
	Scale = 1;
	}
	}

	// Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
	if (getLexer().isNot(AsmToken::RParen)) {
	Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
	return nullptr;
	}
	SMLoc MemEnd = Parser.getTok().getEndLoc();
	Parser.Lex(); // Eat the ')'.

	// Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
	// and then only in non-64-bit modes. Except for DX, which is a special case
	// because an unofficial form of in/out instructions uses it.
	if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
	(is64BitMode() \|\| (BaseReg != X86::BX && BaseReg != X86::BP &&
	BaseReg != X86::SI && BaseReg != X86::DI)) &&
	BaseReg != X86::DX) {
	Error(BaseLoc, "invalid 16-bit base register");
	return nullptr;
	}
	if (BaseReg == 0 &&
	X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
	Error(IndexLoc, "16-bit memory operand may not include only index register");
	return nullptr;
	}

	StringRef ErrMsg;
	if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg)) {
	Error(BaseLoc, ErrMsg);
	return nullptr;
	}

	// Use the full form when any register is involved, the displacement-only
	// form otherwise.
	if (SegReg \|\| BaseReg \|\| IndexReg)
	return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
	IndexReg, Scale, MemStart, MemEnd);
	return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
	}

	/// Parse one instruction whose mnemonic \p Name (located at \p NameLoc) has
	/// already been read, and fill \p Operands with the mnemonic token followed
	/// by the parsed operands.
	///
	/// Besides the plain operand parsing this routine:
	///  - skips the "short" keyword after jmp/jc/jz in Intel inline asm,
	///  - splits condition codes out of set*b, [v]cmp*, vpcmp* and vpcom*
	///    mnemonics into a mnemonic token plus an immediate operand,
	///  - collects lock/rep* prefixes into flags that are appended as a final
	///    prefix operand,
	///  - applies a series of mnemonic/operand canonicalizations (fsub -> fsubp,
	///    segment moves, in/out with (%dx), string instructions, shift by 1,
	///    int $3, xlat).
	///
	/// \p Info is remembered in InstInfo and receives inline-asm rewrites.
	/// Returns true on failure, false on success.
	bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
	SMLoc NameLoc, OperandVector &Operands) {
	MCAsmParser &Parser = getParser();
	InstInfo = &Info;
	StringRef PatchedName = Name;

	if ((Name.equals("jmp") \|\| Name.equals("jc") \|\| Name.equals("jz")) &&
	isParsingIntelSyntax() && isParsingInlineAsm()) {
	StringRef NextTok = Parser.getTok().getString();
	if (NextTok == "short") {
	SMLoc NameEndLoc =
	NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
	// Eat the short keyword
	Parser.Lex();
	// MS ignores the short keyword, it determines the jmp type based
	// on the distance of the label
	InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
	NextTok.size() + 1);
	}
	}

	// FIXME: Hack to recognize setneb as setne.
	if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
	PatchedName != "setb" && PatchedName != "setnb")
	PatchedName = PatchedName.substr(0, Name.size()-1);

	// FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
	if ((PatchedName.startswith("cmp") \|\| PatchedName.startswith("vcmp")) &&
	(PatchedName.endswith("ss") \|\| PatchedName.endswith("sd") \|\|
	PatchedName.endswith("ps") \|\| PatchedName.endswith("pd"))) {
	bool IsVCMP = PatchedName[0] == 'v';
	// Index of the first character after the "cmp"/"vcmp" stem.
	unsigned CCIdx = IsVCMP ? 4 : 3;
	unsigned ComparisonCode = StringSwitch<unsigned>(
	PatchedName.slice(CCIdx, PatchedName.size() - 2))
	.Case("eq", 0x00)
	.Case("eq_oq", 0x00)
	.Case("lt", 0x01)
	.Case("lt_os", 0x01)
	.Case("le", 0x02)
	.Case("le_os", 0x02)
	.Case("unord", 0x03)
	.Case("unord_q", 0x03)
	.Case("neq", 0x04)
	.Case("neq_uq", 0x04)
	.Case("nlt", 0x05)
	.Case("nlt_us", 0x05)
	.Case("nle", 0x06)
	.Case("nle_us", 0x06)
	.Case("ord", 0x07)
	.Case("ord_q", 0x07)
	/* AVX only from here */
	.Case("eq_uq", 0x08)
	.Case("nge", 0x09)
	.Case("nge_us", 0x09)
	.Case("ngt", 0x0A)
	.Case("ngt_us", 0x0A)
	.Case("false", 0x0B)
	.Case("false_oq", 0x0B)
	.Case("neq_oq", 0x0C)
	.Case("ge", 0x0D)
	.Case("ge_os", 0x0D)
	.Case("gt", 0x0E)
	.Case("gt_os", 0x0E)
	.Case("true", 0x0F)
	.Case("true_uq", 0x0F)
	.Case("eq_os", 0x10)
	.Case("lt_oq", 0x11)
	.Case("le_oq", 0x12)
	.Case("unord_s", 0x13)
	.Case("neq_us", 0x14)
	.Case("nlt_uq", 0x15)
	.Case("nle_uq", 0x16)
	.Case("ord_s", 0x17)
	.Case("eq_us", 0x18)
	.Case("nge_uq", 0x19)
	.Case("ngt_uq", 0x1A)
	.Case("false_os", 0x1B)
	.Case("neq_os", 0x1C)
	.Case("ge_oq", 0x1D)
	.Case("gt_oq", 0x1E)
	.Case("true_us", 0x1F)
	.Default(~0U);
	// Codes 8 and above are only accepted for the "vcmp" spelling.
	if (ComparisonCode != ~0U && (IsVCMP \|\| ComparisonCode < 8)) {

	Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
	NameLoc));

	const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
	getParser().getContext());
	Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));

	// Only the two-character type suffix (ss/sd/ps/pd) remains as the name.
	PatchedName = PatchedName.substr(PatchedName.size() - 2);
	}
	}

	// FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
	if (PatchedName.startswith("vpcmp") &&
	(PatchedName.endswith("b") \|\| PatchedName.endswith("w") \|\|
	PatchedName.endswith("d") \|\| PatchedName.endswith("q"))) {
	// Length of the type suffix: 2 when it starts with 'u', otherwise 1.
	unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
	unsigned ComparisonCode = StringSwitch<unsigned>(
	PatchedName.slice(5, PatchedName.size() - CCIdx))
	.Case("eq", 0x0) // Only allowed on unsigned. Checked below.
	.Case("lt", 0x1)
	.Case("le", 0x2)
	//.Case("false", 0x3) // Not a documented alias.
	.Case("neq", 0x4)
	.Case("nlt", 0x5)
	.Case("nle", 0x6)
	//.Case("true", 0x7) // Not a documented alias.
	.Default(~0U);
	if (ComparisonCode != ~0U && (ComparisonCode != 0 \|\| CCIdx == 2)) {
	Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));

	const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
	getParser().getContext());
	Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));

	PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
	}
	}

	// FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
	if (PatchedName.startswith("vpcom") &&
	(PatchedName.endswith("b") \|\| PatchedName.endswith("w") \|\|
	PatchedName.endswith("d") \|\| PatchedName.endswith("q"))) {
	// Length of the type suffix: 2 when it starts with 'u', otherwise 1.
	unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
	unsigned ComparisonCode = StringSwitch<unsigned>(
	PatchedName.slice(5, PatchedName.size() - CCIdx))
	.Case("lt", 0x0)
	.Case("le", 0x1)
	.Case("gt", 0x2)
	.Case("ge", 0x3)
	.Case("eq", 0x4)
	.Case("neq", 0x5)
	.Case("false", 0x6)
	.Case("true", 0x7)
	.Default(~0U);
	if (ComparisonCode != ~0U) {
	Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));

	const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
	getParser().getContext());
	Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));

	PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
	}
	}


	// Determine whether this is an instruction prefix.
	// FIXME:
	// Enhance prefixes integrity robustness. for example, following forms
	// are currently tolerated:
	// repz repnz <insn> ; GAS errors for the use of two similar prefixes
	// lock addq %rax, %rbx ; Destination operand must be of memory type
	// xacquire <insn> ; xacquire must be accompanied by 'lock'
	bool isPrefix = StringSwitch<bool>(Name)
	.Cases("rex64", "data32", "data16", true)
	.Cases("xacquire", "xrelease", true)
	.Cases("acquire", "release", isParsingIntelSyntax())
	.Default(false);

	auto isLockRepeatPrefix = [](StringRef N) {
	return StringSwitch<bool>(N)
	.Cases("lock", "rep", "repe", "repz", "repne", "repnz", true)
	.Default(false);
	};

	bool CurlyAsEndOfStatement = false;

	// Accumulate lock/rep* prefixes into Flags; after the loop Name holds the
	// mnemonic that follows them.
	// NOTE(review): the loop condition lower-cases Name, but the switch below
	// matches Name as written, so a prefix that is not all lower case would map
	// to IP_NO_PREFIX -- confirm whether Name is always lower case here.
	unsigned Flags = X86::IP_NO_PREFIX;
	while (isLockRepeatPrefix(Name.lower())) {
	unsigned Prefix =
	StringSwitch<unsigned>(Name)
	.Cases("lock", "lock", X86::IP_HAS_LOCK)
	.Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT)
	.Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
	.Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
	Flags \|= Prefix;
	+ if (getLexer().is(AsmToken::EndOfStatement)) {
	+ // We don't have real instr with the given prefix
	+ // let's use the prefix as the instr.
	+ // TODO: there could be several prefixes one after another
	+ Flags = X86::IP_NO_PREFIX;
	+ break;
	+ }
	Name = Parser.getTok().getString();
	Parser.Lex(); // eat the prefix
	// Hack: we could have something like "rep # some comment" or
	// "lock; cmpxchg16b $1" or "lock\0A\09incl" or "lock/incl"
	while (Name.startswith(";") \|\| Name.startswith("\n") \|\|
	Name.startswith("#") \|\| Name.startswith("\t") \|\|
	Name.startswith("/")) {
	Name = Parser.getTok().getString();
	Parser.Lex(); // go to next prefix or instr
	}
	}

	// When prefixes were consumed, the mnemonic token is the name that followed
	// them rather than the (possibly patched) original name.
	if (Flags)
	PatchedName = Name;
	Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));

	// This does the actual operand parsing. Don't parse any more if we have a
	// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
	// just want to parse the "lock" as the first instruction and the "incl" as
	// the next one.
	if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
	// Parse '*' modifier.
	if (getLexer().is(AsmToken::Star))
	Operands.push_back(X86Operand::CreateToken("*", consumeToken()));

	// Read the operands.
	while(1) {
	if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
	Operands.push_back(std::move(Op));
	if (HandleAVX512Operand(Operands, *Operands.back()))
	return true;
	} else {
	return true;
	}
	// check for comma and eat it
	if (getLexer().is(AsmToken::Comma))
	Parser.Lex();
	else
	break;
	}

	// In MS inline asm curly braces mark the beginning/end of a block,
	// therefore they should be interpreted as end of statement
	CurlyAsEndOfStatement =
	isParsingIntelSyntax() && isParsingInlineAsm() &&
	(getLexer().is(AsmToken::LCurly) \|\| getLexer().is(AsmToken::RCurly));
	if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
	return TokError("unexpected token in argument list");
	}

	// Consume the EndOfStatement or the prefix separator Slash
	if (getLexer().is(AsmToken::EndOfStatement) \|\|
	(isPrefix && getLexer().is(AsmToken::Slash)))
	Parser.Lex();
	else if (CurlyAsEndOfStatement)
	// Add an actual EndOfStatement before the curly brace
	Info.AsmRewrites->emplace_back(AOK_EndOfStatement,
	getLexer().getTok().getLoc(), 0);

	// This is for gas compatibility and cannot be done in td.
	// Adding "p" for some floating point with no argument.
	// For example: fsub --> fsubp
	bool IsFp =
	Name == "fsub" \|\| Name == "fdiv" \|\| Name == "fsubr" \|\| Name == "fdivr";
	if (IsFp && Operands.size() == 1) {
	// NOTE(review): Repl is declared as a plain `const char` although every
	// case value is a string literal -- it looks like the pointer asterisks
	// (`const char *`) were lost from this line; confirm against upstream.
	const char Repl = StringSwitch<const char >(Name)
	.Case("fsub", "fsubp")
	.Case("fdiv", "fdivp")
	.Case("fsubr", "fsubrp")
	.Case("fdivr", "fdivrp");
	static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
	}

	// Moving a 32 or 16 bit value into a segment register has the same
	// behavior. Modify such instructions to always take shorter form.
	if ((Name == "mov" \|\| Name == "movw" \|\| Name == "movl") &&
	(Operands.size() == 3)) {
	X86Operand &Op1 = (X86Operand &)*Operands[1];
	X86Operand &Op2 = (X86Operand &)*Operands[2];
	SMLoc Loc = Op1.getEndLoc();
	if (Op1.isReg() && Op2.isReg() &&
	X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
	Op2.getReg()) &&
	(X86MCRegisterClasses[X86::GR16RegClassID].contains(Op1.getReg()) \|\|
	X86MCRegisterClasses[X86::GR32RegClassID].contains(Op1.getReg()))) {
	// Change instruction name to match new instruction.
	if (Name != "mov" && Name[3] == (is16BitMode() ? 'l' : 'w')) {
	Name = is16BitMode() ? "movw" : "movl";
	Operands[0] = X86Operand::CreateToken(Name, NameLoc);
	}
	// Select the correct equivalent 16-/32-bit source register.
	unsigned Reg =
	getX86SubSuperRegisterOrZero(Op1.getReg(), is16BitMode() ? 16 : 32);
	Operands[1] = X86Operand::CreateReg(Reg, Loc, Loc);
	}
	}

	// This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
	// "outb %al, %dx". Out doesn't take a memory form, but this is a widely
	// documented form in various unofficial manuals, so a lot of code uses it.
	if ((Name == "outb" \|\| Name == "outsb" \|\| Name == "outw" \|\| Name == "outsw" \|\|
	Name == "outl" \|\| Name == "outsl" \|\| Name == "out" \|\| Name == "outs") &&
	Operands.size() == 3) {
	X86Operand &Op = (X86Operand &)*Operands.back();
	if (Op.isMem() && Op.Mem.SegReg == 0 &&
	isa<MCConstantExpr>(Op.Mem.Disp) &&
	cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
	Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
	SMLoc Loc = Op.getEndLoc();
	Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
	}
	}
	// Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
	if ((Name == "inb" \|\| Name == "insb" \|\| Name == "inw" \|\| Name == "insw" \|\|
	Name == "inl" \|\| Name == "insl" \|\| Name == "in" \|\| Name == "ins") &&
	Operands.size() == 3) {
	X86Operand &Op = (X86Operand &)*Operands[1];
	if (Op.isMem() && Op.Mem.SegReg == 0 &&
	isa<MCConstantExpr>(Op.Mem.Disp) &&
	cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
	Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
	SMLoc Loc = Op.getEndLoc();
	Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
	}
	}

	// Default operands for the string instructions below are collected in
	// TmpOperands and reconciled with the parsed ones by
	// VerifyAndAdjustOperands; its result is checked once after all of them.
	SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
	bool HadVerifyError = false;

	// Append default arguments to "ins[bwld]"
	if (Name.startswith("ins") &&
	(Operands.size() == 1 \|\| Operands.size() == 3) &&
	(Name == "insb" \|\| Name == "insw" \|\| Name == "insl" \|\| Name == "insd" \|\|
	Name == "ins")) {

	AddDefaultSrcDestOperands(TmpOperands,
	X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
	DefaultMemDIOperand(NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Append default arguments to "outs[bwld]"
	if (Name.startswith("outs") &&
	(Operands.size() == 1 \|\| Operands.size() == 3) &&
	(Name == "outsb" \|\| Name == "outsw" \|\| Name == "outsl" \|\|
	Name == "outsd" \|\| Name == "outs")) {
	AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
	X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
	// values of $SIREG according to the mode. It would be nice if this
	// could be achieved with InstAlias in the tables.
	if (Name.startswith("lods") &&
	(Operands.size() == 1 \|\| Operands.size() == 2) &&
	(Name == "lods" \|\| Name == "lodsb" \|\| Name == "lodsw" \|\|
	Name == "lodsl" \|\| Name == "lodsd" \|\| Name == "lodsq")) {
	TmpOperands.push_back(DefaultMemSIOperand(NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
	// values of $DIREG according to the mode. It would be nice if this
	// could be achieved with InstAlias in the tables.
	if (Name.startswith("stos") &&
	(Operands.size() == 1 \|\| Operands.size() == 2) &&
	(Name == "stos" \|\| Name == "stosb" \|\| Name == "stosw" \|\|
	Name == "stosl" \|\| Name == "stosd" \|\| Name == "stosq")) {
	TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
	// values of $DIREG according to the mode. It would be nice if this
	// could be achieved with InstAlias in the tables.
	if (Name.startswith("scas") &&
	(Operands.size() == 1 \|\| Operands.size() == 2) &&
	(Name == "scas" \|\| Name == "scasb" \|\| Name == "scasw" \|\|
	Name == "scasl" \|\| Name == "scasd" \|\| Name == "scasq")) {
	TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Add default SI and DI operands to "cmps[bwlq]".
	if (Name.startswith("cmps") &&
	(Operands.size() == 1 \|\| Operands.size() == 3) &&
	(Name == "cmps" \|\| Name == "cmpsb" \|\| Name == "cmpsw" \|\|
	Name == "cmpsl" \|\| Name == "cmpsd" \|\| Name == "cmpsq")) {
	AddDefaultSrcDestOperands(TmpOperands, DefaultMemDIOperand(NameLoc),
	DefaultMemSIOperand(NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Add default SI and DI operands to "movs[bwlq]".
	if (((Name.startswith("movs") &&
	(Name == "movs" \|\| Name == "movsb" \|\| Name == "movsw" \|\|
	Name == "movsl" \|\| Name == "movsd" \|\| Name == "movsq")) \|\|
	(Name.startswith("smov") &&
	(Name == "smov" \|\| Name == "smovb" \|\| Name == "smovw" \|\|
	Name == "smovl" \|\| Name == "smovd" \|\| Name == "smovq"))) &&
	(Operands.size() == 1 \|\| Operands.size() == 3)) {
	// An operand-less "movsd" outside Intel syntax is renamed to "movsl".
	if (Name == "movsd" && Operands.size() == 1 && !isParsingIntelSyntax())
	Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
	AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
	DefaultMemDIOperand(NameLoc));
	HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
	}

	// Check if we encountered an error for one of the string instructions
	if (HadVerifyError) {
	return HadVerifyError;
	}

	// FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to
	// "shift <op>".
	if ((Name.startswith("shr") \|\| Name.startswith("sar") \|\|
	Name.startswith("shl") \|\| Name.startswith("sal") \|\|
	Name.startswith("rcl") \|\| Name.startswith("rcr") \|\|
	Name.startswith("rol") \|\| Name.startswith("ror")) &&
	Operands.size() == 3) {
	if (isParsingIntelSyntax()) {
	// Intel syntax: the shift count is the last operand.
	X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
	if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
	cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
	Operands.pop_back();
	} else {
	// Otherwise the shift count is the first operand after the mnemonic.
	X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
	if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
	cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
	Operands.erase(Operands.begin() + 1);
	}
	}

	// Transforms "int $3" into "int3" as a size optimization. We can't write an
	// instalias with an immediate operand yet.
	if (Name == "int" && Operands.size() == 2) {
	X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
	if (Op1.isImm())
	if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm()))
	if (CE->getValue() == 3) {
	Operands.erase(Operands.begin() + 1);
	static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
	}
	}

	// Transforms "xlat mem8" into "xlatb"
	if ((Name == "xlat" \|\| Name == "xlatb") && Operands.size() == 2) {
	X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
	if (Op1.isMem8()) {
	Warning(Op1.getStartLoc(), "memory operand is only for determining the "
	"size, (R\|E)BX will be used for the location");
	Operands.pop_back();
	static_cast<X86Operand &>(*Operands[0]).setTokenValue("xlatb");
	}
	}

	// Record the collected lock/rep* prefixes as a trailing prefix operand.
	if (Flags)
	Operands.push_back(X86Operand::CreatePrefix(Flags, NameLoc, NameLoc));
	return false;
	}

	/// This implementation leaves \p Inst and \p Ops untouched and always
	/// returns false.
	bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
	return false;
	}

	/// Diagnose gather instructions whose register operands overlap.
	///
	/// For the first group of gather opcodes the encodings of the destination
	/// (operand 0), the mask (operand 1) and the index register (operand
	/// 3 + X86::AddrIndexReg) have to be pairwise different. For the
	/// Z/Z128/Z256 gather opcodes the destination (operand 0) has to differ
	/// from the index register (operand 4 + X86::AddrIndexReg).
	///
	/// Returns the result of Warning() when an overlap is found and false for
	/// every other instruction.
	bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
	  const MCRegisterInfo *MRI = getContext().getRegisterInfo();

	  // Encoding value of the register stored in operand OpIdx of Inst.
	  auto RegEncoding = [&](unsigned OpIdx) -> unsigned {
	    return MRI->getEncodingValue(Inst.getOperand(OpIdx).getReg());
	  };

	  switch (Inst.getOpcode()) {
	  case X86::VGATHERDPDYrm:
	  case X86::VGATHERDPDrm:
	  case X86::VGATHERDPSYrm:
	  case X86::VGATHERDPSrm:
	  case X86::VGATHERQPDYrm:
	  case X86::VGATHERQPDrm:
	  case X86::VGATHERQPSYrm:
	  case X86::VGATHERQPSrm:
	  case X86::VPGATHERDDYrm:
	  case X86::VPGATHERDDrm:
	  case X86::VPGATHERDQYrm:
	  case X86::VPGATHERDQrm:
	  case X86::VPGATHERQDYrm:
	  case X86::VPGATHERQDrm:
	  case X86::VPGATHERQQYrm:
	  case X86::VPGATHERQQrm: {
	    const unsigned DestEnc = RegEncoding(0);
	    const unsigned MaskEnc = RegEncoding(1);
	    const unsigned IndexEnc = RegEncoding(3 + X86::AddrIndexReg);
	    const bool AllDistinct =
	        DestEnc != MaskEnc && DestEnc != IndexEnc && MaskEnc != IndexEnc;
	    if (AllDistinct)
	      break;
	    return Warning(Ops[0]->getStartLoc(), "mask, index, and destination "
	                                          "registers should be distinct");
	  }
	  case X86::VGATHERDPDZ128rm:
	  case X86::VGATHERDPDZ256rm:
	  case X86::VGATHERDPDZrm:
	  case X86::VGATHERDPSZ128rm:
	  case X86::VGATHERDPSZ256rm:
	  case X86::VGATHERDPSZrm:
	  case X86::VGATHERQPDZ128rm:
	  case X86::VGATHERQPDZ256rm:
	  case X86::VGATHERQPDZrm:
	  case X86::VGATHERQPSZ128rm:
	  case X86::VGATHERQPSZ256rm:
	  case X86::VGATHERQPSZrm:
	  case X86::VPGATHERDDZ128rm:
	  case X86::VPGATHERDDZ256rm:
	  case X86::VPGATHERDDZrm:
	  case X86::VPGATHERDQZ128rm:
	  case X86::VPGATHERDQZ256rm:
	  case X86::VPGATHERDQZrm:
	  case X86::VPGATHERQDZ128rm:
	  case X86::VPGATHERQDZ256rm:
	  case X86::VPGATHERQDZrm:
	  case X86::VPGATHERQQZ128rm:
	  case X86::VPGATHERQQZ256rm:
	  case X86::VPGATHERQQZrm: {
	    const unsigned DestEnc = RegEncoding(0);
	    const unsigned IndexEnc = RegEncoding(4 + X86::AddrIndexReg);
	    if (DestEnc != IndexEnc)
	      break;
	    return Warning(Ops[0]->getStartLoc(), "index and destination registers "
	                                          "should be distinct");
	  }
	  }

	  return false;
	}

	static const char *getSubtargetFeatureName(uint64_t Val);

	/// Hand \p Inst to the instrumentation object, which emits it to \p Out.
	/// The current context, MII and the parser's sched-info setting are passed
	/// along with the instruction and its operands.
	void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
	                                   MCStreamer &Out) {
	  const bool PrintSchedInfo = getParser().shouldPrintSchedInfo();
	  auto &Ctx = getContext();
	  Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, Ctx, MII, Out,
	                                                PrintSchedInfo);
	}

	/// Entry point for matching: forwards all arguments to the Intel or the AT&T
	/// matcher depending on isParsingIntelSyntax().
	bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
	                                           OperandVector &Operands,
	                                           MCStreamer &Out, uint64_t &ErrorInfo,
	                                           bool MatchingInlineAsm) {
	  return isParsingIntelSyntax()
	             ? MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out,
	                                            ErrorInfo, MatchingInlineAsm)
	             : MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out,
	                                          ErrorInfo, MatchingInlineAsm);
	}

	/// Expand the FPU mnemonics listed below into a WAIT instruction followed by
	/// their "fn" counterpart.
	///
	/// When the token of \p Op is one of the listed mnemonics, a WAIT instruction
	/// is emitted (unless \p MatchingInlineAsm is set) and Operands[0] is
	/// replaced by a new token holding the replacement mnemonic. Any other
	/// mnemonic leaves \p Operands untouched.
	///
	/// NOTE(review): replacing Operands[0] presumably destroys the operand that
	/// \p Op refers to, so callers should not keep using a reference to the old
	/// mnemonic operand after this call.
	void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
	                                     OperandVector &Operands, MCStreamer &Out,
	                                     bool MatchingInlineAsm) {
	  // FIXME: This should be replaced with a real .td file alias mechanism.
	  // Also, MatchInstructionImpl should actually do the EmitInstruction
	  // call.
	  const char *Repl = StringSwitch<const char *>(Op.getToken())
	                         .Case("finit", "fninit")
	                         .Case("fsave", "fnsave")
	                         .Case("fstcw", "fnstcw")
	                         .Case("fstcww", "fnstcw")
	                         .Case("fstenv", "fnstenv")
	                         .Case("fstsw", "fnstsw")
	                         .Case("fstsww", "fnstsw")
	                         .Case("fclex", "fnclex")
	                         .Default(nullptr);
	  // Not one of the aliased mnemonics: nothing to expand.
	  if (!Repl)
	    return;

	  MCInst Inst;
	  Inst.setOpcode(X86::WAIT);
	  Inst.setLoc(IDLoc);
	  if (!MatchingInlineAsm)
	    EmitInstruction(Inst, Operands, Out);
	  // Continue matching with the non-waiting form of the mnemonic.
	  Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
	}

	/// Report a "missing feature" match failure.
	///
	/// Builds the message "instruction requires:" followed by the name of every
	/// feature whose bit is set in \p ErrorInfo, then reports it at \p IDLoc.
	///
	/// \param IDLoc             location used for the diagnostic.
	/// \param ErrorInfo         bitmask of missing features; must be non-zero.
	/// \param MatchingInlineAsm forwarded unchanged to Error().
	/// \returns the result of Error().
	bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
	                                       bool MatchingInlineAsm) {
	  assert(ErrorInfo && "Unknown missing feature!");
	  SmallString<126> Msg;
	  raw_svector_ostream OS(Msg);
	  OS << "instruction requires:";
	  // Visit every bit of the mask, including the most significant one, so that
	  // no set feature bit is silently dropped from the message.
	  for (unsigned Bit = 0; Bit < sizeof(ErrorInfo) * 8; ++Bit) {
	    const uint64_t Mask = uint64_t(1) << Bit;
	    if (ErrorInfo & Mask)
	      OS << ' ' << getSubtargetFeatureName(Mask);
	  }
	  return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
	}

	/// If the last operand is a prefix operand, remove it from \p Operands and
	/// return its prefix value; otherwise leave \p Operands alone and return 0.
	/// \p Operands must not be empty.
	static unsigned getPrefixes(OperandVector &Operands) {
	  X86Operand &Last = static_cast<X86Operand &>(*Operands.back());
	  if (!Last.isPrefix())
	    return 0;

	  // Read the value before popping: the pop releases the operand.
	  const unsigned Flags = Last.getPrefix();
	  Operands.pop_back();
	  return Flags;
	}

	/// Match an AT&T-syntax instruction and, unless \p MatchingInlineAsm is set,
	/// emit it.
	///
	/// A direct match on the mnemonic is tried first. If that fails, the
	/// mnemonic is retried with each candidate size suffix ("slt" when it starts
	/// with 'f', "bwlq" otherwise); exactly one successful suffix is accepted,
	/// several are reported as ambiguous, and none produces the most specific
	/// diagnostic available.
	///
	/// \param IDLoc             location of the instruction, used for diagnostics.
	/// \param Opcode            set to the matched opcode on success.
	/// \param Operands          parsed operands; Operands[0] is the mnemonic token.
	/// \param Out               streamer that receives the instruction.
	/// \param ErrorInfo         matcher error details (operand index or feature
	///                          mask), updated on some failure paths.
	/// \param MatchingInlineAsm suppresses validation, post-processing and
	///                          emission, and is forwarded to Error().
	/// \returns false on success, true after a diagnostic has been reported.
	bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
	                                              OperandVector &Operands,
	                                              MCStreamer &Out,
	                                              uint64_t &ErrorInfo,
	                                              bool MatchingInlineAsm) {
	  assert(!Operands.empty() && "Unexpect empty operand list!");
	  assert(static_cast<X86Operand &>(*Operands[0]).isToken() &&
	         "Leading operand should always be a mnemonic!");
	  SMRange EmptyRange = None;

	  // First, handle aliases that expand to multiple instructions.
	  MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
	                    Out, MatchingInlineAsm);
	  // Bind the mnemonic operand only now: the alias expansion above may have
	  // replaced Operands[0], and a reference taken earlier would then refer to
	  // the replaced operand.
	  X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);

	  bool WasOriginallyInvalidOperand = false;
	  unsigned Prefixes = getPrefixes(Operands);

	  MCInst Inst;

	  if (Prefixes)
	    Inst.setFlags(Prefixes);

	  // First, try a direct match.
	  switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
	                           isParsingIntelSyntax())) {
	  default: llvm_unreachable("Unexpected match result!");
	  case Match_Success:
	    if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
	      return true;
	    // Some instructions need post-processing to, for example, tweak which
	    // encoding is selected. Loop on it while changes happen so the
	    // individual transformations can chain off each other.
	    if (!MatchingInlineAsm)
	      while (processInstruction(Inst, Operands))
	        ;

	    Inst.setLoc(IDLoc);
	    if (!MatchingInlineAsm)
	      EmitInstruction(Inst, Operands, Out);
	    Opcode = Inst.getOpcode();
	    return false;
	  case Match_MissingFeature:
	    return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
	  case Match_InvalidOperand:
	    WasOriginallyInvalidOperand = true;
	    break;
	  case Match_MnemonicFail:
	    break;
	  }

	  // FIXME: Ideally, we would only attempt suffix matches for things which are
	  // valid prefixes, and we could just infer the right unambiguous
	  // type. However, that requires substantially more matcher support than the
	  // following hack.

	  // Change the operand to point to a temporary token.
	  StringRef Base = Op.getToken();
	  SmallString<16> Tmp;
	  Tmp += Base;
	  Tmp += ' ';
	  Op.setTokenValue(Tmp);

	  // If this instruction starts with an 'f', then it is a floating point stack
	  // instruction. These come in up to three forms for 32-bit, 64-bit, and
	  // 80-bit floating point, which use the suffixes s,l,t respectively.
	  //
	  // Otherwise, we assume that this may be an integer instruction, which comes
	  // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
	  const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";

	  // Check for the various suffix matches.
	  uint64_t ErrorInfoIgnore;
	  uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
	  unsigned Match[4];

	  for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
	    // Overwrite the placeholder character appended above with this suffix.
	    Tmp.back() = Suffixes[I];
	    Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
	                                MatchingInlineAsm, isParsingIntelSyntax());
	    // If this returned as a missing feature failure, remember that.
	    if (Match[I] == Match_MissingFeature)
	      ErrorInfoMissingFeature = ErrorInfoIgnore;
	  }

	  // Restore the old token.
	  Op.setTokenValue(Base);

	  // If exactly one matched, then we treat that as a successful match (and the
	  // instruction will already have been filled in correctly, since the failing
	  // matches won't have modified it).
	  unsigned NumSuccessfulMatches =
	      std::count(std::begin(Match), std::end(Match), Match_Success);
	  if (NumSuccessfulMatches == 1) {
	    Inst.setLoc(IDLoc);
	    if (!MatchingInlineAsm)
	      EmitInstruction(Inst, Operands, Out);
	    Opcode = Inst.getOpcode();
	    return false;
	  }

	  // Otherwise, the match failed, try to produce a decent error message.

	  // If we had multiple suffix matches, then identify this as an ambiguous
	  // match.
	  if (NumSuccessfulMatches > 1) {
	    char MatchChars[4];
	    unsigned NumMatches = 0;
	    for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I)
	      if (Match[I] == Match_Success)
	        MatchChars[NumMatches++] = Suffixes[I];

	    SmallString<126> Msg;
	    raw_svector_ostream OS(Msg);
	    OS << "ambiguous instructions require an explicit suffix (could be ";
	    for (unsigned i = 0; i != NumMatches; ++i) {
	      if (i != 0)
	        OS << ", ";
	      if (i + 1 == NumMatches)
	        OS << "or ";
	      OS << "'" << Base << MatchChars[i] << "'";
	    }
	    OS << ")";
	    Error(IDLoc, OS.str(), EmptyRange, MatchingInlineAsm);
	    return true;
	  }

	  // Okay, we know that none of the variants matched successfully.

	  // If all of the instructions reported an invalid mnemonic, then the original
	  // mnemonic was invalid.
	  if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
	    if (!WasOriginallyInvalidOperand) {
	      return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
	                   Op.getLocRange(), MatchingInlineAsm);
	    }

	    // Recover location info for the operand if we know which was the problem.
	    if (ErrorInfo != ~0ULL) {
	      if (ErrorInfo >= Operands.size())
	        return Error(IDLoc, "too few operands for instruction", EmptyRange,
	                     MatchingInlineAsm);

	      X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
	      if (Operand.getStartLoc().isValid()) {
	        SMRange OperandRange = Operand.getLocRange();
	        return Error(Operand.getStartLoc(), "invalid operand for instruction",
	                     OperandRange, MatchingInlineAsm);
	      }
	    }

	    return Error(IDLoc, "invalid operand for instruction", EmptyRange,
	                 MatchingInlineAsm);
	  }

	  // If one instruction matched with a missing feature, report this as a
	  // missing feature.
	  if (std::count(std::begin(Match), std::end(Match),
	                 Match_MissingFeature) == 1) {
	    ErrorInfo = ErrorInfoMissingFeature;
	    return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
	                               MatchingInlineAsm);
	  }

	  // If one instruction matched with an invalid operand, report this as an
	  // operand failure.
	  if (std::count(std::begin(Match), std::end(Match),
	                 Match_InvalidOperand) == 1) {
	    return Error(IDLoc, "invalid operand for instruction", EmptyRange,
	                 MatchingInlineAsm);
	  }

	  // If all of these were an outright failure, report it in a useless way.
	  Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
	        EmptyRange, MatchingInlineAsm);
	  return true;
	}

	/// Match an Intel-syntax instruction and, unless \p MatchingInlineAsm is set,
	/// emit it.
	///
	/// Intel syntax carries no size suffix on the mnemonic, so when an unsized
	/// memory operand is present the instruction is matched once per candidate
	/// memory size and the distinct results are collected. Exactly one success
	/// is accepted; several are reported as an ambiguous operand size unless the
	/// frontend supplied a size that resolves the ambiguity.
	///
	/// \param IDLoc             location of the instruction, used for diagnostics.
	/// \param Opcode            set to the matched opcode on success.
	/// \param Operands          parsed operands; Operands[0] is the mnemonic token.
	/// \param Out               streamer that receives the instruction.
	/// \param ErrorInfo         matcher error details, updated on some paths.
	/// \param MatchingInlineAsm suppresses validation, post-processing and
	///                          emission, and is forwarded to Error().
	/// \returns false on success, true after a diagnostic has been reported.
	bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
	                                                OperandVector &Operands,
	                                                MCStreamer &Out,
	                                                uint64_t &ErrorInfo,
	                                                bool MatchingInlineAsm) {
	  assert(!Operands.empty() && "Unexpect empty operand list!");
	  assert(static_cast<X86Operand &>(*Operands[0]).isToken() &&
	         "Leading operand should always be a mnemonic!");
	  // Both names are captured before alias expansion, as before.
	  StringRef Mnemonic = static_cast<X86Operand &>(*Operands[0]).getToken();
	  SMRange EmptyRange = None;
	  StringRef Base = static_cast<X86Operand &>(*Operands[0]).getToken();
	  unsigned Prefixes = getPrefixes(Operands);

	  // First, handle aliases that expand to multiple instructions.
	  MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
	                    Out, MatchingInlineAsm);
	  // Bind the mnemonic operand only now: the alias expansion above may have
	  // replaced Operands[0], and a reference taken earlier would then refer to
	  // the replaced operand.
	  X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);

	  MCInst Inst;

	  if (Prefixes)
	    Inst.setFlags(Prefixes);

	  // Find one unsized memory operand, if present.
	  X86Operand *UnsizedMemOp = nullptr;
	  for (const auto &ParsedOp : Operands) {
	    X86Operand *X86Op = static_cast<X86Operand *>(ParsedOp.get());
	    if (X86Op->isMemUnsized()) {
	      UnsizedMemOp = X86Op;
	      // Have we found an unqualified memory operand,
	      // break. IA allows only one memory operand.
	      break;
	    }
	  }

	  // Allow some instructions to have implicitly pointer-sized operands. This is
	  // compatible with gas.
	  if (UnsizedMemOp) {
	    static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
	    for (const char *Instr : PtrSizedInstrs) {
	      if (Mnemonic == Instr) {
	        UnsizedMemOp->Mem.Size = getPointerWidth();
	        break;
	      }
	    }
	  }

	  SmallVector<unsigned, 8> Match;
	  uint64_t ErrorInfoMissingFeature = 0;

	  // If unsized push has immediate operand we should default the default pointer
	  // size for the size.
	  if (Mnemonic == "push" && Operands.size() == 2) {
	    auto *X86Op = static_cast<X86Operand *>(Operands[1].get());
	    if (X86Op->isImm()) {
	      // If it's not a constant fall through and let remainder take care of it.
	      const auto *CE = dyn_cast<MCConstantExpr>(X86Op->getImm());
	      unsigned Size = getPointerWidth();
	      if (CE &&
	          (isIntN(Size, CE->getValue()) || isUIntN(Size, CE->getValue()))) {
	        SmallString<16> Tmp;
	        Tmp += Base;
	        Tmp += (is64BitMode())
	                   ? "q"
	                   : (is32BitMode()) ? "l" : (is16BitMode()) ? "w" : " ";
	        Op.setTokenValue(Tmp);
	        // Do match in ATT mode to allow explicit suffix usage.
	        Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
	                                         MatchingInlineAsm,
	                                         false /*isParsingIntelSyntax()*/));
	        Op.setTokenValue(Base);
	      }
	    }
	  }

	  // If an unsized memory operand is present, try to match with each memory
	  // operand size. In Intel assembly, the size is not part of the instruction
	  // mnemonic.
	  if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
	    static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
	    for (unsigned Size : MopSizes) {
	      UnsizedMemOp->Mem.Size = Size;
	      uint64_t ErrorInfoIgnore;
	      unsigned LastOpcode = Inst.getOpcode();
	      unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
	                                    MatchingInlineAsm, isParsingIntelSyntax());
	      // Record a result only when it is the first one or when the attempt
	      // changed the opcode held in Inst.
	      if (Match.empty() || LastOpcode != Inst.getOpcode())
	        Match.push_back(M);

	      // If this returned as a missing feature failure, remember that.
	      if (Match.back() == Match_MissingFeature)
	        ErrorInfoMissingFeature = ErrorInfoIgnore;
	    }

	    // Restore the size of the unsized memory operand if we modified it.
	    UnsizedMemOp->Mem.Size = 0;
	  }

	  // If we haven't matched anything yet, this is not a basic integer or FPU
	  // operation. There shouldn't be any ambiguity in our mnemonic table, so try
	  // matching with the unsized operand.
	  if (Match.empty()) {
	    Match.push_back(MatchInstruction(
	        Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()));
	    // If this returned as a missing feature failure, remember that.
	    if (Match.back() == Match_MissingFeature)
	      ErrorInfoMissingFeature = ErrorInfo;
	  }

	  // Restore the size of the unsized memory operand if we modified it.
	  if (UnsizedMemOp)
	    UnsizedMemOp->Mem.Size = 0;

	  // If it's a bad mnemonic, all results will be the same.
	  if (Match.back() == Match_MnemonicFail) {
	    return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
	                 Op.getLocRange(), MatchingInlineAsm);
	  }

	  unsigned NumSuccessfulMatches =
	      std::count(std::begin(Match), std::end(Match), Match_Success);

	  // If matching was ambiguous and we had size information from the frontend,
	  // try again with that. This handles cases like "movxz eax, m8/m16".
	  if (UnsizedMemOp && NumSuccessfulMatches > 1 &&
	      UnsizedMemOp->getMemFrontendSize()) {
	    UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize();
	    unsigned M = MatchInstruction(
	        Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax());
	    if (M == Match_Success)
	      NumSuccessfulMatches = 1;

	    // Add a rewrite that encodes the size information we used from the
	    // frontend.
	    InstInfo->AsmRewrites->emplace_back(
	        AOK_SizeDirective, UnsizedMemOp->getStartLoc(),
	        /*Len=*/0, UnsizedMemOp->getMemFrontendSize());
	  }

	  // If exactly one matched, then we treat that as a successful match (and the
	  // instruction will already have been filled in correctly, since the failing
	  // matches won't have modified it).
	  if (NumSuccessfulMatches == 1) {
	    if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
	      return true;
	    // Some instructions need post-processing to, for example, tweak which
	    // encoding is selected. Loop on it while changes happen so the individual
	    // transformations can chain off each other.
	    if (!MatchingInlineAsm)
	      while (processInstruction(Inst, Operands))
	        ;
	    Inst.setLoc(IDLoc);
	    if (!MatchingInlineAsm)
	      EmitInstruction(Inst, Operands, Out);
	    Opcode = Inst.getOpcode();
	    return false;
	  } else if (NumSuccessfulMatches > 1) {
	    assert(UnsizedMemOp &&
	           "multiple matches only possible with unsized memory operands");
	    return Error(UnsizedMemOp->getStartLoc(),
	                 "ambiguous operand size for instruction '" + Mnemonic + "\'",
	                 UnsizedMemOp->getLocRange());
	  }

	  // If one instruction matched with a missing feature, report this as a
	  // missing feature.
	  if (std::count(std::begin(Match), std::end(Match),
	                 Match_MissingFeature) == 1) {
	    ErrorInfo = ErrorInfoMissingFeature;
	    return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
	                               MatchingInlineAsm);
	  }

	  // If one instruction matched with an invalid operand, report this as an
	  // operand failure.
	  if (std::count(std::begin(Match), std::end(Match),
	                 Match_InvalidOperand) == 1) {
	    return Error(IDLoc, "invalid operand for instruction", EmptyRange,
	                 MatchingInlineAsm);
	  }

	  // If all of these were an outright failure, report it in a useless way.
	  return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
	               MatchingInlineAsm);
	}

	/// Returns true when \p RegNo is a member of the SEGMENT_REG register class.
	bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
	  const auto &SegmentRegs = X86MCRegisterClasses[X86::SEGMENT_REGRegClassID];
	  return SegmentRegs.contains(RegNo);
	}

	/// Dispatch an X86-specific assembler directive to its handler.
	///
	/// Handles .word, the .code* family, .att_syntax, .intel_syntax, .even and
	/// the .cv_fpo_* directives. Returns true when the directive is not one of
	/// these; otherwise returns whatever the selected handler returns.
	bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
	  MCAsmParser &Parser = getParser();
	  const StringRef IDVal = DirectiveID.getIdentifier();
	  const SMLoc Loc = DirectiveID.getLoc();

	  if (IDVal == ".word")
	    return ParseDirectiveWord(2, Loc);

	  if (IDVal.startswith(".code"))
	    return ParseDirectiveCode(IDVal, Loc);

	  if (IDVal.startswith(".att_syntax")) {
	    Parser.setParsingInlineAsm(false);
	    if (getLexer().isNot(AsmToken::EndOfStatement)) {
	      // An optional "prefix" argument is consumed; "noprefix" is rejected.
	      const StringRef Arg = Parser.getTok().getString();
	      if (Arg == "prefix")
	        Parser.Lex();
	      else if (Arg == "noprefix")
	        return Error(Loc, "'.att_syntax noprefix' is not "
	                          "supported: registers must have a "
	                          "'%' prefix in .att_syntax");
	    }
	    Parser.setAssemblerDialect(0);
	    return false;
	  }

	  if (IDVal.startswith(".intel_syntax")) {
	    Parser.setAssemblerDialect(1);
	    Parser.setParsingInlineAsm(true);
	    if (getLexer().isNot(AsmToken::EndOfStatement)) {
	      // An optional "noprefix" argument is consumed; "prefix" is rejected.
	      const StringRef Arg = Parser.getTok().getString();
	      if (Arg == "noprefix")
	        Parser.Lex();
	      else if (Arg == "prefix")
	        return Error(Loc, "'.intel_syntax prefix' is not "
	                          "supported: registers must not have "
	                          "a '%' prefix in .intel_syntax");
	    }
	    return false;
	  }

	  if (IDVal == ".even")
	    return parseDirectiveEven(Loc);
	  if (IDVal == ".cv_fpo_proc")
	    return parseDirectiveFPOProc(Loc);
	  if (IDVal == ".cv_fpo_setframe")
	    return parseDirectiveFPOSetFrame(Loc);
	  if (IDVal == ".cv_fpo_pushreg")
	    return parseDirectiveFPOPushReg(Loc);
	  if (IDVal == ".cv_fpo_stackalloc")
	    return parseDirectiveFPOStackAlloc(Loc);
	  if (IDVal == ".cv_fpo_endprologue")
	    return parseDirectiveFPOEndPrologue(Loc);
	  if (IDVal == ".cv_fpo_endproc")
	    return parseDirectiveFPOEndProc(Loc);

	  // Not a directive handled here.
	  return true;
	}

	/// parseDirectiveEven
	///  ::= .even
	///
	/// Aligns the current section to 2, using a code alignment when the section
	/// reports UseCodeAlign() and a value alignment otherwise. Sections are
	/// initialised first if there is no current section. Always returns false;
	/// a trailing token is reported through TokError().
	bool X86AsmParser::parseDirectiveEven(SMLoc L) {
	  if (getLexer().isNot(AsmToken::EndOfStatement)) {
	    TokError("unexpected token in directive");
	    return false;
	  }

	  auto &Streamer = getStreamer();
	  const MCSection *Section = Streamer.getCurrentSectionOnly();
	  if (!Section) {
	    Streamer.InitSections(false);
	    Section = Streamer.getCurrentSectionOnly();
	  }

	  if (!Section->UseCodeAlign())
	    Streamer.EmitValueToAlignment(2, 0, 1, 0);
	  else
	    Streamer.EmitCodeAlignment(2, 0);
	  return false;
	}
	/// ParseDirectiveWord
	///  ::= .word [ expression (, expression)* ]
	///
	/// Emits each comma-separated expression as a \p Size byte value. Constant
	/// expressions are range-checked against 8 * Size bits and emitted as
	/// integers; other expressions are emitted as values. Returns the result of
	/// Error() for an out-of-range literal and false in every other case.
	bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
	  MCAsmParser &Parser = getParser();
	  if (getLexer().isNot(AsmToken::EndOfStatement)) {
	    while (true) {
	      SMLoc ExprLoc = getLexer().getLoc();
	      const MCExpr *Value;
	      if (getParser().parseExpression(Value))
	        return false;

	      const auto *MCE = dyn_cast<MCConstantExpr>(Value);
	      if (!MCE) {
	        getStreamer().EmitValue(Value, Size, ExprLoc);
	      } else {
	        assert(Size <= 8 && "Invalid size");
	        uint64_t IntValue = MCE->getValue();
	        // Accept the literal if it fits as either unsigned or signed.
	        const unsigned Bits = 8 * Size;
	        if (!(isUIntN(Bits, IntValue) || isIntN(Bits, IntValue)))
	          return Error(ExprLoc, "literal value out of range for directive");
	        getStreamer().EmitIntValue(IntValue, Size);
	      }

	      if (getLexer().is(AsmToken::EndOfStatement))
	        break;

	      // FIXME: Improve diagnostic.
	      if (getLexer().isNot(AsmToken::Comma)) {
	        Error(L, "unexpected token in directive");
	        return false;
	      }
	      // Step over the comma and parse the next expression.
	      Parser.Lex();
	    }
	  }

	  Parser.Lex();
	  return false;
	}

	/// ParseDirectiveCode
	///  ::= .code16 | .code32 | .code64
	///
	/// Also accepts .code16gcc. Clears Code16GCC, consumes the directive token
	/// and, if the parser is not already in the requested mode, switches mode
	/// and emits the matching assembler flag. An unrecognised directive is
	/// reported through Error(). Always returns false.
	bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
	  MCAsmParser &Parser = getParser();
	  Code16GCC = false;

	  const bool IsCode16GCC = IDVal == ".code16gcc";
	  if (IDVal == ".code16" || IsCode16GCC) {
	    Parser.Lex();
	    // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
	    if (IsCode16GCC)
	      Code16GCC = true;
	    if (!is16BitMode()) {
	      SwitchMode(X86::Mode16Bit);
	      Parser.getStreamer().EmitAssemblerFlag(MCAF_Code16);
	    }
	  } else if (IDVal == ".code32") {
	    Parser.Lex();
	    if (!is32BitMode()) {
	      SwitchMode(X86::Mode32Bit);
	      Parser.getStreamer().EmitAssemblerFlag(MCAF_Code32);
	    }
	  } else if (IDVal == ".code64") {
	    Parser.Lex();
	    if (!is64BitMode()) {
	      SwitchMode(X86::Mode64Bit);
	      Parser.getStreamer().EmitAssemblerFlag(MCAF_Code64);
	    }
	  } else {
	    Error(L, "unknown directive " + IDVal);
	  }

	  return false;
	}

	// .cv_fpo_proc foo
	//
	// Parses a symbol name followed by an integer parameter byte count, checks
	// that the count fits in 32 unsigned bits, and forwards both to the target
	// streamer. Returns true when parsing fails.
	bool X86AsmParser::parseDirectiveFPOProc(SMLoc L) {
	  MCAsmParser &Parser = getParser();

	  StringRef ProcName;
	  if (Parser.parseIdentifier(ProcName))
	    return Parser.TokError("expected symbol name");

	  int64_t ParamsSize;
	  if (Parser.parseIntToken(ParamsSize, "expected parameter byte count"))
	    return true;
	  if (!isUIntN(32, ParamsSize))
	    return Parser.TokError("parameters size out of range");

	  if (Parser.parseEOL("unexpected tokens"))
	    return addErrorSuffix(" in '.cv_fpo_proc' directive");

	  MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
	  return getTargetStreamer().emitFPOProc(ProcSym, ParamsSize, L);
	}

	// .cv_fpo_setframe ebp
	//
	// Parses one register followed by end of line and forwards the register to
	// the target streamer. On a parse failure the directive name is appended to
	// the pending error and the result of addErrorSuffix() is returned.
	bool X86AsmParser::parseDirectiveFPOSetFrame(SMLoc L) {
	  MCAsmParser &Parser = getParser();
	  unsigned Reg;
	  SMLoc DummyLoc; // Register start/end locations are not needed here.
	  if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
	      Parser.parseEOL("unexpected tokens"))
	    return addErrorSuffix(" in '.cv_fpo_setframe' directive");
	  return getTargetStreamer().emitFPOSetFrame(Reg, L);
	}

	// .cv_fpo_pushreg ebx
	//
	// Parses one register followed by end of line and forwards the register to
	// the target streamer. On a parse failure the directive name is appended to
	// the pending error and the result of addErrorSuffix() is returned.
	bool X86AsmParser::parseDirectiveFPOPushReg(SMLoc L) {
	  MCAsmParser &Parser = getParser();
	  unsigned Reg;
	  SMLoc DummyLoc; // Register start/end locations are not needed here.
	  if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
	      Parser.parseEOL("unexpected tokens"))
	    return addErrorSuffix(" in '.cv_fpo_pushreg' directive");
	  return getTargetStreamer().emitFPOPushReg(Reg, L);
	}

	// .cv_fpo_stackalloc 20
	//
	// Parses one integer offset followed by end of line and forwards the offset
	// to the target streamer. On a parse failure the directive name is appended
	// to the pending error and the result of addErrorSuffix() is returned.
	bool X86AsmParser::parseDirectiveFPOStackAlloc(SMLoc L) {
	  MCAsmParser &Parser = getParser();
	  int64_t Offset;
	  if (Parser.parseIntToken(Offset, "expected offset") ||
	      Parser.parseEOL("unexpected tokens"))
	    return addErrorSuffix(" in '.cv_fpo_stackalloc' directive");
	  return getTargetStreamer().emitFPOStackAlloc(Offset, L);
	}

	// .cv_fpo_endprologue
	//
	// Takes no arguments: requires end of line, then notifies the target
	// streamer.
	bool X86AsmParser::parseDirectiveFPOEndPrologue(SMLoc L) {
	  const bool HasTrailingTokens = getParser().parseEOL("unexpected tokens");
	  if (HasTrailingTokens)
	    return addErrorSuffix(" in '.cv_fpo_endprologue' directive");
	  return getTargetStreamer().emitFPOEndPrologue(L);
	}

	// .cv_fpo_endproc
	//
	// Takes no arguments: requires end of line, then notifies the target
	// streamer.
	bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) {
	  const bool HasTrailingTokens = getParser().parseEOL("unexpected tokens");
	  if (HasTrailingTokens)
	    return addErrorSuffix(" in '.cv_fpo_endproc' directive");
	  return getTargetStreamer().emitFPOEndProc(L);
	}

	// Force static initialization.
	extern "C" void LLVMInitializeX86AsmParser() {
	RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
	RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target());
	}

	#define GET_REGISTER_MATCHER
	#define GET_MATCHER_IMPLEMENTATION
	#define GET_SUBTARGET_FEATURE_NAME
	#include "X86GenAsmMatcher.inc"
	Index: vendor/llvm/dist-release_60/lib/Target/X86/X86ISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/X86/X86ISelLowering.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/X86/X86ISelLowering.cpp (revision 328362)
	@@ -1,38714 +1,38736 @@
	//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86ISelLowering.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "X86CallingConv.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86IntrinsicsInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86ShuffleDecodeConstantPool.h"
	#include "X86TargetMachine.h"
	#include "X86TargetObjectFile.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cctype>
	#include <numeric>
	using namespace llvm;

	#define DEBUG_TYPE "x86-isel"

	STATISTIC(NumTailCalls, "Number of tail calls");

	// -x86-experimental-vector-widening-legalization (bool, defaults to false,
	// marked cl::Hidden): selects widening instead of promotion for vector type
	// legalization.
	static cl::opt<bool> ExperimentalVectorWideningLegalization(
	"x86-experimental-vector-widening-legalization", cl::init(false),
	cl::desc("Enable an experimental vector type legalization through widening "
	"rather than promotion."),
	cl::Hidden);

	// -x86-experimental-pref-loop-alignment (int, defaults to 4, marked
	// cl::Hidden): number of low bits of the loop header PC that will be zero.
	static cl::opt<int> ExperimentalPrefLoopAlignment(
	"x86-experimental-pref-loop-alignment", cl::init(4),
	cl::desc("Sets the preferable loop alignment for experiments "
	"(the last x86-experimental-pref-loop-alignment bits"
	" of the loop header PC will be 0)."),
	cl::Hidden);

	// -mul-constant-optimization (bool, defaults to true, marked cl::Hidden):
	// allows 'mul x, Const' to be replaced by other instructions such as
	// SHIFT or LEA.
	static cl::opt<bool> MulConstantOptimization(
	"mul-constant-optimization", cl::init(true),
	cl::desc("Replace 'mul x, Const' with more effective instructions like "
	"SHIFT, LEA, etc."),
	cl::Hidden);

	/// Report an unsupported construct (for example returning a double without
	/// SSE2 enabled on x86_64) as a diagnostic on the DAG's context. Unlike
	/// report_fatal_error this does not abort, so the caller is expected to
	/// recover and carry on without crashing.
	static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
	                             const char *Msg) {
	  const auto &Fn = DAG.getMachineFunction().getFunction();
	  // NOTE(review): the diagnostic object is deliberately built as a temporary
	  // inside the call expression, exactly as before; do not hoist it into a
	  // named local without checking how it stores Msg.
	  DAG.getContext()->diagnose(
	      DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
	}

	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
	X86ScalarSSEf64 = Subtarget.hasSSE2();
	X86ScalarSSEf32 = Subtarget.hasSSE1();
	MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

	// Set up the TargetLowering object.

	// X86 is weird. It always uses i8 for shift amounts and setcc results.
	setBooleanContents(ZeroOrOneBooleanContent);
	// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// For 64-bit, since we have so many registers, use the ILP scheduler.
	// For 32-bit, use the register pressure specific scheduling.
	// For Atom, always use ILP scheduling.
	if (Subtarget.isAtom())
	setSchedulingPreference(Sched::ILP);
	else if (Subtarget.is64Bit())
	setSchedulingPreference(Sched::ILP);
	else
	setSchedulingPreference(Sched::RegPressure);
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

	// Bypass expensive divides and use cheaper ones.
	if (TM.getOptLevel() >= CodeGenOpt::Default) {
	if (Subtarget.hasSlowDivide32())
	addBypassSlowDiv(32, 8);
	if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
	addBypassSlowDiv(64, 32);
	}

	if (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()) {
	// Setup Windows compiler runtime calls.
	setLibcallName(RTLIB::SDIV_I64, "_alldiv");
	setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
	setLibcallName(RTLIB::SREM_I64, "_allrem");
	setLibcallName(RTLIB::UREM_I64, "_aullrem");
	setLibcallName(RTLIB::MUL_I64, "_allmul");
	setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
	setUseUnderscoreSetJmp(false);
	setUseUnderscoreLongJmp(false);
	} else if (Subtarget.isTargetWindowsGNU()) {
	// MS runtime is weird: it exports _setjmp, but longjmp!
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(false);
	} else {
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(true);
	}

	// Set up the register classes.
	addRegisterClass(MVT::i8, &X86::GR8RegClass);
	addRegisterClass(MVT::i16, &X86::GR16RegClass);
	addRegisterClass(MVT::i32, &X86::GR32RegClass);
	if (Subtarget.is64Bit())
	addRegisterClass(MVT::i64, &X86::GR64RegClass);

	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// We don't accept any truncstore of integer registers.
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i32, MVT::i16, Expand);
	setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// SETOEQ and SETUNE require checking two conditions.
	setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

	// Integer absolute.
	if (Subtarget.hasCMov()) {
	setOperationAction(ISD::ABS , MVT::i16 , Custom);
	setOperationAction(ISD::ABS , MVT::i32 , Custom);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::ABS , MVT::i64 , Custom);
	}

	// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
	// operation.
	setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
	// f32/f64 are legal, f80 is custom.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	else
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	} else if (!Subtarget.useSoftFloat()) {
	// We have an algorithm for SSE2->double, and we turn this into a
	// 64-bit FILD followed by conditional FADD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	// We have an algorithm for SSE2, and we turn this into a 64-bit
	// FILD or VCVTUSI2SS/SD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	}

	// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// SSE has no i16 to fp conversion, only i32.
	if (X86ScalarSSEf32) {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
	}

	// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

	if (X86ScalarSSEf32) {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
	}

	// Handle FP_TO_UINT by promoting the destination to a larger signed
	// conversion.
	setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
	}
	} else if (!Subtarget.useSoftFloat()) {
	// Since AVX is a superset of SSE3, only check for SSE here.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
	// Expand FP_TO_UINT into a select.
	// FIXME: We would like to use a Custom expander here eventually to do
	// the optimal thing for SSE vs. the default expansion in the legalizer.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
	else
	// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
	// With SSE3 we can use fisttpll to convert to a signed i64; without
	// SSE, we're stuck with a fistpll.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	}

	// TODO: when we have SSE, these could be more efficient, by using movd/movq.
	if (!X86ScalarSSEf64) {
	setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
	setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
	// Without SSE, i64->f64 goes through memory.
	setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
	}
	} else if (!Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

	// Scalar integer divide and remainder are lowered to use operations that
	// produce two results, to match the available instructions. This exposes
	// the two-result form to trivial CSE, which is able to combine x/y and x%y
	// into a single instruction.
	//
	// Scalar integer multiply-high is also lowered to use two-result
	// operations, to match the available instructions. However, plain multiply
	// (low) operations are left as Legal, as there are single-result
	// instructions for this in x86. Using the two-result multiply instructions
	// when both high and low results are needed must be arranged by dagcombine.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	}

	setOperationAction(ISD::BR_JT , MVT::Other, Expand);
	setOperationAction(ISD::BRCOND , MVT::Other, Custom);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
	MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::BR_CC, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	}
	if (Subtarget.is64Bit())
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
	setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

	setOperationAction(ISD::FREM , MVT::f32 , Expand);
	setOperationAction(ISD::FREM , MVT::f64 , Expand);
	setOperationAction(ISD::FREM , MVT::f80 , Expand);
	setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

	// Promote the i8 variants and force them on up to i32 which has a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	if (!Subtarget.hasBMI()) {
	setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
	}
	}

	if (Subtarget.hasLZCNT()) {
	// When promoting the i8 variants, force them to i32 for a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	} else {
	setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
	}
	}

	// Special handling for half-precision floating point conversions.
	// If we don't have F16C support, then lower half float conversions
	// into library calls.
	if (Subtarget.useSoftFloat() \|\| !Subtarget.hasF16C()) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	}

	// There's never any support for operations beyond MVT::f32.
	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f80, MVT::f16, Expand);

	if (Subtarget.hasPOPCNT()) {
	setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
	} else {
	setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
	}

	setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

	if (!Subtarget.hasMOVBE())
	setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

	// These should be promoted to a larger select which is supported.
	setOperationAction(ISD::SELECT , MVT::i1 , Promote);
	// X86 wants to expand cmov itself.
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}

	// Custom action for SELECT MMX and expand action for SELECT_CC MMX
	setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

	setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
	// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
	// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	// Darwin ABI issue.
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::ConstantPool , VT, Custom);
	setOperationAction(ISD::JumpTable , VT, Custom);
	setOperationAction(ISD::GlobalAddress , VT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
	setOperationAction(ISD::ExternalSymbol , VT, Custom);
	setOperationAction(ISD::BlockAddress , VT, Custom);
	}

	// 64-bit shl, sra, srl (iff 32-bit x86)
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SHL_PARTS, VT, Custom);
	setOperationAction(ISD::SRA_PARTS, VT, Custom);
	setOperationAction(ISD::SRL_PARTS, VT, Custom);
	}

	if (Subtarget.hasSSEPrefetch() \|\| Subtarget.has3DNow())
	setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

	setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

	// Expand certain atomics
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
	}

	if (Subtarget.hasCmpxchg16b()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
	}

	// FIXME - use subtarget debug flags
	if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
	!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
	TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
	setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
	}

	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	bool Is64Bit = Subtarget.is64Bit();
	setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

	// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
	setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
	setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

	if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
	// f32 and f64 use SSE.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
	: &X86::FR64RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	// Use ANDPD to simulate FABS.
	setOperationAction(ISD::FABS, VT, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG, VT, Custom);

	// Use ANDPD and ORPD to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}

	// Lower this to MOVMSK plus an AND.
	setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
	setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

	// Expand FP immediates into loads from the stack, except for the special
	// cases we handle.
	addLegalFPImmediate(APFloat(+0.0)); // xorpd
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	} else if (UseX87 && X86ScalarSSEf32) {
	// Use SSE for f32, x87 for f64.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, &X86::FR32RegClass);
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);

	// Use ANDPS to simulate FABS.
	setOperationAction(ISD::FABS , MVT::f32, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG , MVT::f32, Custom);

	setOperationAction(ISD::UNDEF, MVT::f64, Expand);

	// Use ANDPS and ORPS to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

	// Special cases we handle for FP constants.
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f64, Expand);
	setOperationAction(ISD::FCOS , MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	} else if (UseX87) {
	// f32 and f64 in x87.
	// Set up the FP register classes.
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);
	addRegisterClass(MVT::f32, &X86::RFP32RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	setOperationAction(ISD::UNDEF, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
	addLegalFPImmediate(APFloat(+0.0f)); // FLD0
	addLegalFPImmediate(APFloat(+1.0f)); // FLD1
	addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
	}

	// We don't support FMA.
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);

	// Long double always uses X87, except f128 in MMX.
	if (UseX87) {
	if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::f128, &X86::FR128RegClass);
	ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
	setOperationAction(ISD::FABS , MVT::f128, Custom);
	setOperationAction(ISD::FNEG , MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
	}

	addRegisterClass(MVT::f80, &X86::RFP80RegClass);
	setOperationAction(ISD::UNDEF, MVT::f80, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
	{
	APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
	addLegalFPImmediate(TmpFlt); // FLD0
	TmpFlt.changeSign();
	addLegalFPImmediate(TmpFlt); // FLD0/FCHS

	bool ignored;
	APFloat TmpFlt2(+1.0);
	TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
	&ignored);
	addLegalFPImmediate(TmpFlt2); // FLD1
	TmpFlt2.changeSign();
	addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
	}

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f80, Expand);
	setOperationAction(ISD::FCOS , MVT::f80, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

	setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
	setOperationAction(ISD::FCEIL, MVT::f80, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
	setOperationAction(ISD::FRINT, MVT::f80, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
	setOperationAction(ISD::FMA, MVT::f80, Expand);
	}

	// Always use a library call for pow.
	setOperationAction(ISD::FPOW , MVT::f32 , Expand);
	setOperationAction(ISD::FPOW , MVT::f64 , Expand);
	setOperationAction(ISD::FPOW , MVT::f80 , Expand);

	setOperationAction(ISD::FLOG, MVT::f80, Expand);
	setOperationAction(ISD::FLOG2, MVT::f80, Expand);
	setOperationAction(ISD::FLOG10, MVT::f80, Expand);
	setOperationAction(ISD::FEXP, MVT::f80, Expand);
	setOperationAction(ISD::FEXP2, MVT::f80, Expand);
	setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

	// Some FP actions are always expanded for vector types.
	for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
	MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	}

	// First set operation action for all vector types to either promote
	// (for widening) or expand (for scalarization). Then we will selectively
	// turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
	setOperationAction(ISD::TRUNCATE, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(InnerVT, VT, Expand);

	setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

	// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
	// types, we have to deal with them whether we ask for Expansion or not.
	// Setting Expand causes its own optimisation problems though, so leave
	// them legal.
	if (VT.getVectorElementType() == MVT::i1)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
	// split/scalarized right now.
	if (VT.getVectorElementType() == MVT::f16)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
	}
	}

	// FIXME: In order to prevent SSE instructions being expanded to MMX ones
	// with -msoft-float, disable use of MMX as well.
	if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
	// No operations on x86mmx supported, everything uses intrinsics.
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
	setOperationAction(ISD::FABS, MVT::v4f32, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
	addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
	// registers cannot be used even for integer operations.
	addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::MUL, MVT::v16i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
	setOperationAction(ISD::MUL, MVT::v8i16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
	setOperationAction(ISD::FABS, MVT::v2f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

	setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	}

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// We support custom legalizing of sext and anyext loads for specific
	// memory vector types which we can load as a scalar (or sequence of
	// scalars) and extend in-register to a legal 128-bit vector type. For sext
	// loads these must work with a single scalar load.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
	}

	for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);

	if (VT == MVT::v2i64 && !Subtarget.is64Bit())
	continue;

	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
	}

	// Custom lower v2i64 and v2f64 selects.
	setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

	// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

	setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

	// In the customized shift lowering, the legal v4i32/v2i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::FRINT, RoundedTy, Legal);
	setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
	}

	setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
	setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

	// FIXME: Do we need to handle scalar-to-vector here?
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);

	// We directly match byte blends in the backend as they match the VSELECT
	// condition form.
	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

	// SSE41 brings specific instructions for doing vector sign extend even in
	// cases where we don't have SRA.
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
	}

	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
	}

	// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
	}

	// i8 vectors are custom because the source register and source
	// memory operand types are not the same width.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::ROTL, VT, Custom);

	// XOP can efficiently perform BITREVERSE with VPPERM.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
	bool HasInt256 = Subtarget.hasInt256();

	addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);

	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
	// even though v8i16 is a legal type.
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

	// In the customized shift lowering, the legal v8i32/v4i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

	for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	}

	if (Subtarget.hasAnyFMA()) {
	for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
	MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::FMA, VT, Legal);
	}

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
	}

	setOperationAction(ISD::MUL, MVT::v4i64, Custom);
	setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v32i8, Custom);

	setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

	setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
	}

	if (HasInt256) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

	// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
	// when we have a 256bit-wide blend with immediate.
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

	// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
	}
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 128-bit but the source is 256-bit wide.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
	}

	// Custom lower several nodes for 256-bit types.
	for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	}

	if (HasInt256)
	setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

	// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
	}

	if (HasInt256) {
	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
	setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MGATHER, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
	addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
	addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
	addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

	addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
	addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
	addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

	setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);

	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
	if (Subtarget.hasVLX()) {
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
	}

	// Extends of v16i1/v8i1 to 128-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);

	for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);

	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
	for (auto VT : { MVT::v1i1, MVT::v8i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
	}

	for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
	MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
	MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
	setTruncStoreAction(VT, MaskVT, Custom);
	}

	for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
	setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

	setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

	if (!Subtarget.hasVLX()) {
	// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
	// to 512-bit rather than use the AVX2 instructions so that we can use
	// k-masks.
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
	setOperationAction(ISD::MLOAD, VT, Custom);
	setOperationAction(ISD::MSTORE, VT, Custom);
	}
	}

	setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	}

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

	// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

	setOperationAction(ISD::MUL, MVT::v8i64, Custom);
	setOperationAction(ISD::MUL, MVT::v16i32, Legal);

	setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

	setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	// Need to promote to 64-bit even though we have 32-bit masked instructions
	// because the IR optimizers rearrange bitcasts around logic ops leaving
	// too many variations to handle if we don't promote them.
	setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

	if (Subtarget.hasDQI()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
	}

	if (Subtarget.hasCDI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v16i32, MVT::v8i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 256-bit but the source is 512-bit wide.
	// 128-bit was made Legal under AVX1.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

	for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::MGATHER, VT, Custom);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
	}
	}// has AVX-512

	if (!Subtarget.useSoftFloat() &&
	(Subtarget.hasAVX512() \|\| Subtarget.hasVLX())) {
	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MSCATTER, VT, Custom);

	if (Subtarget.hasDQI()) {
	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SINT_TO_FP, VT, Legal);
	setOperationAction(ISD::UINT_TO_FP, VT, Legal);
	setOperationAction(ISD::FP_TO_SINT, VT, Legal);
	setOperationAction(ISD::FP_TO_UINT, VT, Legal);
	}
	}

	if (Subtarget.hasCDI()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
	addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

	addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
	addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

	for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
	for (auto VT : { MVT::v16i1, MVT::v32i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v32i1 masks to 256-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
	// Extends from v64i1 masks to 512-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);

	setOperationAction(ISD::MUL, MVT::v32i16, Legal);
	setOperationAction(ISD::MUL, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

	setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);

	setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
	}

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	}

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v64i8, MVT::v32i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
	(Subtarget.hasAVX512() \|\| Subtarget.hasVLX())) {
	for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
	}

	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
	addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
	addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

	for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	// TODO: v8i1 concat should be legal without VLX to support concats of
	// v1i1, but we won't legalize it correctly currently without introducing
	// a v4i1 concat in the middle.
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
	for (auto VT : { MVT::v2i1, MVT::v4i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v2i1/v4i1 masks to 128-bit vectors.
	setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);

	setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

	if (Subtarget.hasDQI()) {
	// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
	// v2f32 UINT_TO_FP is already custom under SSE2.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
	assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
	"Unexpected operation action!");
	// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
	}

	if (Subtarget.hasBWI()) {
	setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
	}

	// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
	// handle type legalization for these operations here.
	//
	// FIXME: We really should do custom legalization for addition and
	// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
	// than generic legalization for 64-bit multiplication-with-overflow, though.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	// Add/Sub/Mul with overflow operations are custom lowered.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);
	setOperationAction(ISD::SMULO, VT, Custom);
	setOperationAction(ISD::UMULO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);
	setOperationAction(ISD::SETCCCARRY, VT, Custom);
	}

	if (!Subtarget.is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	setLibcallName(RTLIB::MUL_I128, nullptr);
	}

	// Combine sin / cos into _sincos_stret if it is available.
	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}

	if (Subtarget.isTargetWin64()) {
	setOperationAction(ISD::SDIV, MVT::i128, Custom);
	setOperationAction(ISD::UDIV, MVT::i128, Custom);
	setOperationAction(ISD::SREM, MVT::i128, Custom);
	setOperationAction(ISD::UREM, MVT::i128, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
	}

	// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
	// is. We should promote the value to 64-bits to solve this.
	// This is what the CRT headers do - `fmodf` is an inline header
	// function casting to f64 and calling `fmod`.
	if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()))
	for (ISD::NodeType Op :
	{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
	ISD::FLOG10, ISD::FPOW, ISD::FSIN})
	if (isOperationExpand(Op, MVT::f32))
	setOperationAction(Op, MVT::f32, Promote);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
	setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::VSELECT);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FMA);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::MLOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::MSTORE);
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::MSCATTER);
	setTargetDAGCombine(ISD::MGATHER);

	computeRegisterProperties(Subtarget.getRegisterInfo());

	MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 4;

	// TODO: These control memcmp expansion in CGP and could be raised higher, but
	// that needs to be benchmarked and balanced with the potential use of vector
	// load/store types (PR33329, PR33914).
	MaxLoadsPerMemcmp = 2;
	MaxLoadsPerMemcmpOptSize = 2;

	// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
	setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

	// An out-of-order CPU can speculatively execute past a predictable branch,
	// but a conditional move could be stalled by an expensive earlier operation.
	PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
	EnableExtLdPromotion = true;
	setPrefFunctionAlignment(4); // 2^4 bytes.

	verifyIntrinsicTables();
	}

	/// Returns true only when the subtarget is both 64-bit and MachO.
	/// This has so far only been implemented for 64-bit MachO.
	bool X86TargetLowering::useLoadStackGuardNode() const {
	  if (!Subtarget.is64Bit())
	    return false;
	  return Subtarget.isTargetMachO();
	}

	/// Returns true when the subtarget's target triple reports an MSVC C runtime.
	/// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
	bool X86TargetLowering::useStackGuardXorFP() const {
	  const auto &TargetTriple = Subtarget.getTargetTriple();
	  return TargetTriple.isOSMSVCRT();
	}

	/// Builds a machine node that takes \p Val as its only operand and returns
	/// result 0 of that node.
	///
	/// The opcode is X86::XOR64_FP on 64-bit subtargets and X86::XOR32_FP
	/// otherwise; the node's result type is the pointer type for the DAG's data
	/// layout.
	SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	                                               const SDLoc &DL) const {
	  unsigned Opcode = X86::XOR32_FP;
	  if (Subtarget.is64Bit())
	    Opcode = X86::XOR64_FP;

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  return SDValue(DAG.getMachineNode(Opcode, DL, PtrVT, Val), 0);
	}

	/// Selects the legalization action preferred for the vector type \p VT.
	///
	/// When ExperimentalVectorWideningLegalization is set, any vector whose
	/// element count is not 1 and whose element type is not i1 is widened
	/// (TypeWidenVector). Every other case defers to the TargetLoweringBase
	/// implementation.
	///
	/// \param VT the vector type being queried; getVectorNumElements() is called
	///           on it unconditionally when the widening flag is set.
	/// \returns the preferred LegalizeTypeAction for \p VT.
	TargetLoweringBase::LegalizeTypeAction
	X86TargetLowering::getPreferredVectorAction(EVT VT) const {
	  // Compare the element type as an EVT instead of going through
	  // getSimpleVT(): the result is the same for simple element types, and an
	  // extended element type (which can never be i1) no longer has to be
	  // converted to an MVT first.
	  if (ExperimentalVectorWideningLegalization &&
	      VT.getVectorNumElements() != 1 &&
	      VT.getVectorElementType() != MVT::i1)
	    return TypeWidenVector;

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	/// Returns the value type produced by a SETCC on operands of type \p VT.
	///
	/// - Non-vector types yield MVT::i8.
	/// - With AVX-512, \p VT is first followed through type legalization. If the
	///   legal type is a 512-bit vector, or if VLX is available and the legal
	///   vector has elements of at least 32 bits (any element width when BWI is
	///   also available), the result is a vector of i1 with as many elements as
	///   \p VT.
	/// - Otherwise the result is \p VT with its element type changed to integer.
	///
	/// \param DL      data layout; not consulted by this implementation.
	/// \param Context context used for legalization queries and for building
	///                the vXi1 result type.
	/// \param VT      operand type of the comparison.
	EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
	                                          LLVMContext &Context,
	                                          EVT VT) const {
	  if (!VT.isVector())
	    return MVT::i8;

	  if (Subtarget.hasAVX512()) {
	    const unsigned NumElts = VT.getVectorNumElements();

	    // Figure out what this type will be legalized to.
	    EVT LegalVT = VT;
	    while (getTypeAction(Context, LegalVT) != TypeLegal)
	      LegalVT = getTypeToTransformTo(Context, LegalVT);

	    // The loop above only exits on a legal type, which the original code
	    // already treated as simple by calling getSimpleVT() on it.
	    const MVT LegalSimpleVT = LegalVT.getSimpleVT();

	    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
	    if (LegalSimpleVT.is512BitVector())
	      return EVT::getVectorVT(Context, MVT::i1, NumElts);

	    if (LegalSimpleVT.isVector() && Subtarget.hasVLX()) {
	      // If we legalized to less than a 512-bit vector, then we will use a
	      // vXi1 compare for vXi32/vXi64 for sure. If we have BWI we will also
	      // support vXi16/vXi8.
	      MVT EltVT = LegalSimpleVT.getVectorElementType();
	      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
	        return EVT::getVectorVT(Context, MVT::i1, NumElts);
	    }
	  }

	  return VT.changeVectorElementTypeToInteger();
	}

	/// Helper for getByValTypeAlignment: raises \p MaxAlign according to the
	/// vectors found in \p Ty.
	///
	/// A 128-bit vector type sets \p MaxAlign to 16. Arrays and structs are
	/// searched recursively through their element types, each starting from an
	/// alignment of 0, and \p MaxAlign is raised to the largest value found.
	/// Nothing is done once \p MaxAlign has reached 16, and types that are not
	/// vectors, arrays, or structs leave it untouched.
	static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
	  // 16 is the largest value this helper ever produces; stop early.
	  if (MaxAlign == 16)
	    return;

	  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
	    if (VecTy->getBitWidth() == 128)
	      MaxAlign = 16;
	    return;
	  }

	  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
	    unsigned ElemAlign = 0;
	    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
	    if (MaxAlign < ElemAlign)
	      MaxAlign = ElemAlign;
	    return;
	  }

	  auto *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return;

	  for (auto *MemberTy : StructTy->elements()) {
	    unsigned MemberAlign = 0;
	    getMaxByValAlign(MemberTy, MemberAlign);
	    if (MaxAlign < MemberAlign)
	      MaxAlign = MemberAlign;
	    // No later member can raise the result any further.
	    if (MaxAlign == 16)
	      break;
	  }
	}

	/// Return the desired alignment for ByVal aggregate function arguments in the
	/// caller parameter area.
	///
	/// On 64-bit subtargets this is the larger of 8 and the ABI alignment of
	/// \p Ty. On other subtargets it is 4, except that with SSE1 available
	/// getMaxByValAlign may raise it to 16 for aggregates that contain 128-bit
	/// vectors.
	unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  if (!Subtarget.is64Bit()) {
	    unsigned Result = 4;
	    if (Subtarget.hasSSE1())
	      getMaxByValAlign(Ty, Result);
	    return Result;
	  }

	  // Max of 8 and alignment of type.
	  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
	  return ABIAlign > 8 ? ABIAlign : 8;
	}

	/// Returns the target specific optimal type for load and store operations as
	/// a result of memset, memcpy, and memmove lowering.
	///
	/// If DstAlign is zero, the destination alignment can satisfy any
	/// constraint. Similarly, if SrcAlign is zero there is no need to check it
	/// against the alignment requirement, probably because the source does not
	/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
	/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
	/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	/// not need to be loaded.
	/// It returns EVT::Other if the type should be determined using generic
	/// target-independent logic.
	EVT
	X86TargetLowering::getOptimalMemOpType(uint64_t Size,
	                                       unsigned DstAlign, unsigned SrcAlign,
	                                       bool IsMemset, bool ZeroMemset,
	                                       bool MemcpyStrSrc,
	                                       MachineFunction &MF) const {
	  const Function &F = MF.getFunction();
	  // Vector/FP types are only considered when implicit FP use is allowed.
	  if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
	    if (Size >= 16 &&
	        (!Subtarget.isUnalignedMem16Slow() ||
	         ((DstAlign == 0 || DstAlign >= 16) &&
	          (SrcAlign == 0 || SrcAlign >= 16)))) {
	      // FIXME: Check if unaligned 32-byte accesses are slow.
	      if (Size >= 32 && Subtarget.hasAVX()) {
	        // Although this isn't a well-supported type for AVX1, we'll let
	        // legalization and shuffle lowering produce the optimal codegen. If we
	        // choose an optimal type with a vector element larger than a byte,
	        // getMemsetStores() may create an intermediate splat (using an integer
	        // multiply) before we splat as a vector.
	        return MVT::v32i8;
	      }
	      if (Subtarget.hasSSE2())
	        return MVT::v16i8;
	      // TODO: Can SSE1 handle a byte vector?
	      if (Subtarget.hasSSE1())
	        return MVT::v4f32;
	    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
	               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
	      // Do not use f64 to lower memcpy if source is string constant. It's
	      // better to use i32 to avoid the loads.
	      // Also, do not use f64 to lower memset unless this is a memset of zeros.
	      // The gymnastics of splatting a byte value into an XMM register and then
	      // only using 8-byte stores (because this is a CPU with slow unaligned
	      // 16-byte accesses) makes that a loser.
	      return MVT::f64;
	    }
	  }
	  // This is a compromise. If we reach here, unaligned accesses may be slow on
	  // this target. However, creating smaller, aligned accesses could be even
	  // slower and would certainly be a lot more code.
	  if (Subtarget.is64Bit() && Size >= 8)
	    return MVT::i64;
	  return MVT::i32;
	}

	/// Return whether \p VT may be used for memory-operation lowering: f32 and
	/// f64 are allowed only when the corresponding scalar SSE flag is set; every
	/// other type is accepted.
	bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
	  switch (VT.SimpleTy) {
	  case MVT::f32:
	    return X86ScalarSSEf32;
	  case MVT::f64:
	    return X86ScalarSSEf64;
	  default:
	    return true;
	  }
	}

	/// Misaligned accesses of any size are always allowed, so this always
	/// returns true. When \p Fast is non-null it additionally reports whether an
	/// access of \p VT's size is fast on this subtarget.
	bool
	X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                  unsigned,
	                                                  unsigned,
	                                                  bool *Fast) const {
	  if (!Fast)
	    return true;

	  const unsigned SizeInBits = VT.getSizeInBits();
	  if (SizeInBits == 128) {
	    *Fast = !Subtarget.isUnalignedMem16Slow();
	  } else if (SizeInBits == 256) {
	    *Fast = !Subtarget.isUnalignedMem32Slow();
	  } else {
	    // 8-byte and under are always assumed to be fast.
	    // TODO: What about AVX-512 (512-bit) accesses?
	    *Fast = true;
	  }
	  return true;
	}

	/// Return the jump-table entry encoding for the current function, as a
	/// member of the MachineJumpTableInfo::JTEntryKind enum.
	unsigned X86TargetLowering::getJumpTableEncoding() const {
	  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
	  // symbol; everything else uses the normal encoding heuristics.
	  const bool UseGOTOFFEntries =
	      isPositionIndependent() && Subtarget.isPICStyleGOT();
	  if (!UseGOTOFFEntries)
	    return TargetLowering::getJumpTableEncoding();
	  return MachineJumpTableInfo::EK_Custom32;
	}

	/// Forward the soft-float setting of the current subtarget.
	bool X86TargetLowering::useSoftFloat() const {
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  return SoftFloat;
	}

	/// Mark the leading integer and pointer arguments of a libcall as 'inreg'.
	///
	/// Only applies to 32-bit targets with the C or stdcall calling convention.
	/// The register budget comes from the module's register-parameter count;
	/// arguments of up to 4 bytes consume one register and arguments of up to 8
	/// bytes consume two. Marking stops at the first qualifying argument that no
	/// longer fits in the remaining budget.
	void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                              ArgListTy &Args) const {
	  // Only relabel X86-32 for C / Stdcall CCs.
	  if (Subtarget.is64Bit())
	    return;
	  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
	    return;

	  unsigned ParamRegs = 0;
	  if (auto *M = MF->getFunction().getParent())
	    ParamRegs = M->getNumberRegisterParameters();

	  const DataLayout &DL = MF->getDataLayout();

	  // Mark the first N int arguments as having reg
	  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
	    Type *T = Args[Idx].Ty;
	    if (!T->isPointerTy() && !T->isIntegerTy())
	      continue;

	    // Query the allocation size once; anything wider than 8 bytes is skipped.
	    const uint64_t AllocSize = DL.getTypeAllocSize(T);
	    if (AllocSize > 8)
	      continue;

	    const unsigned NumRegs = AllocSize > 4 ? 2 : 1;
	    if (ParamRegs < NumRegs)
	      return;
	    ParamRegs -= NumRegs;
	    Args[Idx].IsInReg = true;
	  }
	}

	/// Build the expression for a custom jump-table entry: a @GOTOFF reference
	/// to the symbol of \p MBB. Only valid for position-independent code using
	/// the GOT PIC style.
	const MCExpr *X86TargetLowering::LowerCustomJumpTableEntry(
	    const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
	    unsigned uid, MCContext &Ctx) const {
	  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
	  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
	  // entries.
	  auto *BlockSym = MBB->getSymbol();
	  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
	}

	/// Returns the relocation base for the given PIC jumptable: the table itself
	/// on 64-bit targets, otherwise a GlobalBaseReg node of pointer type.
	SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  if (Subtarget.is64Bit())
	    return Table;

	  // This doesn't have SDLoc associated with it, but is not really the
	  // same as a Register.
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	}

	/// MCExpr counterpart of getPICJumpTableRelocBase: returns the relocation
	/// base for the given PIC jumptable as an expression.
	const MCExpr *X86TargetLowering::
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
	                             MCContext &Ctx) const {
	  // Without RIP-relative addressing the reference is relative to the PIC
	  // base.
	  if (!Subtarget.isPICStyleRIPRel())
	    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

	  // X86-64 uses RIP relative addressing based on the jump table label.
	  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
	}

	/// Return the representative register class and its cost for \p VT.
	///
	/// Scalar integers map to GR64 (64-bit targets) or GR32, x86mmx maps to
	/// VR64, and the listed scalar FP and 128/256/512-bit vector types map to
	/// VR128X, all with cost 1. Any other type defers to the generic
	/// implementation.
	std::pair<const TargetRegisterClass *, uint8_t>
	X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	                                           MVT VT) const {
	  const uint8_t ClassCost = 1;
	  const TargetRegisterClass *RepClass = nullptr;

	  switch (VT.SimpleTy) {
	  // Scalar integers.
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	  case MVT::i64:
	    if (Subtarget.is64Bit())
	      RepClass = &X86::GR64RegClass;
	    else
	      RepClass = &X86::GR32RegClass;
	    break;

	  case MVT::x86mmx:
	    RepClass = &X86::VR64RegClass;
	    break;

	  // Scalar floating point.
	  case MVT::f32:
	  case MVT::f64:
	  // 128-bit vectors.
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	  case MVT::v2i64:
	  case MVT::v4f32:
	  case MVT::v2f64:
	  // 256-bit vectors.
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	  case MVT::v8f32:
	  case MVT::v4f64:
	  // 512-bit vectors.
	  case MVT::v64i8:
	  case MVT::v32i16:
	  case MVT::v16i32:
	  case MVT::v8i64:
	  case MVT::v16f32:
	  case MVT::v8f64:
	    RepClass = &X86::VR128XRegClass;
	    break;

	  default:
	    return TargetLowering::findRepresentativeClass(TRI, VT);
	  }

	  return std::make_pair(RepClass, ClassCost);
	}

	/// Return the address space used for the segment-relative slots built by
	/// SegmentOffset: 257 on 64-bit targets outside the Kernel code model, 256
	/// otherwise.
	unsigned X86TargetLowering::getAddressSpace() const {
	  if (!Subtarget.is64Bit())
	    return 256;

	  const bool IsKernelModel =
	      getTargetMachine().getCodeModel() == CodeModel::Kernel;
	  return IsKernelModel ? 256 : 257;
	}

	/// Return true if the target OS reserves a TLS slot for the stack guard:
	/// glibc, Fuchsia, and Android from API level 17 on.
	static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
	  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
	         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
	}

	/// Build a constant pointer for the slot at \p Offset in \p AddressSpace:
	/// the 32-bit integer \p Offset cast to a pointer-to-i8* in that address
	/// space.
	static Constant *SegmentOffset(IRBuilder<> &IRB, unsigned Offset,
	                               unsigned AddressSpace) {
	  LLVMContext &Ctx = IRB.getContext();
	  Constant *OffsetValue = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
	  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
	  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
	}

	/// Return the IR value that addresses the stack guard. Targets with a
	/// dedicated TLS slot get a segment-relative constant; all others fall back
	/// to the generic implementation.
	Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
	  // tcbhead_t; use it instead of the usual global variable (see
	  // sysdeps/{i386,x86_64}/nptl/tls.h)
	  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return TargetLowering::getIRStackGuard(IRB);

	  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x10, getAddressSpace());

	  // %fs:0x28, unless we're using a Kernel code model, in which case
	  // it's %gs:0x28. gs:0x14 on i386.
	  const unsigned GuardOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
	  return SegmentOffset(IRB, GuardOffset, getAddressSpace());
	}

	/// Insert the declarations the stack protector needs into \p M.
	///
	/// For the MSVC CRT this declares the __security_cookie global and the
	/// __security_check_cookie function. Targets with a TLS stack-guard slot
	/// need nothing. Everything else uses the generic declarations.
	void X86TargetLowering::insertSSPDeclarations(Module &M) const {
	  const Triple &TT = Subtarget.getTargetTriple();

	  // MSVC CRT provides functionalities for stack protection.
	  if (TT.isOSMSVCRT()) {
	    LLVMContext &Ctx = M.getContext();

	    // MSVC CRT has a global variable holding security cookie.
	    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

	    // MSVC CRT has a function to validate security cookie. It uses the
	    // fastcall convention and takes its argument in a register.
	    auto *CheckCookieFn = cast<Function>(
	        M.getOrInsertFunction("__security_check_cookie",
	                              Type::getVoidTy(Ctx),
	                              Type::getInt8PtrTy(Ctx)));
	    CheckCookieFn->setCallingConv(CallingConv::X86_FastCall);
	    CheckCookieFn->addAttribute(1, Attribute::AttrKind::InReg);
	    return;
	  }

	  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
	  if (!hasStackGuardSlotTLS(TT))
	    TargetLowering::insertSSPDeclarations(M);
	}

	/// Return the stack-guard value used by SelectionDAG: the __security_cookie
	/// global for the MSVC CRT, otherwise whatever the generic implementation
	/// returns.
	Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return TargetLowering::getSDagStackGuard(M);

	  // MSVC CRT has a global variable holding security cookie.
	  return M.getGlobalVariable("__security_cookie");
	}

	/// Return the function that validates the stack guard: the
	/// __security_check_cookie function for the MSVC CRT, otherwise whatever the
	/// generic implementation returns.
	Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return TargetLowering::getSSPStackGuardCheck(M);

	  // MSVC CRT has a function to validate security cookie.
	  return M.getFunction("__security_check_cookie");
	}

	/// Return the location of the SafeStack pointer. Contiki uses the default
	/// location, Android and Fuchsia use fixed segment-relative slots, and every
	/// other target defers to the generic implementation.
	Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  if (Subtarget.getTargetTriple().isOSContiki())
	    return getDefaultSafeStackPointerLocation(IRB, false);

	  // Android provides a fixed TLS slot for the SafeStack pointer. See the
	  // definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  if (Subtarget.isTargetAndroid()) {
	    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
	    // %gs:0x24 on i386
	    const unsigned SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
	    return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	  }

	  // Fuchsia is similar; <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with
	  // this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x18, getAddressSpace());

	  return TargetLowering::getSafeStackPointerLocation(IRB);
	}

	/// Return true if a cast between the two (distinct) address spaces is a
	/// no-op, which is the case when both are numbered below 256.
	bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  assert(SrcAS != DestAS && "Expected different address spaces!");

	  if (SrcAS >= 256)
	    return false;
	  return DestAS < 256;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "X86GenCallingConv.inc"

	/// Return true if the return values described by \p Outs can be assigned
	/// locations by the RetCC_X86 return convention.
	bool X86TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
	  const bool Lowerable = ReturnState.CheckReturn(Outs, RetCC_X86);
	  return Lowerable;
	}

	/// Return the zero-terminated list of scratch registers; on X86 this is
	/// just R11, regardless of calling convention.
	const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  static const MCPhysReg ScratchList[] = {
	    X86::R11,
	    0 // terminator
	  };
	  return &ScratchList[0];
	}

	/// Lowers mask values (v*i1) to the local register values.
	///
	/// \param ValArg The mask value to convert.
	/// \param ValLoc The register type the mask must be converted to.
	/// \returns DAG node after lowering to register type
	static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
	                               const SDLoc &Dl, SelectionDAG &DAG) {
	  EVT ValVT = ValArg.getValueType();

	  // A single-element mask is read out as element 0.
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
	                       DAG.getIntPtrConstant(0, Dl));

	  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
	      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
	    // Two stage lowering might be required
	    // bitcast:   v8i1 -> i8 / v16i1 -> i16
	    // anyextend: i8   -> i32 / i16   -> i32
	    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
	    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
	    if (ValLoc == MVT::i32)
	      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
	    return ValToCopy;
	  }

	  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
	      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
	    // One stage lowering is required
	    // bitcast: v32i1 -> i32 / v64i1 -> i64
	    return DAG.getBitcast(ValLoc, ValArg);
	  }

	  // Any other combination is sign-extended to the location type.
	  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
	}

	/// Breaks v64i1 value into two registers and adds the new node to the DAG
	static void Passv64i1ArgInRegs(
	const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
	SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
	CCValAssign &NextVA, const X86Subtarget &Subtarget) {
	assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
	assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	"The value should reside in two registers");

	// Before splitting the value we cast it to i64
	Arg = DAG.getBitcast(MVT::i64, Arg);

	// Splitting the value into two i32 types
	SDValue Lo, Hi;
	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(0, Dl, MVT::i32));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(1, Dl, MVT::i32));

	// Attach the two i32 types into corresponding registers
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
	}

	/// Lower a function return into an X86ISD::RET_FLAG node (X86ISD::IRET for
	/// the X86_INTR calling convention).
	///
	/// Each outgoing value is promoted to its assigned location type and copied
	/// into its return register; values assigned to FP0/FP1 are instead passed
	/// directly as operands of the return node. If the function has an sret
	/// return register, its value is additionally copied into RAX/EAX.
	///
	/// \param Chain   Incoming chain.
	/// \param Outs    Description of the values being returned.
	/// \param OutVals The values being returned, parallel to \p Outs.
	/// \returns the return node, whose operand 0 is the updated chain.
	SDValue
	X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                               bool isVarArg,
	                               const SmallVectorImpl<ISD::OutputArg> &Outs,
	                               const SmallVectorImpl<SDValue> &OutVals,
	                               const SDLoc &dl, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  // In some cases we need to disable registers from the default CSR list.
	  // For example, when they are used for argument passing.
	  bool ShouldDisableCalleeSavedRegister =
	      CallConv == CallingConv::X86_RegCall ||
	      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

	  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
	    report_fatal_error("X86 interrupts may not return any value");

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	  SDValue Flag;
	  SmallVector<SDValue, 6> RetOps;
	  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	  // Operand #1 = Bytes To Pop
	  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
	                                         MVT::i32));

	  // Copy the result values into the output registers.
	  // I walks the assigned locations; OutsIndex walks the returned values. They
	  // diverge when one value occupies two locations (the custom v64i1 case).
	  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++OutsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    // Add the register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

	    SDValue ValToCopy = OutVals[OutsIndex];
	    EVT ValVT = ValToCopy.getValueType();

	    // Promote values to the appropriate types.
	    if (VA.getLocInfo() == CCValAssign::SExt)
	      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::ZExt)
	      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::AExt) {
	      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
	        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
	      else
	        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    }
	    else if (VA.getLocInfo() == CCValAssign::BCvt)
	      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

	    assert(VA.getLocInfo() != CCValAssign::FPExt &&
	           "Unexpected FP-extend for return value.");

	    // If this is x86-64, and we disabled SSE, we can't return FP values,
	    // or SSE or MMX vectors.
	    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
	         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
	        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    } else if (ValVT == MVT::f64 &&
	               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
	      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
	      // llvm-gcc has never done it right and no one has noticed, so this
	      // should be OK for now.
	      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
	    // the RET instruction and handled by the FP Stackifier.
	    if (VA.getLocReg() == X86::FP0 ||
	        VA.getLocReg() == X86::FP1) {
	      // If this is a copy from an xmm register to ST(0), use an FPExtend to
	      // change the value to the FP stack register class.
	      if (isScalarFPTypeInSSEReg(VA.getValVT()))
	        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
	      RetOps.push_back(ValToCopy);
	      // Don't emit a copytoreg.
	      continue;
	    }

	    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
	    // which is returned in RAX / RDX.
	    if (Subtarget.is64Bit()) {
	      if (ValVT == MVT::x86mmx) {
	        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
	          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
	          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                                  ValToCopy);
	          // If we don't have SSE2 available, convert to v4f32 so the generated
	          // register is legal.
	          if (!Subtarget.hasSSE2())
	            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
	        }
	      }
	    }

	    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");

	      // This consumes the next location as well, hence the ++I.
	      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
	                         Subtarget);

	      assert(2 == RegsToPass.size() &&
	             "Expecting two registers after Passv64i1ArgInRegs");

	      // Add the second register to the CalleeSaveDisableRegs list.
	      if (ShouldDisableCalleeSavedRegister)
	        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
	    } else {
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	    }

	    // Add nodes to the DAG and add the values into the RetOps list
	    for (auto &Reg : RegsToPass) {
	      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
	      Flag = Chain.getValue(1);
	      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
	    }
	  }

	  // Swift calling convention does not require we copy the sret argument
	  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	  // All x86 ABIs require that for returning structs by value we copy
	  // the sret argument into %rax/%eax (depending on ABI) for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into %rax/%eax.
	  //
	  // Checking Function.hasStructRetAttr() here is insufficient because the IR
	  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
	  // false, then an sret argument may be implicitly inserted in the SelDAG. In
	  // either case FuncInfo->setSRetReturnReg() will have been called.
	  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	    // When we have both sret and another return value, we should use the
	    // original Chain stored in RetOps[0], instead of the current Chain updated
	    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

	    // For the case of sret and another return value, we have
	    //   Chain_0 at the function entry
	    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
	    // If we use Chain_1 in getCopyFromReg, we will have
	    //   Val = getCopyFromReg(Chain_1)
	    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

	    // getCopyToReg(Chain_0) will be glued together with
	    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
	    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
	    //   Data dependency from Unit B to Unit A due to usage of Val in
	    //     getCopyToReg(Chain_1, Val)
	    //   Chain dependency from Unit A to Unit B

	    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
	    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
	                                     getPointerTy(MF.getDataLayout()));

	    unsigned RetValReg
	        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
	          X86::RAX : X86::EAX;
	    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
	    Flag = Chain.getValue(1);

	    // RAX/EAX now acts like a return value.
	    RetOps.push_back(
	        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

	    // Add the returned register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
	  }

	  // Callee-saved registers that are preserved via copies also become operands
	  // of the return node; only GR64 registers are expected here.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *I =
	      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	  if (I) {
	    for (; *I; ++I) {
	      if (X86::GR64RegClass.contains(*I))
	        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  RetOps[0] = Chain;  // Update chain.

	  // Add the flag if we have it.
	  if (Flag.getNode())
	    RetOps.push_back(Flag);

	  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
	  if (CallConv == CallingConv::X86_INTR)
	    opcode = X86ISD::IRET;
	  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
	}

	/// Return true if the single result of \p N is consumed only by return
	/// nodes, reached through exactly one CopyToReg (without glue) or FP_EXTEND.
	///
	/// On success \p Chain is set to the chain to use instead: the CopyToReg's
	/// input chain, or the incoming \p Chain unchanged for the FP_EXTEND case.
	/// On failure \p Chain is left untouched.
	bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // N has exactly one use (checked above); inspect that user.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != X86ISD::RET_FLAG)
	      return false;
	    // If we are returning more than one value, we can definitely
	    // not make a tail call see PR19530
	    if (UI->getNumOperands() > 4)
	      return false;
	    if (UI->getNumOperands() == 4 &&
	        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Return the type a return value of type \p VT should be extended to.
	///
	/// The minimum is the register type for i32, except that i1 (everywhere) and
	/// i8/i16 (everywhere but Darwin) only need the register type for i8. \p VT
	/// itself is returned when it is already at least that wide.
	EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                           ISD::NodeType ExtendKind) const {
	  MVT ReturnMVT = MVT::i32;

	  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
	  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
	    // The ABI does not require i1, i8 or i16 to be extended.
	    //
	    // On Darwin, there is code in the wild relying on Clang's old behaviour of
	    // always extending i8/i16 return values, so keep doing that for now.
	    // (PR26665).
	    ReturnMVT = MVT::i8;
	  }

	  EVT MinVT = getRegisterType(Context, ReturnMVT);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// Reads two 32-bit registers and combines them into one v64i1 mask value.
	/// \param VA The location holding the low 32 bits.
	/// \param NextVA The location holding the high 32 bits.
	/// \param Root The chain the register reads are attached to.
	/// \param [in,out] InFlag Optional glue. When non-null, the physical
	///                        registers are read directly, the reads are glued
	///                        to *InFlag, and *InFlag is updated to the glue of
	///                        the last read. When null, each register is first
	///                        added as a function live-in and read through the
	///                        resulting virtual register.
	/// \return the concatenated v64i1 value.
	static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
	                                SDValue &Root, SelectionDAG &DAG,
	                                const SDLoc &Dl, const X86Subtarget &Subtarget,
	                                SDValue *InFlag = nullptr) {
	  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
	  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	  assert(VA.getValVT() == MVT::v64i1 &&
	         "Expecting first location of 64 bit width type");
	  assert(NextVA.getValVT() == VA.getValVT() &&
	         "The locations should have the same type");
	  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	         "The values should reside in two registers");

	  // Read a 32 bit value from each register.
	  SDValue LowWord, HighWord;
	  if (InFlag) {
	    // When a physical register is available read the value from it and glue
	    // the reads together.
	    LowWord = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = LowWord.getValue(2);
	    HighWord =
	        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = HighWord.getValue(2);
	  } else {
	    // When no physical register is present,
	    // create an intermediate virtual register
	    MachineFunction &MF = DAG.getMachineFunction();
	    const TargetRegisterClass *RC = &X86::GR32RegClass;

	    unsigned LowVReg = MF.addLiveIn(VA.getLocReg(), RC);
	    LowWord = DAG.getCopyFromReg(Root, Dl, LowVReg, MVT::i32);
	    unsigned HighVReg = MF.addLiveIn(NextVA.getLocReg(), RC);
	    HighWord = DAG.getCopyFromReg(Root, Dl, HighVReg, MVT::i32);
	  }

	  // Convert each i32 into a v32i1 and concatenate the two halves.
	  SDValue LowMask = DAG.getBitcast(MVT::v32i1, LowWord);
	  SDValue HighMask = DAG.getBitcast(MVT::v32i1, HighWord);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LowMask, HighMask);
	}

	/// Converts a register value (8/16/32/64 bits) into the mask type \p ValVT
	/// (v1i1/v8i1/v16i1/v32i1/v64i1).
	/// \returns a DAG node holding the operand after conversion to the mask type.
	static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
	                               const EVT &ValLoc, const SDLoc &Dl,
	                               SelectionDAG &DAG) {
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

	  if (ValVT == MVT::v64i1) {
	    // In 32 bit machine, this case is handled by getv64i1Argument
	    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
	    // In 64 bit machine, There is no need to truncate the value only bitcast
	    return DAG.getBitcast(ValVT, ValArg);
	  }

	  // Pick the integer type with as many bits as the mask has elements.
	  MVT TruncVT;
	  switch (ValVT.getSimpleVT().SimpleTy) {
	  case MVT::v8i1:
	    TruncVT = MVT::i8;
	    break;
	  case MVT::v16i1:
	    TruncVT = MVT::i16;
	    break;
	  case MVT::v32i1:
	    TruncVT = MVT::i32;
	    break;
	  default:
	    llvm_unreachable("Expecting a vector of i1 types");
	  }

	  // Truncate to that width, then reinterpret the bits as the mask.
	  SDValue Truncated = DAG.getNode(ISD::TRUNCATE, Dl, TruncVT, ValArg);
	  return DAG.getBitcast(ValVT, Truncated);
	}

	/// Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	///
	/// \param Chain   Chain to attach the register reads to.
	/// \param InFlag  Glue from the call; threaded through every read.
	/// \param Ins     Description of the values the call returns.
	/// \param InVals  Receives one lowered value per returned value.
	/// \param RegMask Optional register mask; when non-null, the bits of every
	///                result register and its sub-registers are cleared.
	/// \returns the chain after the last register read.
	SDValue X86TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    uint32_t *RegMask) const {

	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  bool Is64Bit = Subtarget.is64Bit();
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

	  // Copy all of the result registers out of their specified physreg.
	  // I walks the assigned locations; InsIndex walks the returned values. They
	  // diverge when one value occupies two locations (the custom v64i1 case).
	  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    EVT CopyVT = VA.getLocVT();

	    // In some calling conventions we need to remove the used registers
	    // from the register mask.
	    if (RegMask) {
	      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
	           SubRegs.isValid(); ++SubRegs)
	        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
	    }

	    // If this is x86-64, and we disabled SSE, we can't return FP values
	    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
	        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // If we prefer to use the value in xmm registers, copy it out as f80 and
	    // use a truncate to move it from fp stack reg to xmm reg.
	    bool RoundAfterCopy = false;
	    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
	        isScalarFPTypeInSSEReg(VA.getValVT())) {
	      if (!Subtarget.hasX87())
	        report_fatal_error("X87 register return with X87 disabled");
	      CopyVT = MVT::f80;
	      RoundAfterCopy = (CopyVT != VA.getLocVT());
	    }

	    SDValue Val;
	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");
	      // This consumes the next location as well, hence the ++I.
	      Val =
	          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
	    } else {
	      // Result 0 of the copy is the value, 1 the chain and 2 the glue.
	      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
	                  .getValue(1);
	      Val = Chain.getValue(0);
	      InFlag = Chain.getValue(2);
	    }

	    if (RoundAfterCopy)
	      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
	                        // This truncation won't change the value.
	                        DAG.getIntPtrConstant(1, dl));

	    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
	      if (VA.getValVT().isVector() &&
	          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
	      } else
	        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	    }

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// C & StdCall & Fast Calling Convention implementation
	//===----------------------------------------------------------------------===//
	// The StdCall calling convention is the standard for many Windows API
	// routines. It differs from the C calling convention only slightly: the
	// callee cleans up the stack, not the caller. Symbols are also decorated
	// in a convention-specific way. It doesn't support any vector arguments.
	// For info on the fast calling convention see the Fast Calling Convention
	// (tail call) implementation, LowerX86_32FastCCCallTo.

	/// How a struct-return (sret) pointer is passed, if there is one.
	enum StructReturnType {
	  NotStructReturn,  ///< The first argument is not an sret pointer.
	  RegStructReturn,  ///< The sret pointer is passed in a register.
	  StackStructReturn ///< The sret pointer is passed on the stack.
	};

	/// Determines whether a call uses struct return semantics by inspecting the
	/// flags of its first outgoing argument. An sret argument counts as
	/// register-passed when it is marked inreg or when \p IsMCU is set.
	static StructReturnType
	callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
	  if (Outs.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Determines whether a function uses struct return semantics by inspecting
	/// the flags of its first incoming argument. An sret argument counts as
	/// register-passed when it is marked inreg or when \p IsMCU is set.
	static StructReturnType
	argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
	  if (Ins.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Make a copy of an aggregate at address specified by "Src" to address
	/// "Dst" with size and alignment information specified by the specific
	/// parameter attribute. The copy will be passed as a byval function parameter.
	///
	/// \returns the memcpy node created for the copy. The copy is non-volatile,
	/// always inlined and never emitted as a tail call.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

	  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
	                       /*isVolatile*/false, /*AlwaysInline=*/true,
	                       /*isTailCall*/false,
	                       MachinePointerInfo(), MachinePointerInfo());
	}

	/// Return true if the calling convention is one that we can guarantee TCO for.
	///
	/// \param CC calling convention to classify.
	/// \returns true for Fast, GHC, X86_RegCall, HiPE and HHVM; false otherwise.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  switch (CC) {
	  case CallingConv::Fast:
	  case CallingConv::GHC:
	  case CallingConv::X86_RegCall:
	  case CallingConv::HiPE:
	  case CallingConv::HHVM:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if we might ever do TCO for calls with this calling convention.
	///
	/// \param CC calling convention to classify.
	/// \returns true for the C-like conventions (C, Win64, X86_64_SysV), for the
	///          callee-pop conventions (X86_ThisCall, X86_StdCall,
	///          X86_VectorCall, X86_FastCall), and for every convention accepted
	///          by canGuaranteeTCO(); false otherwise.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  // C calling conventions.
	  const bool IsCLikeCC = CC == CallingConv::C || CC == CallingConv::Win64 ||
	                         CC == CallingConv::X86_64_SysV;
	  // Callee pop conventions.
	  const bool IsCalleePopCC =
	      CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_StdCall ||
	      CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall;

	  if (IsCLikeCC || IsCalleePopCC)
	    return true;

	  // Everything else qualifies only if TCO can be guaranteed for it.
	  return canGuaranteeTCO(CC);
	}

	/// Return true if the function is being made into a tailcall target by
	/// changing its ABI.
	///
	/// \param CC                    calling convention of the function.
	/// \param GuaranteedTailCallOpt whether guaranteed tail call optimization is
	///                              enabled in the target options.
	/// \returns true only when the option is enabled and canGuaranteeTCO(CC)
	///          holds.
	static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
	  // Without the option the convention is irrelevant.
	  if (!GuaranteedTailCallOpt)
	    return false;
	  return canGuaranteeTCO(CC);
	}

	bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	auto Attr =
	CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
	if (!CI->isTailCall() \|\| Attr.getValueAsString() == "true")
	return false;

	ImmutableCallSite CS(CI);
	CallingConv::ID CalleeCC = CS.getCallingConv();
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	return true;
	}

	/// Lowers one incoming argument that was assigned a memory location.
	///
	/// \param Chain    chain that the generated load is attached to.
	/// \param CallConv calling convention of the function being lowered.
	/// \param Ins      all incoming arguments; Ins[i] is the one being lowered
	///                 and Ins.size() feeds the X86_INTR offset computation.
	/// \param dl       debug location for the created nodes.
	/// \param DAG      selection DAG in which nodes are created.
	/// \param VA       location assignment of the argument.
	/// \param MFI      frame info in which the fixed stack object is created.
	/// \param i        index of the argument within \p Ins.
	/// \returns for byval arguments, the frame index of the argument's stack
	///          object; otherwise a load of the argument from its stack slot. For
	///          i1-based values that were extended in memory the load is wrapped
	///          in SCALAR_TO_VECTOR (vector value types) or TRUNCATE.
	SDValue
	X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	                                    const SmallVectorImpl<ISD::InputArg> &Ins,
	                                    const SDLoc &dl, SelectionDAG &DAG,
	                                    const CCValAssign &VA,
	                                    MachineFrameInfo &MFI, unsigned i) const {
	  // Create the nodes corresponding to a load from this parameter slot.
	  ISD::ArgFlagsTy Flags = Ins[i].Flags;
	  bool AlwaysUseMutable = shouldGuaranteeTCO(
	      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
	  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
	  EVT ValVT;
	  MVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const bool IsInterruptCC = CallConv == CallingConv::X86_INTR;

	  // If value is passed by pointer we have address passed instead of the value
	  // itself. No need to extend if the mask value and location share the same
	  // absolute size.
	  bool ExtendedInMem =
	      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
	      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

	  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
	    ValVT = VA.getLocVT();
	  else
	    ValVT = VA.getValVT();

	  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
	  // taken by a return address.
	  int Offset = 0;
	  if (IsInterruptCC) {
	    // X86 interrupts may take one or two arguments.
	    // On the stack there will be no return address as in regular call.
	    // Offset of last argument need to be set to -4/-8 bytes.
	    // Where offset of the first argument out of two, should be set to 0 bytes.
	    //
	    // The computation is done in signed arithmetic so that the negative
	    // offset of the last argument does not depend on unsigned wrap-around
	    // followed by a narrowing conversion.
	    const int SlotBytes = Subtarget.is64Bit() ? 8 : 4;
	    const int NextArgModCount = static_cast<int>((i + 1) % Ins.size());
	    Offset = SlotBytes * (NextArgModCount - 1);
	    if (Subtarget.is64Bit() && Ins.size() == 2) {
	      // The stack pointer needs to be realigned for 64 bit handlers with error
	      // code, so the argument offset changes by 8 bytes.
	      Offset += 8;
	    }
	  }

	  // FIXME: For now, all byval parameter objects are marked mutable. This can be
	  // changed with more analysis.
	  // In case of tail call optimization mark all arguments mutable. Since they
	  // could be overwritten by lowering of arguments in case of a tail call.
	  if (Flags.isByVal()) {
	    unsigned Bytes = Flags.getByValSize();
	    if (Bytes == 0)
	      Bytes = 1; // Don't create zero-sized stack objects.
	    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
	    // Adjust SP offset of interrupt parameter.
	    if (IsInterruptCC)
	      MFI.setObjectOffset(FI, Offset);
	    return DAG.getFrameIndex(FI, PtrVT);
	  }

	  // This is an argument in memory. We might be able to perform copy elision.
	  if (Flags.isCopyElisionCandidate()) {
	    EVT ArgVT = Ins[i].ArgVT;
	    if (Ins[i].PartOffset == 0) {
	      // If this is a one-part value or the first part of a multi-part value,
	      // create a stack object for the entire argument value type and return a
	      // load from our portion of it. This assumes that if the first part of an
	      // argument is in memory, the rest will also be in memory.
	      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
	                                     /*Immutable=*/false);
	      SDValue PartAddr = DAG.getFrameIndex(FI, PtrVT);
	      return DAG.getLoad(
	          ValVT, dl, Chain, PartAddr,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	    }

	    // This is not the first piece of an argument in memory. See if there is
	    // already a fixed stack object including this offset. If so, assume it
	    // was created by the PartOffset == 0 branch above and create a load from
	    // the appropriate offset into it.
	    int64_t PartBegin = VA.getLocMemOffset();
	    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
	    int FI = MFI.getObjectIndexBegin();
	    for (; MFI.isFixedObjectIndex(FI); ++FI) {
	      int64_t ObjBegin = MFI.getObjectOffset(FI);
	      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
	      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
	        break;
	    }
	    if (MFI.isFixedObjectIndex(FI)) {
	      SDValue Addr =
	          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
	                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
	      return DAG.getLoad(
	          ValVT, dl, Chain, Addr,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
	                                            Ins[i].PartOffset));
	    }
	    // No enclosing fixed object was found: fall through to the generic path.
	  }

	  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	                                 VA.getLocMemOffset(), isImmutable);

	  // Set SExt or ZExt flag.
	  if (VA.getLocInfo() == CCValAssign::ZExt) {
	    MFI.setObjectZExt(FI, true);
	  } else if (VA.getLocInfo() == CCValAssign::SExt) {
	    MFI.setObjectSExt(FI, true);
	  }

	  // Adjust SP offset of interrupt parameter.
	  if (IsInterruptCC)
	    MFI.setObjectOffset(FI, Offset);

	  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	  SDValue Val = DAG.getLoad(
	      ValVT, dl, Chain, FIN,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	  if (!ExtendedInMem)
	    return Val;

	  // The value was loaded in its (wider) location type; convert it back to
	  // the i1-based value type.
	  if (VA.getValVT().isVector())
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val);
	  return DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	}

	/// Returns the general purpose registers used to pass integer arguments for
	/// \p CallConv on a 64-bit subtarget: RCX, RDX, R8, R9 when
	/// Subtarget.isCallingConvWin64(CallConv) holds, and RDI, RSI, RDX, RCX, R8,
	/// R9 otherwise. Asserts that the subtarget is 64-bit.
	// FIXME: Get this from tablegen.
	static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  static const MCPhysReg GPR64ArgRegsWin64[] = {
	    X86::RCX, X86::RDX, X86::R8, X86::R9
	  };
	  static const MCPhysReg GPR64ArgRegs64Bit[] = {
	    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
	  };

	  const bool UseWin64Regs = Subtarget.isCallingConvWin64(CallConv);
	  if (!UseWin64Regs)
	    return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
	                        std::end(GPR64ArgRegs64Bit));

	  return makeArrayRef(std::begin(GPR64ArgRegsWin64),
	                      std::end(GPR64ArgRegsWin64));
	}

	/// Returns the XMM registers that may carry arguments for \p CallConv on a
	/// 64-bit subtarget.
	///
	/// \returns None for Win64 calling conventions, and also when soft-float is
	///          in use, the function has the NoImplicitFloat attribute, or SSE1
	///          is unavailable; XMM0..XMM7 otherwise. Asserts that the subtarget
	///          is 64-bit.
	// FIXME: Get this from tablegen.
	static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
	                                                CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());
	  if (Subtarget.isCallingConvWin64(CallConv)) {
	    // The XMM registers which might contain var arg parameters are shadowed
	    // in their paired GPR. So we only need to save the GPR to their home
	    // slots.
	    // TODO: __vectorcall will change this.
	    return None;
	  }

	  const Function &F = MF.getFunction();
	  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool isSoftFloat = Subtarget.useSoftFloat();
	  assert(!(isSoftFloat && NoImplicitFloatOps) &&
	         "SSE register cannot be used when SSE is disabled!");
	  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) {
	    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
	    // registers.
	    return None;
	  }

	  static const MCPhysReg XMMArgRegs64Bit[] = {
	    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	  };
	  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
	}

	#ifndef NDEBUG
	/// Returns true if the value numbers of \p ArgLocs are in non-decreasing
	/// order. Only compiled when NDEBUG is not defined.
	static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
	  // A sequence is sorted exactly when no element compares less than its
	  // predecessor.
	  for (size_t Idx = 1, End = ArgLocs.size(); Idx < End; ++Idx)
	    if (ArgLocs[Idx].getValNo() < ArgLocs[Idx - 1].getValNo())
	      return false;
	  return true;
	}
	#endif

	/// Lowers the incoming (formal) arguments of the current function.
	///
	/// Locations are assigned to every entry of \p Ins with CC_X86; one SDValue
	/// per argument is appended to \p InVals (a register copy for register
	/// locations, the result of LowerMemArgument for memory locations). The
	/// function additionally records the sret return register, sets up the
	/// vararg frame index and register save area, records musttail forwarded
	/// registers, and stores the callee-pop byte count and the argument stack
	/// size in the X86MachineFunctionInfo.
	///
	/// \param Chain    incoming chain.
	/// \param CallConv calling convention of the function.
	/// \param isVarArg whether the function takes a variable number of arguments.
	/// \param Ins      description of the incoming arguments.
	/// \param dl       debug location for the created nodes.
	/// \param DAG      selection DAG in which nodes are created.
	/// \param InVals   output; receives one value per lowered argument.
	/// \returns the updated chain.
	SDValue X86TargetLowering::LowerFormalArguments(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	MachineFunction &MF = DAG.getMachineFunction();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

	// An externally visible "main" on Cygwin/MinGW targets is forced to use a
	// frame pointer.
	const Function &F = MF.getFunction();
	if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
	F.getName() == "main")
	FuncInfo->setForceFramePointer(true);

	MachineFrameInfo &MFI = MF.getFrameInfo();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

	assert(
	!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

	// Interrupt handlers take exactly one argument, or two when the second one
	// is a pointer-sized integer (i64 on 64-bit, i32 otherwise).
	if (CallConv == CallingConv::X86_INTR) {
	bool isLegal = Ins.size() == 1 \|\|
	(Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) \|\|
	(!Is64Bit && Ins[1].VT == MVT::i32)));
	if (!isLegal)
	report_fatal_error("X86 interrupts may take one or two arguments");
	}

	// Assign locations to all of the incoming arguments.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Ins, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
	}

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// I walks the location list and InsIndex walks Ins. They diverge when a
	// custom (split v64i1) argument consumes two locations for a single input.
	SDValue ArgValue;
	for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++InsIndex) {
	assert(InsIndex < Ins.size() && "Invalid Ins index");
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	EVT RegVT = VA.getLocVT();
	if (VA.needsCustom()) {
	assert(
	VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");

	// v64i1 values, in regcall calling convention, that are
	// compiled to 32 bit arch, are split up into two registers.
	ArgValue =
	getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
	} else {
	// Pick the register class that matches the location type.
	const TargetRegisterClass *RC;
	if (RegVT == MVT::i32)
	RC = &X86::GR32RegClass;
	else if (Is64Bit && RegVT == MVT::i64)
	RC = &X86::GR64RegClass;
	else if (RegVT == MVT::f32)
	RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	else if (RegVT == MVT::f64)
	RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	else if (RegVT == MVT::f80)
	RC = &X86::RFP80RegClass;
	else if (RegVT == MVT::f128)
	RC = &X86::FR128RegClass;
	else if (RegVT.is512BitVector())
	RC = &X86::VR512RegClass;
	else if (RegVT.is256BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
	else if (RegVT.is128BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
	else if (RegVT == MVT::x86mmx)
	RC = &X86::VR64RegClass;
	else if (RegVT == MVT::v1i1)
	RC = &X86::VK1RegClass;
	else if (RegVT == MVT::v8i1)
	RC = &X86::VK8RegClass;
	else if (RegVT == MVT::v16i1)
	RC = &X86::VK16RegClass;
	else if (RegVT == MVT::v32i1)
	RC = &X86::VK32RegClass;
	else if (RegVT == MVT::v64i1)
	RC = &X86::VK64RegClass;
	else
	llvm_unreachable("Unknown argument type!");

	unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	}

	// If this is an 8 or 16-bit value, it is really passed promoted to 32
	// bits. Insert an assert[sz]ext to capture this, then truncate to the
	// right size.
	if (VA.getLocInfo() == CCValAssign::SExt)
	ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::ZExt)
	ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::BCvt)
	ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

	if (VA.isExtInLoc()) {
	// Handle MMX values passed in XMM regs.
	if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
	ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
	else if (VA.getValVT().isVector() &&
	VA.getValVT().getScalarType() == MVT::i1 &&
	((VA.getLocVT() == MVT::i64) \|\| (VA.getLocVT() == MVT::i32) \|\|
	(VA.getLocVT() == MVT::i16) \|\| (VA.getLocVT() == MVT::i8))) {
	// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
	} else
	ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	}
	} else {
	assert(VA.isMemLoc());
	ArgValue =
	LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
	}

	// If value is passed via pointer - do a load.
	if (VA.getLocInfo() == CCValAssign::Indirect)
	ArgValue =
	DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

	InVals.push_back(ArgValue);
	}

	// Look for the first sret argument and, if found, stash it in a virtual
	// register; the loop stops after the first match.
	for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	// Swift calling convention does not require we copy the sret argument
	// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
	if (CallConv == CallingConv::Swift)
	continue;

	// All x86 ABIs require that for returning structs by value we copy the
	// sret argument into %rax/%eax (depending on ABI) for the return. Save
	// the argument into a virtual register so that we can access it from the
	// return points.
	if (Ins[I].Flags.isSRet()) {
	unsigned Reg = FuncInfo->getSRetReturnReg();
	if (!Reg) {
	MVT PtrTy = getPointerTy(DAG.getDataLayout());
	Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	FuncInfo->setSRetReturnReg(Reg);
	}
	SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	break;
	}
	}

	unsigned StackSize = CCInfo.getNextStackOffset();
	// Align stack specially for tail calls.
	if (shouldGuaranteeTCO(CallConv,
	MF.getTarget().Options.GuaranteedTailCallOpt))
	StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start. We
	// can skip this if there are no va_start calls.
	if (MFI.hasVAStart() &&
	(Is64Bit \|\| (CallConv != CallingConv::X86_FastCall &&
	CallConv != CallingConv::X86_ThisCall))) {
	FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
	}

	// Figure out if XMM registers are in use.
	assert(!(Subtarget.useSoftFloat() &&
	F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
	"SSE register cannot be used when SSE is disabled!");

	// 64-bit calling conventions support varargs and register parameters, so we
	// have to do extra work to spill them in the prologue.
	if (Is64Bit && isVarArg && MFI.hasVAStart()) {
	// Find the first unallocated argument registers.
	ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
	ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
	unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
	assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
	"SSE register cannot be used when SSE is disabled!");

	// Gather all the live in physical registers.
	SmallVector<SDValue, 6> LiveGPRs;
	SmallVector<SDValue, 8> LiveXMMRegs;
	SDValue ALVal;
	for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
	unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
	LiveGPRs.push_back(
	DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
	}
	if (!ArgXMMs.empty()) {
	// AL is made live-in and passed to the XMM save node below.
	unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
	for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
	unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
	LiveXMMRegs.push_back(
	DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
	}
	}

	if (IsWin64) {
	// Get to the caller-allocated home save location. Add 8 to account
	// for the return address.
	int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
	FuncInfo->setRegSaveFrameIndex(
	MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
	// Fixup to set vararg frame on shadow area (4 x i64).
	if (NumIntRegs < 4)
	FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
	} else {
	// For X86-64, if there are vararg parameters that are passed via
	// registers, then we must store them to their spots on the stack so
	// they may be loaded by dereferencing the result of va_next.
	FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
	FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
	FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
	ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
	}

	// Store the integer parameter registers.
	SmallVector<SDValue, 8> MemOps;
	SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
	getPointerTy(DAG.getDataLayout()));
	unsigned Offset = FuncInfo->getVarArgsGPOffset();
	for (SDValue Val : LiveGPRs) {
	SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	RSFIN, DAG.getIntPtrConstant(Offset, dl));
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(),
	FuncInfo->getRegSaveFrameIndex(), Offset));
	MemOps.push_back(Store);
	Offset += 8;
	}

	if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
	// Now store the XMM (fp + vector) parameter registers.
	// Operand order: chain, AL value, register save frame index, FP offset,
	// followed by the live XMM register values.
	SmallVector<SDValue, 12> SaveXMMOps;
	SaveXMMOps.push_back(Chain);
	SaveXMMOps.push_back(ALVal);
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getRegSaveFrameIndex(), dl));
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getVarArgsFPOffset(), dl));
	SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
	LiveXMMRegs.end());
	MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
	MVT::Other, SaveXMMOps));
	}

	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
	}

	if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
	// Find the largest legal vector type.
	MVT VecVT = MVT::Other;
	// FIXME: Only some x86_32 calling conventions support AVX512.
	if (Subtarget.hasAVX512() &&
	(Is64Bit \|\| (CallConv == CallingConv::X86_VectorCall \|\|
	CallConv == CallingConv::Intel_OCL_BI)))
	VecVT = MVT::v16f32;
	else if (Subtarget.hasAVX())
	VecVT = MVT::v8f32;
	else if (Subtarget.hasSSE2())
	VecVT = MVT::v4f32;

	// We forward some GPRs and some vector types.
	SmallVector<MVT, 2> RegParmTypes;
	MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
	RegParmTypes.push_back(IntVT);
	if (VecVT != MVT::Other)
	RegParmTypes.push_back(VecVT);

	// Compute the set of forwarded registers. The rest are scratch.
	SmallVectorImpl<ForwardedRegister> &Forwards =
	FuncInfo->getForwardedMustTailRegParms();
	CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

	// Conservatively forward AL on x86_64, since it might be used for varargs.
	if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
	unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
	}

	// Copy all forwards from physical to virtual registers.
	for (ForwardedRegister &F : Forwards) {
	// FIXME: Can we use a less constrained schedule?
	SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
	Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
	}
	}

	// Some CCs need callee pop.
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt)) {
	FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
	} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
	// X86 interrupts must pop the error code (and the alignment padding) if
	// present.
	FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
	} else {
	FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
	// If this is an sret function, the return should pop the hidden pointer.
	if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
	FuncInfo->setBytesToPopOnReturn(4);
	}

	if (!Is64Bit) {
	// RegSaveFrameIndex is X86-64 only.
	// NOTE(review): 0xAAAAAAA looks like a deliberately invalid placeholder
	// index - confirm.
	FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
	if (CallConv == CallingConv::X86_FastCall \|\|
	CallConv == CallingConv::X86_ThisCall)
	// fastcc functions can't have varargs.
	FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
	}

	FuncInfo->setArgumentStackSize(StackSize);

	if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
	EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
	if (Personality == EHPersonality::CoreCLR) {
	assert(Is64Bit);
	// TODO: Add a mechanism to frame lowering that will allow us to indicate
	// that we'd prefer this slot be allocated towards the bottom of the frame
	// (i.e. near the stack pointer after allocating the frame). Every
	// funclet needs a copy of this slot in its (mostly empty) frame, and the
	// offset from the bottom of this and each funclet's frame must be the
	// same, so the size of funclets' (mostly empty) frames is dictated by
	// how far this slot is from the bottom (since they allocate just enough
	// space to accommodate holding this slot at the correct offset).
	int PSPSymFI = MFI.CreateStackObject(8, 8, /isSS=/false);
	EHInfo->PSPSymFrameIdx = PSPSymFI;
	}
	}

	// For regcall functions and functions marked "no_caller_saved_registers",
	// every live-in register is removed from the callee-saved set.
	if (CallConv == CallingConv::X86_RegCall \|\|
	F.hasFnAttribute("no_caller_saved_registers")) {
	MachineRegisterInfo &MRI = MF.getRegInfo();
	for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
	MRI.disableCalleeSavedRegister(Pair.first);
	}

	return Chain;
	}

	/// Emits the node that places outgoing argument \p Arg into its stack
	/// location, at \p StackPtr plus the memory offset recorded in \p VA.
	///
	/// \returns for byval arguments, the copy produced by
	///          CreateCopyOfByValArgument(); otherwise a store of \p Arg to the
	///          computed address.
	SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags) const {
	  const unsigned StackOffset = VA.getLocMemOffset();

	  // Destination address = stack pointer + argument offset.
	  SDValue OffsetNode = DAG.getIntPtrConstant(StackOffset, dl);
	  SDValue DestAddr = DAG.getNode(
	      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

	  if (!Flags.isByVal())
	    return DAG.getStore(
	        Chain, dl, Arg, DestAddr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), StackOffset));

	  // byval arguments are copied as a whole aggregate instead of stored.
	  return CreateCopyOfByValArgument(Arg, DestAddr, Chain, Flags, DAG, dl);
	}

	/// Emit a load of return address if tail call
	/// optimization is performed and it is required.
	///
	/// \param OutRetAddr output; set to the load of the return address from its
	///                   frame index.
	/// \returns the chain result (value #1) of that load.
	///
	/// \p IsTailCall, \p Is64Bit and \p FPDiff are not read by this
	/// implementation.
	SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
	    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
	    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Locate the return address stack slot.
	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

	  // Load the "old" Return address.
	  OutRetAddr = DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());
	  return OutRetAddr.getValue(1);
	}

	/// Emit a store of the return address if tail call
	/// optimization is performed and it is required (FPDiff!=0).
	///
	/// \param RetAddrFrIdx value stored into the new slot.
	/// \param SlotSize     size in bytes of the new return address slot.
	/// \param FPDiff       when zero nothing is emitted; otherwise the new slot
	///                     is created at offset FPDiff - SlotSize.
	/// \returns \p Chain unchanged when \p FPDiff is zero, else the store node.
	static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
	                                        SDValue Chain, SDValue RetAddrFrIdx,
	                                        EVT PtrVT, unsigned SlotSize,
	                                        int FPDiff, const SDLoc &dl) {
	  // Nothing to move when the return address slot stays where it is.
	  if (FPDiff == 0)
	    return Chain;

	  // Calculate the new stack slot for the return address.
	  const int64_t NewSlotOffset = (int64_t)FPDiff - SlotSize;
	  int NewSlotFI =
	      MF.getFrameInfo().CreateFixedObject(SlotSize, NewSlotOffset, false);
	  SDValue NewSlotAddr = DAG.getFrameIndex(NewSlotFI, PtrVT);

	  // Store the return address to the appropriate stack slot.
	  return DAG.getStore(Chain, dl, RetAddrFrIdx, NewSlotAddr,
	                      MachinePointerInfo::getFixedStack(
	                          DAG.getMachineFunction(), NewSlotFI));
	}

	/// Returns a vector shuffle of \p V1 and \p V2 of type \p VT whose mask is
	/// <NumElems, 1, 2, ..., NumElems-1>, i.e. the mask of a movss / movsd / movd
	/// style operation of the specified width.
	static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
	                       SDValue V2) {
	  const unsigned LaneCount = VT.getVectorNumElements();

	  SmallVector<int, 8> ShuffleMask;
	  // Lane 0 uses index LaneCount (an index past V1's own lanes)...
	  ShuffleMask.push_back(LaneCount);
	  // ...and every remaining lane keeps its own index.
	  for (unsigned Lane = 1; Lane != LaneCount; ++Lane)
	    ShuffleMask.push_back(Lane);

	  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
	}

	SDValue
	X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	CallingConv::ID CallConv = CLI.CallConv;
	bool &isTailCall = CLI.IsTailCall;
	bool isVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
	StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
	bool IsSibcall = false;
	X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
	auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
	const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
	const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
	bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

	if (CallConv == CallingConv::X86_INTR)
	report_fatal_error("X86 interrupts may not be called directly");

	if (Attr.getValueAsString() == "true")
	isTailCall = false;

	if (Subtarget.isPICStyleGOT() &&
	!MF.getTarget().Options.GuaranteedTailCallOpt) {
	// If we are using a GOT, disable tail calls to external symbols with
	// default visibility. Tail calling such a symbol requires using a GOT
	// relocation, which forces early binding of the symbol. This breaks code
	// that require lazy function symbol resolution. Using musttail or
	// GuaranteedTailCallOpt will override this.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G \|\| (!G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility()))
	isTailCall = false;
	}

	bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
	if (IsMustTail) {
	// Force this to be a tail call. The verifier rules are enough to ensure
	// that we can lower this successfully without moving the return address
	// around.
	isTailCall = true;
	} else if (isTailCall) {
	// Check if it's really possible to do a tail call.
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
	isVarArg, SR != NotStructReturn,
	MF.getFunction().hasStructRetAttr(), CLI.RetTy,
	Outs, OutVals, Ins, DAG);

	// Sibcalls are automatically detected tailcalls which do not require
	// ABI changes.
	if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
	IsSibcall = true;

	if (isTailCall)
	++NumTailCalls;
	}

	assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling convention fastcc, ghc or hipe");

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Outs, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
	if (IsSibcall)
	// This is a sibcall. The memory operands are available in caller's
	// own caller's stack.
	NumBytes = 0;
	else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
	canGuaranteeTCO(CallConv))
	NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

	int FPDiff = 0;
	if (isTailCall && !IsSibcall && !IsMustTail) {
	// Lower arguments at fp - stackoffset + fpdiff.
	unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

	FPDiff = NumBytesCallerPushed - NumBytes;

	// Set the delta of movement of the returnaddr stackslot.
	// But only set if delta is greater than previous delta.
	if (FPDiff < X86Info->getTCReturnAddrDelta())
	X86Info->setTCReturnAddrDelta(FPDiff);
	}

	unsigned NumBytesToPush = NumBytes;
	unsigned NumBytesToPop = NumBytes;

	// If we have an inalloca argument, all stack space has already been allocated
	// for us and be right at the top of the stack. We don't support multiple
	// arguments passed in memory when using inalloca.
	if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
	NumBytesToPush = 0;
	if (!ArgLocs.back().isMemLoc())
	report_fatal_error("cannot use inalloca attribute on a register "
	"parameter");
	if (ArgLocs.back().getLocMemOffset() != 0)
	report_fatal_error("any parameter with the inalloca attribute must be "
	"the only memory argument");
	}

	if (!IsSibcall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
	NumBytes - NumBytesToPush, dl);

	SDValue RetAddrFrIdx;
	// Load return address for tail calls.
	if (isTailCall && FPDiff)
	Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
	Is64Bit, FPDiff, dl);

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// Walk the register/memloc assignments, inserting copies/loads. In the case
	// of tail call optimization arguments are handle later.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutIndex) {
	assert(OutIndex < Outs.size() && "Invalid Out index");
	// Skip inalloca arguments, they have already been written.
	ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
	if (Flags.isInAlloca())
	continue;

	CCValAssign &VA = ArgLocs[I];
	EVT RegVT = VA.getLocVT();
	SDValue Arg = OutVals[OutIndex];
	bool isByVal = Flags.isByVal();

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::AExt:
	if (Arg.getValueType().isVector() &&
	Arg.getValueType().getVectorElementType() == MVT::i1)
	Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
	else if (RegVT.is128BitVector()) {
	// Special case: passing MMX values in XMM registers.
	Arg = DAG.getBitcast(MVT::i64, Arg);
	Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
	Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
	} else
	Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(RegVT, Arg);
	break;
	case CCValAssign::Indirect: {
	// Store the argument.
	SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	Chain = DAG.getStore(
	Chain, dl, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	break;
	}
	}

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// Split v64i1 value into two registers
	Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
	Subtarget);
	} else if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	if (isVarArg && IsWin64) {
	// Win64 ABI requires argument XMM reg to be copied to the corresponding
	// shadow reg if callee is a varargs function.
	unsigned ShadowReg = 0;
	switch (VA.getLocReg()) {
	case X86::XMM0: ShadowReg = X86::RCX; break;
	case X86::XMM1: ShadowReg = X86::RDX; break;
	case X86::XMM2: ShadowReg = X86::R8; break;
	case X86::XMM3: ShadowReg = X86::R9; break;
	}
	if (ShadowReg)
	RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
	}
	} else if (!IsSibcall && (!isTailCall \|\| isByVal)) {
	assert(VA.isMemLoc());
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	dl, DAG, VA, Flags));
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	if (Subtarget.isPICStyleGOT()) {
	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (!isTailCall) {
	RegsToPass.push_back(std::make_pair(
	unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	getPointerTy(DAG.getDataLayout()))));
	} else {
	// If we are tail calling and generating PIC/GOT style code load the
	// address of the callee into ECX. The value in ecx is used as target of
	// the tail jump. This is done to circumvent the ebx/callee-saved problem
	// for tail calls on PIC/GOT architectures. Normally we would just put the
	// address of GOT into ebx and then call target@PLT. But for tail calls
	// ebx would be restored (since ebx is callee saved) before jumping to the
	// target@PLT.

	// Note: The actual moving to ECX is done further down.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (G && !G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility())
	Callee = LowerGlobalAddress(Callee, DAG);
	else if (isa<ExternalSymbolSDNode>(Callee))
	Callee = LowerExternalSymbol(Callee, DAG);
	}
	}

	if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an ubound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget.hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");

	RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
	DAG.getConstant(NumXMMRegs, dl,
	MVT::i8)));
	}

	if (isVarArg && IsMustTail) {
	const auto &Forwards = X86Info->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
	}
	}

	// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
	// don't need this because the eligibility check rejects calls that require
	// shuffling arguments passed in memory.
	if (!IsSibcall && isTailCall) {
	// Force all the incoming stack arguments to be loaded from the stack
	// before any new outgoing arguments are stored to the stack, because the
	// outgoing stack slots may alias the incoming argument stack slots, and
	// the alias isn't otherwise explicit. This is slightly more conservative
	// than necessary, because it means that each store effectively depends
	// on every argument instead of just those arguments it would clobber.
	SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

	SmallVector<SDValue, 8> MemOpChains2;
	SDValue FIN;
	int FI = 0;
	for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	if (VA.needsCustom()) {
	assert((CallConv == CallingConv::X86_RegCall) &&
	"Expecting custom case only in regcall calling convention");
	// This means that we are in special case where one argument was
	// passed through two register locations - Skip the next location
	++I;
	}

	continue;
	}

	assert(VA.isMemLoc());
	SDValue Arg = OutVals[OutsIndex];
	ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
	// Skip inalloca arguments. They don't require any work.
	if (Flags.isInAlloca())
	continue;
	// Create frame index.
	int32_t Offset = VA.getLocMemOffset()+FPDiff;
	uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
	FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
	FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	if (Flags.isByVal()) {
	// Copy relative to framepointer.
	SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	StackPtr, Source);

	MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
	ArgChain,
	Flags, DAG, dl));
	} else {
	// Store relative to framepointer.
	MemOpChains2.push_back(DAG.getStore(
	ArgChain, dl, Arg, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
	}
	}

	if (!MemOpChains2.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

	// Store the return address to the appropriate stack slot.
	Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
	getPointerTy(DAG.getDataLayout()),
	RegInfo->getSlotSize(), FPDiff, dl);
	}

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into registers.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
	assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
	// In the 64-bit large code model, we have to make all calls
	// through a register, since the call instruction's 32-bit
	// pc-relative offset may not be large enough to hold the whole
	// address.
	} else if (Callee->getOpcode() == ISD::GlobalAddress) {
	// If the callee is a GlobalAddress node (quite common, every direct call
	// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
	// it.
	GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

	// We should use extra load for direct calls to dllimported functions in
	// non-JIT mode.
	const GlobalValue *GV = G->getGlobal();
	if (!GV->hasDLLImportStorageClass()) {
	unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

	Callee = DAG.getTargetGlobalAddress(
	GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

	if (OpFlags == X86II::MO_GOTPCREL) {
	// Add a wrapper.
	Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
	getPointerTy(DAG.getDataLayout()), Callee);
	// Add extra indirection
	Callee = DAG.getLoad(
	getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}
	}
	} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
	unsigned char OpFlags =
	Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

	Callee = DAG.getTargetExternalSymbol(
	S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
	} else if (Subtarget.isTarget64BitILP32() &&
	Callee->getValueType(0) == MVT::i32) {
	// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
	Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
	}

	// Returns a chain & a flag for retval copy to use.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SmallVector<SDValue, 8> Ops;

	if (!IsSibcall && isTailCall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (isTailCall)
	Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	RegsToPass[i].second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
	// set X86_INTR calling convention because it has the same CSR mask
	// (same preserved registers).
	const uint32_t *Mask = RegInfo->getCallPreservedMask(
	MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");

	// If this is an invoke in a 32-bit function using a funclet-based
	// personality, assume the function clobbers all registers. If an exception
	// is thrown, the runtime will not restore CSRs.
	// FIXME: Model this more precisely so that we can register allocate across
	// the normal edge and spill and fill across the exceptional edge.
	if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
	const Function &CallerFn = MF.getFunction();
	EHPersonality Pers =
	CallerFn.hasPersonalityFn()
	? classifyEHPersonality(CallerFn.getPersonalityFn())
	: EHPersonality::Unknown;
	if (isFuncletEHPersonality(Pers))
	Mask = RegInfo->getNoPreservedMask();
	}

	// Define a new register mask from the existing mask.
	uint32_t *RegMask = nullptr;

	// In some calling conventions we need to remove the used physical registers
	// from the reg mask.
	if (CallConv == CallingConv::X86_RegCall \|\| HasNCSR) {
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	// Allocate a new Reg Mask and copy Mask.
	RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
	unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
	memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

	// Make sure all sub registers of the argument registers are reset
	// in the RegMask.
	for (auto const &RegPair : RegsToPass)
	for (MCSubRegIterator SubRegs(RegPair.first, TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));

	// Create the RegMask Operand according to our updated mask.
	Ops.push_back(DAG.getRegisterMask(RegMask));
	} else {
	// Create the RegMask Operand according to the static mask.
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	if (isTailCall) {
	// We used to do:
	//// If this is the first return lowered for this function, add the regs
	//// to the liveout set for the function.
	// This isn't right, although it's probably harmless on x86; liveouts
	// should be computed from returns not tail calls. Consider a void
	// function making a tail call to a function returning int.
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
	}

	Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	// Create the CALLSEQ_END node.
	unsigned NumBytesForCalleeToPop;
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	DAG.getTarget().Options.GuaranteedTailCallOpt))
	NumBytesForCalleeToPop = NumBytes; // Callee pops everything
	else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	SR == StackStructReturn)
	// If this is a call to a struct-return function, the callee
	// pops the hidden struct pointer, so we have to push it back.
	// This is common for Darwin/X86, Linux & Mingw32 targets.
	// For MSVC Win32 targets, the caller pops the hidden struct pointer.
	NumBytesForCalleeToPop = 4;
	else
	NumBytesForCalleeToPop = 0; // Callee pops nothing.

	if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
	// No need to reset the stack after the call if the call doesn't return. To
	// make the MI verify, we'll pretend the callee does it for us.
	NumBytesForCalleeToPop = NumBytes;
	}

	// Returns a flag for retval copy to use.
	if (!IsSibcall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
	true),
	InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	InVals, RegMask);
	}

	//===----------------------------------------------------------------------===//
	// Fast Calling Convention (tail call) implementation
	//===----------------------------------------------------------------------===//

	// This convention is like stdcall (the callee cleans up the arguments),
	// except that ECX is reserved for storing the address of the tail called
	// function. Only 2 registers are free for argument passing (inreg). Tail call
	// optimization is performed provided that:
	//   * tailcallopt is enabled
	//   * caller/callee are fastcc
	// On X86_64 architecture with GOT-style position independent code only local
	// (within module) calls are supported at the moment.
	// To keep the stack aligned according to platform abi the function
	// GetAlignedArgumentStackSize ensures that argument delta is always multiples
	// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
	// If a tail called function callee has more arguments than the caller the
	// caller needs to make sure that there is room to move the RETADDR to. This is
	// achieved by reserving an area the size of the argument delta right after the
	// original RETADDR, but before the saved framepointer or the spilled registers
	// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
	// stack layout:
	// arg1
	// arg2
	// RETADDR
	// [ new RETADDR
	// move area ]
	// (possible EBP)
	// ESI
	// EDI
	// local1 ..

	/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
	/// requirement.
	unsigned
	X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
	SelectionDAG& DAG) const {
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	unsigned StackAlignment = TFI.getStackAlignment();
	uint64_t AlignMask = StackAlignment - 1;
	int64_t Offset = StackSize;
	unsigned SlotSize = RegInfo->getSlotSize();
	if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
	// Number smaller than 12 so just add the difference.
	Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
	} else {
	// Mask out lower bits, add stackalignment once plus the 12 bytes.
	Offset = ((~AlignMask) & Offset) + StackAlignment +
	(StackAlignment-SlotSize);
	}
	return Offset;
	}

	/// Return true if the given stack call argument is already available in the
	/// same position (relatively) of the caller's incoming argument stack.
	///
	/// \param Arg    Outgoing argument value; traced back through value-preserving
	///               nodes to the load / frame index it originates from.
	/// \param Offset Stack offset the argument must live at; compared against the
	///               offset of the frame object that was found.
	/// \param Flags  Argument flags; byval and zext/sext are consulted.
	/// \param MFI    Frame info queried for the candidate stack object.
	/// \param MRI    Used to find the defining instruction of a virtual register.
	/// \param TII    Used to recognise loads from stack slots.
	/// \param VA     Assigned location; only the width of its location type is
	///               used.
	static
	bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
	                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
	                         const X86InstrInfo *TII, const CCValAssign &VA) {
	  // Size of the value in bytes; replaced by the byval size for byval
	  // arguments below.
	  unsigned Bytes = Arg.getValueSizeInBits() / 8;

	  for (;;) {
	    // Look through nodes that don't alter the bits of the incoming value.
	    unsigned Op = Arg.getOpcode();
	    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
	      Arg = Arg.getOperand(0);
	      continue;
	    }
	    if (Op == ISD::TRUNCATE) {
	      // A truncate of an AssertZext whose asserted type is the truncated type
	      // is looked through as well.
	      const SDValue &TruncInput = Arg.getOperand(0);
	      if (TruncInput.getOpcode() == ISD::AssertZext &&
	          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
	              Arg.getValueType()) {
	        Arg = TruncInput.getOperand(0);
	        continue;
	      }
	    }
	    break;
	  }

	  // INT_MAX is a sentinel meaning "no frame index found yet".
	  int FI = INT_MAX;
	  if (Arg.getOpcode() == ISD::CopyFromReg) {
	    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	    if (!TargetRegisterInfo::isVirtualRegister(VR))
	      return false;
	    MachineInstr *Def = MRI->getVRegDef(VR);
	    if (!Def)
	      return false;
	    if (!Flags.isByVal()) {
	      // The register must have been loaded straight from a stack slot.
	      if (!TII->isLoadFromStackSlot(*Def, FI))
	        return false;
	    } else {
	      // For byval the register holds an address: accept only an LEA of a
	      // frame index.
	      unsigned Opcode = Def->getOpcode();
	      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
	           Opcode == X86::LEA64_32r) &&
	          Def->getOperand(1).isFI()) {
	        FI = Def->getOperand(1).getIndex();
	        Bytes = Flags.getByValSize();
	      } else
	        return false;
	    }
	  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
	    if (Flags.isByVal())
	      // ByVal argument is passed in as a pointer but it's now being
	      // dereferenced. e.g.
	      // define @foo(%struct.X* %A) {
	      //   tail call @bar(%struct.X* byval %A)
	      // }
	      return false;
	    SDValue Ptr = Ld->getBasePtr();
	    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
	    if (!FINode)
	      return false;
	    FI = FINode->getIndex();
	  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
	    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
	    FI = FINode->getIndex();
	    Bytes = Flags.getByValSize();
	  } else
	    return false;

	  assert(FI != INT_MAX);
	  if (!MFI.isFixedObjectIndex(FI))
	    return false;

	  if (Offset != MFI.getObjectOffset(FI))
	    return false;

	  // If this is not byval, check that the argument stack object is immutable.
	  // inalloca and argument copy elision can create mutable argument stack
	  // objects. Byval objects can be mutated, but a byval call intends to pass the
	  // mutated memory.
	  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
	    return false;

	  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
	    // If the argument location is wider than the argument type, check that any
	    // extension flags match.
	    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
	        Flags.isSExt() != MFI.isObjectSExt(FI)) {
	      return false;
	    }
	  }

	  // Finally the sizes have to agree as well.
	  return Bytes == MFI.getObjectSize(FI);
	}

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	///
	/// With GuaranteedTailCallOpt set, the answer depends only on the calling
	/// conventions (after the x86_fp80 and Win64 checks below). Otherwise the
	/// call is accepted only as a "sibcall": a tail call that needs no ABI change.
	///
	/// \param Callee            The call target.
	/// \param CalleeCC          Calling convention of the callee.
	/// \param isVarArg          Whether the call is variadic.
	/// \param isCalleeStructRet Whether the callee uses struct-return semantics.
	/// \param isCallerStructRet Whether the caller uses struct-return semantics.
	/// \param RetTy             Return type of the call.
	/// \param Outs              Outgoing argument descriptions.
	/// \param OutVals           Outgoing argument values.
	/// \param Ins               Descriptions of the call's results.
	/// \param DAG               The DAG being built.
	/// \returns true if the call may be lowered as a tail call.
	bool X86TargetLowering::IsEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function &CallerF = MF.getFunction();

	  // If the function return type is x86_fp80 and the callee return type is not,
	  // then the FP_EXTEND of the call result is not a nop. It's not safe to
	  // perform a tailcall optimization here.
	  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
	    return false;

	  CallingConv::ID CallerCC = CallerF.getCallingConv();
	  bool CCMatch = CallerCC == CalleeCC;
	  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
	  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

	  // Win64 functions have extra shadow space for argument homing. Don't do the
	  // sibcall if the caller and callee have mismatched expectations for this
	  // space.
	  if (IsCalleeWin64 != IsCallerWin64)
	    return false;

	  // If -tailcallopt is specified, make fastcc functions tail-callable.
	  if (DAG.getTarget().Options.GuaranteedTailCallOpt)
	    return canGuaranteeTCO(CalleeCC) && CCMatch;

	  // Look for obvious safe cases to perform tail call optimization that do not
	  // require ABI changes. This is what gcc calls sibcall.

	  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
	  // emit a special epilogue.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  if (RegInfo->needsStackRealignment(MF))
	    return false;

	  // Also avoid sibcall optimization if either caller or callee uses struct
	  // return semantics.
	  if (isCalleeStructRet || isCallerStructRet)
	    return false;

	  // Do not sibcall optimize vararg calls unless all arguments are passed via
	  // registers.
	  LLVMContext &C = *DAG.getContext();
	  if (isVarArg && !Outs.empty()) {
	    // Optimizing for varargs on Win64 is unlikely to be safe without
	    // additional testing.
	    if (IsCalleeWin64 || IsCallerWin64)
	      return false;

	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
	      if (!ArgLocs[i].isRegLoc())
	        return false;
	  }

	  // If the call result is in ST0 / ST1, it needs to be popped off the x87
	  // stack. Therefore, if it's not used by the call it is not safe to optimize
	  // this into a sibcall.
	  bool Unused = false;
	  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	    if (!Ins[i].Used) {
	      Unused = true;
	      break;
	    }
	  }
	  if (Unused) {
	    SmallVector<CCValAssign, 16> RVLocs;
	    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
	    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
	    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
	      CCValAssign &VA = RVLocs[i];
	      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
	        return false;
	    }
	  }

	  // Check that the call results are passed in the same way.
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  RetCC_X86, RetCC_X86))
	    return false;
	  // The callee has to preserve all registers the caller needs to preserve.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (!CCMatch) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // Bytes of stack the callee's arguments occupy; stays 0 when there are no
	  // outgoing arguments.
	  unsigned StackArgsSize = 0;

	  // If the callee takes no arguments then go on to check the results of the
	  // call.
	  if (!Outs.empty()) {
	    // Check if stack adjustment is needed. For now, do not do this if any
	    // argument is passed on the stack.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    // Allocate shadow area for Win64
	    if (IsCalleeWin64)
	      CCInfo.AllocateStack(32, 8);

	    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	    StackArgsSize = CCInfo.getNextStackOffset();

	    if (CCInfo.getNextStackOffset()) {
	      // Check if the arguments are already laid out in the right way as
	      // the caller's fixed stack objects.
	      MachineFrameInfo &MFI = MF.getFrameInfo();
	      const MachineRegisterInfo *MRI = &MF.getRegInfo();
	      const X86InstrInfo *TII = Subtarget.getInstrInfo();
	      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	        CCValAssign &VA = ArgLocs[i];
	        SDValue Arg = OutVals[i];
	        ISD::ArgFlagsTy Flags = Outs[i].Flags;
	        if (VA.getLocInfo() == CCValAssign::Indirect)
	          return false;
	        if (!VA.isRegLoc()) {
	          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	                                   MFI, MRI, TII, VA))
	            return false;
	        }
	      }
	    }

	    bool PositionIndependent = isPositionIndependent();
	    // If the tailcall address may be in a register, then make sure it's
	    // possible to register allocate for it. In 32-bit, the call address can
	    // only target EAX, EDX, or ECX since the tail call must be scheduled after
	    // callee-saved registers are restored. These happen to be the same
	    // registers used to pass 'inreg' arguments so watch out for those.
	    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
	                                  !isa<ExternalSymbolSDNode>(Callee)) ||
	                                 PositionIndependent)) {
	      unsigned NumInRegs = 0;
	      // In PIC we need an extra register to formulate the address computation
	      // for the callee.
	      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

	      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	        CCValAssign &VA = ArgLocs[i];
	        if (!VA.isRegLoc())
	          continue;
	        unsigned Reg = VA.getLocReg();
	        switch (Reg) {
	        default: break;
	        case X86::EAX: case X86::EDX: case X86::ECX:
	          // Reject as soon as the arguments occupy MaxInRegs of these
	          // registers.
	          if (++NumInRegs == MaxInRegs)
	            return false;
	          break;
	        }
	      }
	    }

	    const MachineRegisterInfo &MRI = MF.getRegInfo();
	    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	      return false;
	  }

	  bool CalleeWillPop =
	      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
	                       MF.getTarget().Options.GuaranteedTailCallOpt);

	  if (unsigned BytesToPop =
	          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
	    // If we have bytes to pop, the callee must pop them.
	    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
	    if (!CalleePopMatches)
	      return false;
	  } else if (CalleeWillPop && StackArgsSize > 0) {
	    // If we don't have bytes to pop, make sure the callee doesn't pop any.
	    return false;
	  }

	  return true;
	}

	/// Create the fast instruction selector by forwarding both arguments to
	/// X86::createFastISel.
	FastISel *X86TargetLowering::createFastISel(
	    FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const {
	  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
	  return Selector;
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Hooks
	//===----------------------------------------------------------------------===//

	/// Return true if \p Op has exactly one use and its node is a normal load.
	static bool MayFoldLoad(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalLoad(Op.getNode());
	}

	/// Return true if \p Op has exactly one use and the first user of its node is
	/// a normal store.
	static bool MayFoldIntoStore(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalStore(*Op.getNode()->use_begin());
	}

	/// Return true if \p Op has exactly one use and the first user of its node is
	/// an ISD::ZERO_EXTEND.
	static bool MayFoldIntoZeroExtend(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
	  return UserOpcode == ISD::ZERO_EXTEND;
	}

	/// Return true if \p Opcode is one of the X86ISD shuffle opcodes listed
	/// below; every other opcode yields false.
	static bool isTargetShuffle(unsigned Opcode) {
	  switch (Opcode) {
	  case X86ISD::BLENDI:     case X86ISD::PSHUFB:
	  case X86ISD::PSHUFD:     case X86ISD::PSHUFHW:
	  case X86ISD::PSHUFLW:    case X86ISD::SHUFP:
	  case X86ISD::INSERTPS:   case X86ISD::EXTRQI:
	  case X86ISD::INSERTQI:   case X86ISD::PALIGNR:
	  case X86ISD::VSHLDQ:     case X86ISD::VSRLDQ:
	  case X86ISD::MOVLHPS:    case X86ISD::MOVHLPS:
	  case X86ISD::MOVLPS:     case X86ISD::MOVLPD:
	  case X86ISD::MOVSHDUP:   case X86ISD::MOVSLDUP:
	  case X86ISD::MOVDDUP:    case X86ISD::MOVSS:
	  case X86ISD::MOVSD:      case X86ISD::UNPCKL:
	  case X86ISD::UNPCKH:     case X86ISD::VBROADCAST:
	  case X86ISD::VPERMILPI:  case X86ISD::VPERMILPV:
	  case X86ISD::VPERM2X128: case X86ISD::VPERMIL2:
	  case X86ISD::VPERMI:     case X86ISD::VPPERM:
	  case X86ISD::VPERMV:     case X86ISD::VPERMV3:
	  case X86ISD::VPERMIV3:   case X86ISD::VZEXT_MOVL:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if \p Opcode is one of the variable-mask target shuffles, or
	/// one of the 'faux' shuffles (ISD::AND, X86ISD::ANDNP) treated alongside
	/// them.
	static bool isTargetShuffleVariableMask(unsigned Opcode) {
	  switch (Opcode) {
	  // Target Shuffles.
	  case X86ISD::PSHUFB:   case X86ISD::VPERMILPV:
	  case X86ISD::VPERMIL2: case X86ISD::VPPERM:
	  case X86ISD::VPERMV:   case X86ISD::VPERMV3:
	  case X86ISD::VPERMIV3:
	  // 'Faux' Target Shuffles.
	  case ISD::AND:         case X86ISD::ANDNP:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return a frame-index node for the return address slot. The fixed object
	/// is created on first use and its index is cached in the function's
	/// X86MachineFunctionInfo; a cached index of 0 means "not created yet".
	SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

	  int RAIndex = X86FI->getRAIndex();
	  if (!RAIndex) {
	    // One slot, placed one slot size below offset 0.
	    const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
	    RAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
	                                                  -(int64_t)SlotSize,
	                                                  false);
	    X86FI->setRAIndex(RAIndex);
	  }

	  const auto PtrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getFrameIndex(RAIndex, PtrVT);
	}

	/// Return true if \p Offset is acceptable for code model \p M.
	///
	/// \param Offset                  The constant offset to check.
	/// \param M                       The code model in effect.
	/// \param hasSymbolicDisplacement Whether a symbol is part of the
	///        displacement; without one, only the 32-bit range check applies.
	/// \returns false if the offset does not fit in a signed 32-bit immediate, or
	///          if a symbolic displacement is present and the code model's
	///          constraints below are not met.
	bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	                                       bool hasSymbolicDisplacement) {
	  // Offset should fit into 32 bit immediate field.
	  if (!isInt<32>(Offset))
	    return false;

	  // If we don't have a symbolic displacement - we don't have any extra
	  // restrictions.
	  if (!hasSymbolicDisplacement)
	    return true;

	  // FIXME: Some tweaks might be needed for medium code model.
	  if (M != CodeModel::Small && M != CodeModel::Kernel)
	    return false;

	  // For small code model we assume that latest object is 16MB before end of 31
	  // bits boundary. We may also accept pretty large negative constants knowing
	  // that all objects are in the positive half of address space.
	  if (M == CodeModel::Small && Offset < 16*1024*1024)
	    return true;

	  // For kernel code model we know that all objects reside in the negative half
	  // of 32bits address space. We may not accept negative offsets, since they may
	  // be just off and we may accept pretty large positive ones.
	  if (M == CodeModel::Kernel && Offset >= 0)
	    return true;

	  return false;
	}

	/// Decide whether the callee pops its own arguments off the stack.
	///
	/// This is the case for non-variadic calls for which shouldGuaranteeTCO()
	/// holds, and otherwise for the stdcall, fastcall, thiscall and vectorcall
	/// conventions when not in 64-bit mode.
	bool X86::isCalleePop(CallingConv::ID CallingConv,
	                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
	  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
	  // can guarantee TCO.
	  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
	    return true;

	  // Conventions that are callee-pop outside of 64-bit mode.
	  bool PopsOutside64Bit = false;
	  switch (CallingConv) {
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	    PopsOutside64Bit = true;
	    break;
	  default:
	    break;
	  }
	  return PopsOutside64Bit && !is64Bit;
	}

	/// \brief Classify an X86 integer condition code as unsigned (true) or
	/// signed (false). E and NE count as unsigned; any code not listed here is
	/// unreachable.
	static bool isX86CCUnsigned(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_G:  case X86::COND_GE:
	  case X86::COND_L:  case X86::COND_LE:
	    return false;
	  case X86::COND_E:  case X86::COND_NE:
	  case X86::COND_B:  case X86::COND_A:
	  case X86::COND_BE: case X86::COND_AE:
	    return true;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Map an integer ISD::CondCode onto the matching X86 condition code. Any
	/// condition code not listed is unreachable.
	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	  switch (SetCCOpcode) {
	  // Equality.
	  case ISD::SETEQ:  return X86::COND_E;
	  case ISD::SETNE:  return X86::COND_NE;
	  // Signed orderings.
	  case ISD::SETLT:  return X86::COND_L;
	  case ISD::SETLE:  return X86::COND_LE;
	  case ISD::SETGT:  return X86::COND_G;
	  case ISD::SETGE:  return X86::COND_GE;
	  // Unsigned orderings.
	  case ISD::SETULT: return X86::COND_B;
	  case ISD::SETULE: return X86::COND_BE;
	  case ISD::SETUGT: return X86::COND_A;
	  case ISD::SETUGE: return X86::COND_AE;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Translate an ISD::CondCode one-to-one into the X86-specific condition
	/// code to test.
	///
	/// \p LHS and \p RHS are in/out: the operands may be swapped, and in the
	/// integer case \p RHS may be replaced by a zero constant, so the caller must
	/// build the comparison from the updated values. For floating point,
	/// X86::COND_INVALID is returned for SETOEQ and SETUNE.
	static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
	                                    bool isFP, SDValue &LHS, SDValue &RHS,
	                                    SelectionDAG &DAG) {
	  if (!isFP) {
	    // A few comparisons against constants are rewritten as comparisons
	    // against zero.
	    if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(RHS)) {
	      switch (SetCCOpcode) {
	      default:
	        break;
	      case ISD::SETGT:
	        if (Imm->isAllOnesValue()) {
	          // X > -1: compare X with 0 and use COND_NS.
	          RHS = DAG.getConstant(0, DL, RHS.getValueType());
	          return X86::COND_NS;
	        }
	        break;
	      case ISD::SETLT:
	        if (Imm->isNullValue()) {
	          // X < 0: RHS is already zero, use COND_S.
	          return X86::COND_S;
	        }
	        if (Imm->getZExtValue() == 1) {
	          // X < 1 -> X <= 0
	          RHS = DAG.getConstant(0, DL, RHS.getValueType());
	          return X86::COND_LE;
	        }
	        break;
	      }
	    }

	    return TranslateIntegerX86CC(SetCCOpcode);
	  }

	  // Floating point. First decide whether the operands have to be (or are
	  // better) exchanged.

	  // When only the LHS is a non-extending load, swap the operands and use the
	  // operand-swapped condition.
	  const bool OnlyLHSIsLoad = ISD::isNON_EXTLoad(LHS.getNode()) &&
	                             !ISD::isNON_EXTLoad(RHS.getNode());
	  if (OnlyLHSIsLoad) {
	    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
	    std::swap(LHS, RHS);
	  }

	  // These four conditions are tested with the operands exchanged; they are
	  // the ones marked "flipped" in the switch below.
	  const bool TestFlipped =
	      SetCCOpcode == ISD::SETOLT || SetCCOpcode == ISD::SETOLE ||
	      SetCCOpcode == ISD::SETUGT || SetCCOpcode == ISD::SETUGE;
	  if (TestFlipped)
	    std::swap(LHS, RHS);

	  // On a floating point condition, the flags are set as follows:
	  //  ZF | PF | CF | op
	  //   0 |  0 |  0 | X > Y
	  //   0 |  0 |  1 | X < Y
	  //   1 |  0 |  0 | X == Y
	  //   1 |  1 |  1 | unordered
	  switch (SetCCOpcode) {
	  case ISD::SETUEQ:
	  case ISD::SETEQ:
	    return X86::COND_E;
	  case ISD::SETOLT: // flipped
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    return X86::COND_A;
	  case ISD::SETOLE: // flipped
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    return X86::COND_AE;
	  case ISD::SETUGT: // flipped
	  case ISD::SETULT:
	  case ISD::SETLT:
	    return X86::COND_B;
	  case ISD::SETUGE: // flipped
	  case ISD::SETULE:
	  case ISD::SETLE:
	    return X86::COND_BE;
	  case ISD::SETONE:
	  case ISD::SETNE:
	    return X86::COND_NE;
	  case ISD::SETUO:
	    return X86::COND_P;
	  case ISD::SETO:
	    return X86::COND_NP;
	  case ISD::SETOEQ:
	  case ISD::SETUNE:
	    return X86::COND_INVALID;
	  default:
	    llvm_unreachable("Condcode should be pre-legalized away");
	  }
	}

	/// Reports whether a floating point cmov exists for the given X86 condition
	/// code. The current x86 ISA provides these FP cmov instructions:
	/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
	static bool hasFPCMov(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_A:
	  case X86::COND_AE:
	  case X86::COND_B:
	  case X86::COND_BE:
	  case X86::COND_E:
	  case X86::COND_NE:
	  case X86::COND_P:
	  case X86::COND_NP:
	    return true;
	  default:
	    break;
	  }
	  // Every other condition code has no FP cmov form.
	  return false;
	}


	/// Describes the memory access performed by a chained X86 intrinsic.
	///
	/// Fills \p Info for the expand-from-memory, compress-to-memory and
	/// truncate-to-memory intrinsic kinds found in the intrinsic table.
	/// \returns true if \p Info was populated, false if \p Intrinsic is not in
	/// the table or is of a kind not handled here.
	bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           MachineFunction &MF,
	                                           unsigned Intrinsic) const {

	  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
	  if (!IntrData)
	    return false;

	  Info.opc = ISD::INTRINSIC_W_CHAIN;
	  Info.flags = MachineMemOperand::MONone;
	  Info.offset = 0;

	  switch (IntrData->Type) {
	  case EXPAND_FROM_MEM: {
	    // Load: the pointer is operand 0, the memory type is the call's result.
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVT(I.getType());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOLoad;
	    break;
	  }
	  case COMPRESS_TO_MEM: {
	    // Store: the pointer is operand 0, the memory type is that of operand 1.
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    Info.ptrVal = I.getArgOperand(0);
	    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
	    // The stored element type is selected by the intrinsic kind; the element
	    // count is taken from the source vector (operand 1).
	    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
	      ScalarVT = MVT::i8;
	    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
	      ScalarVT = MVT::i16;
	    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
	      ScalarVT = MVT::i32;

	    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  default:
	    return false;
	  }

	  return true;
	}

	/// Reports whether the target can instruction select the FP immediate
	/// \p Imm natively, i.e. whether it is bitwise equal to one of the entries
	/// of LegalFPImmediates. If this returns false, the legalizer materializes
	/// the immediate as a load from a constant pool.
	bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  for (const auto &Legal : LegalFPImmediates)
	    if (Imm.bitwiseIsEqual(Legal))
	      return true;
	  return false;
	}

	/// Decides whether \p Load may be narrowed.
	///
	/// "ELF Handling for Thread-Local Storage" specifies that an
	/// R_X86_64_GOTTPOFF relocation targets a movq or addq instruction, so a load
	/// whose base pointer is a RIP-relative wrapper around such a global address
	/// must not shrink. Every other load may.
	bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                              ISD::LoadExtType ExtTy,
	                                              EVT NewVT) const {
	  SDValue Base = cast<LoadSDNode>(Load)->getBasePtr();
	  if (Base.getOpcode() != X86ISD::WrapperRIP)
	    return true;

	  const auto *Global = dyn_cast<GlobalAddressSDNode>(Base.getOperand(0));
	  if (!Global)
	    return true;

	  return Global->getTargetFlags() != X86II::MO_GOTTPOFF;
	}

	/// \brief Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	///
	/// This is the case for integer types whose primitive size is non-zero and
	/// at most 64 bits. \p Ty must be an integer type.
	bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  return BitSize != 0 && BitSize <= 64;
	}

	/// Reports whether a select of constants of type \p VT should be turned into
	/// math. Declined only for vector types on AVX512 subtargets.
	bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
	  // TODO: It might be a win to ease or lift this restriction, but the generic
	  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
	  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
	  return !IsAVX512Vector;
	}

	/// Reports whether extracting a \p ResVT subvector starting at element
	/// \p Index out of a \p SrcVT vector is cheap.
	///
	/// Returns false when EXTRACT_SUBVECTOR is neither legal nor custom for
	/// \p ResVT. Otherwise, for non-mask vectors the extraction is cheap when
	/// \p Index is a multiple of the result element count.
	bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  // Mask vectors support all subregister combinations and operations that
	  // extract half of vector.
	  if (ResVT.getVectorElementType() == MVT::i1) {
	    if (Index == 0)
	      return true;
	    // Upper-half extraction: the source is twice as wide as the result and
	    // the index points at the start of the upper half. (The result is the
	    // smaller type, so the comparison must scale ResVT, not SrcVT.)
	    return SrcVT.getSizeInBits() == ResVT.getSizeInBits() * 2 &&
	           Index == ResVT.getVectorNumElements();
	  }

	  return (Index % ResVT.getVectorNumElements()) == 0;
	}

	/// Reports whether cttz is cheap to speculate: only when TZCNT can be used
	/// directly, which this function ties to the BMI subtarget feature.
	bool X86TargetLowering::isCheapToSpeculateCttz() const {
	  const bool CanUseTZCNT = Subtarget.hasBMI();
	  return CanUseTZCNT;
	}

	/// Reports whether ctlz is cheap to speculate: only when LZCNT can be used
	/// directly.
	bool X86TargetLowering::isCheapToSpeculateCtlz() const {
	  const bool CanUseLZCNT = Subtarget.hasLZCNT();
	  return CanUseLZCNT;
	}

	/// Reports whether folding a bitcast into a load is beneficial.
	/// Bitcasting to v8i1 is rejected when the subtarget lacks DQI; everything
	/// else is delegated to the generic TargetLowering implementation.
	bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
	                                                EVT BitcastVT) const {
	  const bool IsV8i1WithoutDQI =
	      !Subtarget.hasDQI() && BitcastVT == MVT::v8i1;
	  if (IsV8i1WithoutDQI)
	    return false;

	  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
	}

	/// Reports whether stores may be merged into a single store of type
	/// \p MemVT.
	///
	/// When the function carries the NoImplicitFloat attribute, merged stores
	/// are limited to the widest integer size (64 bits on 64-bit subtargets,
	/// 32 bits otherwise). Without the attribute any size is accepted.
	bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
	                                         const SelectionDAG &DAG) const {
	  const bool NoImplicitFloat =
	      DAG.getMachineFunction().getFunction().hasFnAttribute(
	          Attribute::NoImplicitFloat);
	  if (!NoImplicitFloat)
	    return true;

	  // Do not merge past the integer register width when floating point/vector
	  // registers must not be used implicitly.
	  const unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
	  return MemVT.getSizeInBits() <= MaxIntSize;
	}

	/// Reports whether ctlz is fast, as indicated by the subtarget's fast-LZCNT
	/// flag.
	bool X86TargetLowering::isCtlzFast() const {
	  const bool FastLZCNT = Subtarget.hasFastLZCNT();
	  return FastLZCNT;
	}

	/// Unconditionally reports that folding a mask-and-compare-with-zero is
	/// beneficial on X86; \p AndI is not inspected.
	bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
	const Instruction &AndI) const {
	return true;
	}

	/// Reports whether an and-not compare is available for the type of \p Y.
	/// Requires BMI, and only the 32-bit and 64-bit scalar integer types qualify.
	bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
	  if (!Subtarget.hasBMI())
	    return false;

	  // There are only 32-bit and 64-bit forms for 'andn'.
	  const EVT Ty = Y.getValueType();
	  return Ty == MVT::i32 || Ty == MVT::i64;
	}

	/// Returns the type to use for a fast equality comparison of \p NumBits
	/// bits, or MVT::INVALID_SIMPLE_VALUE_TYPE if there is none.
	///
	/// A legal scalar integer type of that width is preferred; otherwise 128-bit
	/// and 256-bit widths fall back to byte vectors when those are legal.
	MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT ScalarVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(ScalarVT))
	    return ScalarVT;

	  switch (NumBits) {
	  case 128:
	    // PMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v16i8))
	      return MVT::v16i8;
	    break;
	  case 256:
	    // VPMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v32i8))
	      return MVT::v32i8;
	    break;
	  default:
	    break;
	  }

	  // TODO: Allow 64-bit type for 32-bit target.
	  // TODO: 512-bit types should be allowed, but make sure that those
	  // cases are handled in combineVectorSizedSetCCEquality().

	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Returns true if \p Val is the undef sentinel value or equal to
	/// \p CmpVal.
	static bool isUndefOrEqual(int Val, int CmpVal) {
	  return Val == SM_SentinelUndef || Val == CmpVal;
	}

	/// Returns true if \p Val is either the undef or the zero sentinel value.
	static bool isUndefOrZero(int Val) {
	  return Val == SM_SentinelUndef || Val == SM_SentinelZero;
	}

	/// Return true if every element of Mask in the index range
	/// [Pos, Pos+Size) is the undef sentinel value.
	static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
	  unsigned Idx = Pos;
	  const unsigned End = Pos + Size;
	  while (Idx != End) {
	    if (Mask[Idx] != SM_SentinelUndef)
	      return false;
	    ++Idx;
	  }
	  return true;
	}

	/// Return true if Val is undef or if its value falls within the
	/// half-open range [Low, Hi).
	static bool isUndefOrInRange(int Val, int Low, int Hi) {
	  return Val == SM_SentinelUndef || (Val >= Low && Val < Hi);
	}

	/// Return true if every element in Mask passes the scalar
	/// isUndefOrInRange check for the bounds Low and Hi. An empty mask yields
	/// true.
	static bool isUndefOrInRange(ArrayRef<int> Mask,
	                             int Low, int Hi) {
	  for (size_t i = 0, e = Mask.size(); i != e; ++i) {
	    const int Elt = Mask[i];
	    if (!isUndefOrInRange(Elt, Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if Val is undef, zero or if its value falls within the
	/// half-open range [Low, Hi).
	static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
	  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
	}

	/// Return true if every element in Mask passes the scalar
	/// isUndefOrZeroOrInRange check for the bounds Low and Hi. An empty mask
	/// yields true.
	static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  for (size_t i = 0, e = Mask.size(); i != e; ++i) {
	    const int Elt = Mask[i];
	    if (!isUndefOrZeroOrInRange(Elt, Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if the elements of Mask in the index range [Pos, Pos+Size)
	/// form the sequence Low, Low+1, ..., where any element may instead be
	/// undef.
	static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
	                                       unsigned Pos, unsigned Size, int Low) {
	  unsigned Idx = Pos;
	  const unsigned End = Pos + Size;
	  while (Idx != End) {
	    if (!isUndefOrEqual(Mask[Idx], Low))
	      return false;
	    ++Idx;
	    ++Low;
	  }
	  return true;
	}

	/// Return true if the elements of Mask in the index range [Pos, Pos+Size)
	/// form the sequence Low, Low+1, ..., where any element may instead be
	/// undef or zero.
	static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                             unsigned Size, int Low) {
	  unsigned Idx = Pos;
	  const unsigned End = Pos + Size;
	  while (Idx != End) {
	    const int Elt = Mask[Idx];
	    if (!isUndefOrZero(Elt) && Elt != Low)
	      return false;
	    ++Idx;
	    ++Low;
	  }
	  return true;
	}

	/// Return true if every element of Mask in the index range
	/// [Pos, Pos+Size) is undef or is zero.
	static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                 unsigned Size) {
	  unsigned Idx = Pos;
	  const unsigned End = Pos + Size;
	  while (Idx != End) {
	    if (!isUndefOrZero(Mask[Idx]))
	      return false;
	    ++Idx;
	  }
	  return true;
	}

	/// \brief Helper function to test whether a shuffle mask could be
	/// simplified by widening the elements being shuffled.
	///
	/// Overwrites WidenedMask with the mask for wider elements if valid
	/// (one entry per pair of input entries). Otherwise leaves it in an
	/// unspecified state.
	///
	/// \p Mask must contain an even number of elements, since it is consumed in
	/// adjacent pairs.
	///
	/// NOTE: This must handle normal vector shuffle masks and target vector
	/// shuffle masks. The latter have the special property of a '-2' representing
	/// a zero-ed lane of a vector.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  assert((Mask.size() % 2) == 0 &&
	         "Cannot widen a shuffle mask with an odd number of elements");
	  WidenedMask.assign(Mask.size() / 2, 0);
	  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
	    int M0 = Mask[i];
	    int M1 = Mask[i + 1];

	    // If both elements are undef, it's trivial.
	    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
	      WidenedMask[i / 2] = SM_SentinelUndef;
	      continue;
	    }

	    // Check for an undef mask and a mask value properly aligned to fit with
	    // a pair of values. If we find such a case, use the non-undef mask's value.
	    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
	      WidenedMask[i / 2] = M1 / 2;
	      continue;
	    }
	    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // When zeroing, we need to spread the zeroing across both lanes to widen.
	    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
	      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
	          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
	        WidenedMask[i / 2] = SM_SentinelZero;
	        continue;
	      }
	      return false;
	    }

	    // Finally check if the two mask values are adjacent and aligned with
	    // a pair.
	    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // Otherwise we can't safely widen the elements used in this shuffle.
	    return false;
	  }
	  assert(WidenedMask.size() == Mask.size() / 2 &&
	         "Incorrect size of mask after widening the elements!");

	  return true;
	}

	/// Returns true if Elt is a constant zero or a floating point constant +0.0.
	bool X86::isZeroNode(SDValue Elt) {
	  return isNullConstant(Elt) || isNullFPConstant(Elt);
	}

	// Build a constant vector of type VT from the integers in Values.
	// When IsMask is set, a negative entry produces an UNDEF element.
	// If i64 is not a legal type and VT has i64 elements, each element is
	// emitted as a pair of i32 values (the value followed by a zero) and the
	// result is bitcast back to VT.
	static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl, bool IsMask = false) {
	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool Split = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;

	  const MVT ConstVecVT =
	      Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT EltVT = ConstVecVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    const int Value = Values[i];
	    if (IsMask && Value < 0) {
	      // Undef element; both halves are undef when splitting.
	      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }
	    Ops.push_back(DAG.getConstant(Value, dl, EltVT));
	    if (Split)
	      Ops.push_back(DAG.getConstant(0, dl, EltVT));
	  }

	  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
	  return Split ? DAG.getBitcast(VT, ConstsNode) : ConstsNode;
	}

	// Build a constant vector of type VT from raw element bits.
	// Bit i of Undefs marks element i as UNDEF. f32/f64 element types are
	// emitted as FP constants built from the raw bits. If i64 is not a legal
	// type and VT has i64 elements, each element is emitted as its low and high
	// 32-bit halves. The result is always bitcast to VT.
	static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
	                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Bits.size() == Undefs.getBitWidth() &&
	         "Unequal constant and undef arrays");

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool Split = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;

	  const MVT ConstVecVT =
	      Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT EltVT = ConstVecVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
	    if (Undefs[i]) {
	      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }

	    const APInt &V = Bits[i];
	    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

	    if (Split) {
	      // Low half first, then high half.
	      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
	      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
	      continue;
	    }
	    if (EltVT == MVT::f32) {
	      APFloat FV(APFloat::IEEEsingle(), V);
	      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	      continue;
	    }
	    if (EltVT == MVT::f64) {
	      APFloat FV(APFloat::IEEEdouble(), V);
	      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	      continue;
	    }
	    Ops.push_back(DAG.getConstant(V, dl, EltVT));
	  }

	  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
	  return DAG.getBitcast(VT, ConstsNode);
	}

	/// Returns a vector of specified type with all zero elements.
	///
	/// \p VT must be a 128/256/512-bit vector or a vector of i1.
	static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
	          VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected vector type");

	  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
	  // type. This ensures they get CSE'd. But if the integer type is not
	  // available, use a floating-point +0.0 instead.
	  SDValue Vec;
	  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
	    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
	  } else if (VT.getVectorElementType() == MVT::i1) {
	    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
	           "Unexpected vector type");
	    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
	           "Unexpected vector type");
	    // Mask vectors are built directly in their own type.
	    Vec = DAG.getConstant(0, dl, VT);
	  } else {
	    unsigned Num32BitElts = VT.getSizeInBits() / 32;
	    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
	  }
	  return DAG.getBitcast(VT, Vec);
	}

	/// Extracts the vectorWidth-bit chunk of Vec that contains element IdxVal.
	/// IdxVal need not be aligned to a chunk boundary; it is rounded down to
	/// the first element of its chunk. A BUILD_VECTOR input is sliced directly
	/// into a smaller BUILD_VECTOR; anything else becomes an EXTRACT_SUBVECTOR.
	static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned vectorWidth) {
	  const EVT SrcVT = Vec.getValueType();
	  const EVT EltVT = SrcVT.getVectorElementType();

	  // Result type: same element type, 1/NumChunks of the source elements.
	  const unsigned NumChunks = SrcVT.getSizeInBits() / vectorWidth;
	  const EVT ChunkVT = EVT::getVectorVT(
	      *DAG.getContext(), EltVT, SrcVT.getVectorNumElements() / NumChunks);

	  const unsigned ChunkElts = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(ChunkElts) && "Elements per chunk not power of 2");

	  // Index of the first element of the chunk we want. ChunkElts is a power
	  // of 2, so clearing the low bits rounds down to the chunk start.
	  const unsigned FirstElt = IdxVal & ~(ChunkElts - 1);

	  // If the input is a buildvector just emit a smaller one.
	  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
	    return DAG.getBuildVector(ChunkVT, dl,
	                              Vec->ops().slice(FirstElt, ChunkElts));

	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ChunkVT, Vec,
	                     DAG.getIntPtrConstant(FirstElt, dl));
	}

	/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
	/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
	/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
	/// instructions or a simple subregister reference. Idx is an index in the
	/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
	/// lowering EXTRACT_VECTOR_ELT operations easier.
	static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  assert((Vec.getValueType().is256BitVector() ||
	          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
	  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
	}

	/// Generate a DAG to grab 256-bits from a 512-bit vector. IdxVal is an
	/// element index inside the 256 bits wanted; it is forwarded unchanged to
	/// extractSubVector.
	static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
	  const unsigned ChunkWidth = 256;
	  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkWidth);
	}

	/// Inserts the vectorWidth-bit vector Vec into Result at the chunk that
	/// contains element IdxVal, and returns the new value (of Result's type).
	/// Only widths of 128 and 256 bits are supported. IdxVal need not be
	/// chunk-aligned; it is rounded down to the start of its chunk. Inserting
	/// UNDEF returns Result unchanged.
	static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                               SelectionDAG &DAG, const SDLoc &dl,
	                               unsigned vectorWidth) {
	  assert((vectorWidth == 128 || vectorWidth == 256) &&
	         "Unsupported vector width");
	  // Inserting UNDEF is Result
	  if (Vec.isUndef())
	    return Result;
	  EVT VT = Vec.getValueType();
	  EVT ElVT = VT.getVectorElementType();
	  EVT ResultVT = Result.getValueType();

	  // Insert the relevant vectorWidth bits.
	  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
	  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	  // This is the index of the first element of the vectorWidth-bit chunk
	  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
	  IdxVal &= ~(ElemsPerChunk - 1);

	  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
	}

	/// Generate a DAG to put 128-bits into a vector > 128 bits. This
	/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
	/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
	/// simple superregister reference. IdxVal is an element index inside the
	/// 128 bits we want to replace; it need not be aligned to a 128-bit
	/// boundary, which makes lowering INSERT_VECTOR_ELT operations easier.
	static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
	  const unsigned ChunkWidth = 128;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkWidth);
	}

	/// Generate a DAG to put the 256-bit vector Vec into Result at the 256-bit
	/// chunk containing element IdxVal.
	static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
	  const unsigned ChunkWidth = 256;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkWidth);
	}

	// Return true if Opcode names an instruction that zeroes the unused upper
	// part of the destination and accepts mask.
	static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
	  switch (Opcode) {
	  case X86ISD::CMPM:
	  case X86ISD::CMPMU:
	  case X86ISD::CMPM_RND:
	  case X86ISD::PCMPEQM:
	  case X86ISD::PCMPGTM:
	  case X86ISD::TESTM:
	  case X86ISD::TESTNM:
	    return true;
	  default:
	    break;
	  }
	  return false;
	}

	/// Insert i1-subvector to i1-vector.
	///
	/// \p Op is an INSERT_SUBVECTOR node: operand 0 is the destination vector,
	/// operand 1 the subvector and operand 2 the insertion index. Returns an
	/// empty SDValue if the index is not a constant. The insertion is expressed
	/// with mask-register shifts (KSHIFTL/KSHIFTR) combined with OR/XOR on a
	/// type wide enough to be shifted, then narrowed back to the original type.
	static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue SubVec = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);

	  if (!isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Inserting undef is a nop. We can just return the original vector.
	  if (SubVec.isUndef())
	    return Vec;

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
	    return Op;

	  MVT OpVT = Op.getSimpleValueType();
	  unsigned NumElems = OpVT.getVectorNumElements();

	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

	  // Extend to natively supported kshift.
	  MVT WideOpVT = OpVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
	    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

	  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
	  // if necessary.
	  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    // May need to promote to a legal type.
	    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                     getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                     SubVec, Idx);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  MVT SubVecVT = SubVec.getSimpleValueType();
	  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

	  assert(IdxVal + SubVecNumElems <= NumElems &&
	         IdxVal % SubVecVT.getSizeInBits() == 0 &&
	         "Unexpected index value in INSERT_SUBVECTOR");

	  SDValue Undef = DAG.getUNDEF(WideOpVT);

	  if (IdxVal == 0) {
	    // Zero lower bits of the Vec
	    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
	                      ZeroIdx);
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	    // Merge them together, SubVec should be zero extended.
	    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                         getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                         SubVec, ZeroIdx);
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // From here on IdxVal != 0. Place SubVec in the low bits of a wide vector
	  // whose remaining bits are undef.
	  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                       Undef, SubVec, ZeroIdx);

	  if (Vec.isUndef()) {
	    assert(IdxVal != 0 && "Unexpected index");
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(IdxVal, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    assert(IdxVal != 0 && "Unexpected index");
	    // Shift the subvector to the top of the wide vector and then back down
	    // to its final position, leaving every bit outside it zero.
	    NumElems = WideOpVT.getVectorNumElements();
	    unsigned ShiftLeft = NumElems - SubVecNumElems;
	    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(ShiftLeft, dl, MVT::i8));
	    if (ShiftRight != 0)
	      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	                           DAG.getConstant(ShiftRight, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  // Simple case when we put subvector in the upper part
	  if (IdxVal + SubVecNumElems == NumElems) {
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(IdxVal, dl, MVT::i8));
	    if (SubVecNumElems * 2 == NumElems) {
	      // Special case, use legal zero extending insert_subvector. This allows
	      // isel to optimize when bits are known zero.
	      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                        Vec, ZeroIdx);
	    } else {
	      // Otherwise use explicit shifts to zero the bits.
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        Undef, Vec, ZeroIdx);
	      NumElems = WideOpVT.getVectorNumElements();
	      SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
	      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    }
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // Inserting into the middle is more complicated.

	  NumElems = WideOpVT.getVectorNumElements();

	  // Widen the vector if needed.
	  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
	  // Move the current value of the bits to be replaced to the lsbs.
	  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
	                   DAG.getConstant(IdxVal, dl, MVT::i8));
	  // Xor with the new bits.
	  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
	  // Shift to MSB, filling bottom bits with 0.
	  unsigned ShiftLeft = NumElems - SubVecNumElems;
	  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
	                   DAG.getConstant(ShiftLeft, dl, MVT::i8));
	  // Shift to the final position, filling upper bits with 0.
	  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
	                   DAG.getConstant(ShiftRight, dl, MVT::i8));
	  // Xor with original vector leaving the new value.
	  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
	  // Reduce to original width if needed.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
	/// instructions. This is used because creating CONCAT_VECTOR nodes of
	/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
	/// large BUILD_VECTORS. V1 lands at element 0 and V2 at element NumElems/2
	/// of a result of type VT.
	static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  SDValue Undef = DAG.getUNDEF(VT);
	  SDValue WithLower = insert128BitVector(Undef, V1, 0, DAG, dl);
	  const unsigned UpperIdx = NumElems / 2;
	  return insert128BitVector(WithLower, V2, UpperIdx, DAG, dl);
	}

	/// Concat two 256-bit vectors into a vector of type VT by inserting V1 at
	/// element 0 and V2 at element NumElems/2 of an undef vector.
	static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  SDValue Undef = DAG.getUNDEF(VT);
	  SDValue WithLower = insert256BitVector(Undef, V1, 0, DAG, dl);
	  const unsigned UpperIdx = NumElems / 2;
	  return insert256BitVector(WithLower, V2, UpperIdx, DAG, dl);
	}

	/// Returns a vector of specified type with all bits set.
	/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
	/// Then bitcast to their original type, ensuring they get CSE'd.
	static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected a 128/256/512-bit vector type");

	  APInt Ones = APInt::getAllOnesValue(32);
	  unsigned NumElts = VT.getSizeInBits() / 32;
	  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
	  return DAG.getBitcast(VT, Vec);
	}

	/// Builds a vector sign/zero extension of In to type VT.
	///
	/// Opc must be X86ISD::VSEXT or X86ISD::VZEXT. When both types are 128 bits
	/// wide, the generic *_EXTEND_VECTOR_INREG helpers are used. When both are
	/// wider than 128 bits, In is first narrowed to the low part that actually
	/// feeds the extension (never less than 128 bits).
	static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
	                              SelectionDAG &DAG) {
	  EVT InVT = In.getValueType();
	  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

	  if (VT.is128BitVector() && InVT.is128BitVector())
	    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
	                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

	  // For 256-bit vectors, we only need the lower (128-bit) input half.
	  // For 512-bit vectors, we only need the lower input half or quarter.
	  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
	    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
	    In = extractSubVector(In, 0, DAG, DL,
	                          std::max(128, (int)VT.getSizeInBits() / Scale));
	  }

	  return DAG.getNode(Opc, DL, VT, In);
	}

	/// Returns a vector_shuffle node for an unpackl operation on V1 and V2,
	/// using the binary low-half unpack mask for VT.
	static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Returns a vector_shuffle node for an unpackh operation on V1 and V2,
	/// using the binary high-half unpack mask for VT.
	static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Return a vector_shuffle of V2 with a zero vector (IsZero) or an undef
	/// vector (!IsZero). The low element of V2 is swizzled into that vector,
	/// landing at element Idx; every other lane keeps the zero/undef element.
	/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
	static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
	                                           bool IsZero,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  const MVT VT = V2.getSimpleValueType();
	  const SDLoc DL(V2);

	  SDValue V1;
	  if (IsZero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  else
	    V1 = DAG.getUNDEF(VT);

	  // Start from the identity mask over V1 ...
	  const int NumElems = VT.getVectorNumElements();
	  SmallVector<int, 16> MaskVec(NumElems);
	  for (int i = 0; i != NumElems; ++i)
	    MaskVec[i] = i;
	  // ... and put the low elt of V2 at the insertion idx.
	  if (Idx >= 0 && Idx < NumElems)
	    MaskVec[Idx] = NumElems;

	  return DAG.getVectorShuffle(VT, DL, V1, V2, MaskVec);
	}

	/// Strips any chain of BITCAST nodes from V and returns the first value
	/// that is not a bitcast (or V itself if it has no node).
	static SDValue peekThroughBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Strips BITCAST nodes from V for as long as the bitcast's operand has a
	/// single use, and returns the value reached.
	static SDValue peekThroughOneUseBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
	      return V;
	    SDValue Src = V.getOperand(0);
	    if (!Src.hasOneUse())
	      return V;
	    V = Src;
	  }
	}

	/// Returns the IR constant loaded by Op, or nullptr if there is none.
	///
	/// Looks through bitcasts to a load whose base pointer is a constant pool
	/// entry (optionally wrapped in X86ISD::Wrapper / X86ISD::WrapperRIP).
	/// Machine constant pool entries are rejected.
	static const Constant *getTargetConstantFromNode(SDValue Op) {
	  Op = peekThroughBitcasts(Op);

	  auto *Load = dyn_cast<LoadSDNode>(Op);
	  if (!Load)
	    return nullptr;

	  SDValue Ptr = Load->getBasePtr();
	  if (Ptr->getOpcode() == X86ISD::Wrapper ||
	      Ptr->getOpcode() == X86ISD::WrapperRIP)
	    Ptr = Ptr->getOperand(0);

	  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
	  if (!CNode || CNode->isMachineConstantPoolEntry())
	    return nullptr;

	  return dyn_cast<Constant>(CNode->getConstVal());
	}

	// Extract raw constant bits from constant pools.
	static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
	APInt &UndefElts,
	SmallVectorImpl<APInt> &EltBits,
	bool AllowWholeUndefs = true,
	bool AllowPartialUndefs = true) {
	assert(EltBits.empty() && "Expected an empty EltBits vector");

	Op = peekThroughBitcasts(Op);

	EVT VT = Op.getValueType();
	unsigned SizeInBits = VT.getSizeInBits();
	assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
	unsigned NumElts = SizeInBits / EltSizeInBits;

	// Bitcast a source array of element bits to the target size.
	auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
	unsigned NumSrcElts = UndefSrcElts.getBitWidth();
	unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
	assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
	"Constant bit sizes don't match");

	// Don't split if we don't allow undef bits.
	bool AllowUndefs = AllowWholeUndefs \|\| AllowPartialUndefs;
	if (UndefSrcElts.getBoolValue() && !AllowUndefs)
	return false;

	// If we're already the right size, don't bother bitcasting.
	if (NumSrcElts == NumElts) {
	UndefElts = UndefSrcElts;
	EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
	return true;
	}

	// Extract all the undef/constant element data and pack into single bitsets.
	APInt UndefBits(SizeInBits, 0);
	APInt MaskBits(SizeInBits, 0);

	for (unsigned i = 0; i != NumSrcElts; ++i) {
	unsigned BitOffset = i * SrcEltSizeInBits;
	if (UndefSrcElts[i])
	UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
	MaskBits.insertBits(SrcEltBits[i], BitOffset);
	}

	// Split the undef/constant single bitset data into the target elements.
	UndefElts = APInt(NumElts, 0);
	EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

	for (unsigned i = 0; i != NumElts; ++i) {
	unsigned BitOffset = i * EltSizeInBits;
	APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

	// Only treat an element as UNDEF if all bits are UNDEF.
	if (UndefEltBits.isAllOnesValue()) {
	if (!AllowWholeUndefs)
	return false;
	UndefElts.setBit(i);
	continue;
	}

	// If only some bits are UNDEF then treat them as zero (or bail if not
	// supported).
	if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
	return false;

	APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
	EltBits[i] = Bits.getZExtValue();
	}
	return true;
	};

	// Collect constant bits and insert into mask/undef bit masks.
	auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
	unsigned UndefBitIndex) {
	if (!Cst)
	return false;
	if (isa<UndefValue>(Cst)) {
	Undefs.setBit(UndefBitIndex);
	return true;
	}
	if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	Mask = CInt->getValue();
	return true;
	}
	if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	Mask = CFP->getValueAPF().bitcastToAPInt();
	return true;
	}
	return false;
	};

	// Handle UNDEFs.
	if (Op.isUndef()) {
	APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
	SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract scalar constant bits.
	if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
	APInt UndefSrcElts = APInt::getNullValue(1);
	SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract constant bits from build vector.
	if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	const SDValue &Src = Op.getOperand(i);
	if (Src.isUndef()) {
	UndefSrcElts.setBit(i);
	continue;
	}
	auto *Cst = cast<ConstantSDNode>(Src);
	SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
	}
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract constant bits from constant pool vector.
	if (auto *Cst = getTargetConstantFromNode(Op)) {
	Type *CstTy = Cst->getType();
	if (!CstTy->isVectorTy() \|\| (SizeInBits != CstTy->getPrimitiveSizeInBits()))
	return false;

	unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
	unsigned NumSrcElts = CstTy->getVectorNumElements();

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	for (unsigned i = 0; i != NumSrcElts; ++i)
	if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
	UndefSrcElts, i))
	return false;

	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract constant bits from a broadcasted constant pool scalar.
	if (Op.getOpcode() == X86ISD::VBROADCAST &&
	EltSizeInBits <= VT.getScalarSizeInBits()) {
	if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
	unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
	if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
	if (UndefSrcElts[0])
	UndefSrcElts.setBits(0, NumSrcElts);
	SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
	return CastBitData(UndefSrcElts, SrcEltBits);
	}
	}
	}

	// Extract a rematerialized scalar constant insertion.
	if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
	Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
	unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits;
	auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
	SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
	SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	return false;
	}

	/// Extracts the constant elements of a target shuffle mask operand as raw
	/// 64-bit values.
	///
	/// \param MaskNode          Node expected to hold the constant shuffle mask.
	/// \param MaskEltSizeInBits Width, in bits, of each mask element to extract.
	/// \param RawMask           Receives one zero-extended value per extracted
	///                          element; values are appended to any existing
	///                          contents.
	/// \returns true if the constant bits could be extracted. Returns false
	/// (leaving \p RawMask untouched) if getTargetConstantBitsFromNode fails,
	/// which includes any whole or partial UNDEF element since both are
	/// disallowed here.
	static bool getTargetShuffleMaskIndices(SDValue MaskNode,
	                                        unsigned MaskEltSizeInBits,
	                                        SmallVectorImpl<uint64_t> &RawMask) {
	  APInt UndefElts;
	  SmallVector<APInt, 64> EltBits;

	  // Extract the raw target constant bits.
	  // FIXME: We currently don't support UNDEF bits or mask entries.
	  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
	                                     EltBits, /* AllowWholeUndefs */ false,
	                                     /* AllowPartialUndefs */ false))
	    return false;

	  // Insert the extracted elements into the mask. Iterate by reference so
	  // that each APInt is not copied just to read its value.
	  for (const APInt &Elt : EltBits)
	    RawMask.push_back(Elt.getZExtValue());

	  return true;
	}

	/// Builds the shuffle mask equivalent to a PACKSS/PACKUS truncation of
	/// \p VT, appending it to \p Mask (which must be empty on entry).
	///
	/// For every 128-bit lane the mask selects the even-indexed elements of that
	/// lane twice: first unbiased, then biased by the element count of \p VT so
	/// that they refer to the second input. When \p Unary is set the bias is
	/// zero and both halves refer to the same input.
	///
	/// Note: Saturation is not modelled here, so callers must have verified the
	/// inputs beforehand.
	static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                  bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const unsigned TotalElts = VT.getVectorNumElements();
	  const unsigned LaneCount = VT.getSizeInBits() / 128;
	  const unsigned EltsPerLane = 128 / VT.getScalarSizeInBits();

	  // Bias applied to the first and to the second half of each output lane.
	  const unsigned HalfBias[2] = {0, Unary ? 0 : TotalElts};

	  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
	    const unsigned LaneBase = Lane * EltsPerLane;
	    for (unsigned Bias : HalfBias)
	      for (unsigned Elt = 0; Elt != EltsPerLane; Elt += 2)
	        Mask.push_back(Elt + LaneBase + Bias);
	  }
	}

	/// Calculates the shuffle mask corresponding to the target-specific opcode.
	/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
	/// operands in \p Ops, and returns true.
	/// Sets \p IsUnary to true if only one source is used. Note that this will set
	/// IsUnary for shuffles which use a single input multiple times, and in those
	/// cases it will adjust the mask to only have indices within that single input.
	/// It is an error to call this with non-empty Mask/Ops vectors.
	///
	/// \param N                 The target shuffle node to decode.
	/// \param VT                The value type the shuffle is decoded for.
	/// \param AllowSentinelZero If false, decoding fails (returns false) when the
	///                          decoded mask contains any SM_SentinelZero entry.
	/// \param Ops               Receives the shuffle source operands.
	/// \param Mask              Receives the decoded shuffle mask.
	/// \param IsUnary           Set to true when the shuffle reads one source.
	///
	/// Note that on a failed decode \p Ops, \p Mask and \p IsUnary may already
	/// have been modified by the opcode-specific code.
	static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
	                                 SmallVectorImpl<SDValue> &Ops,
	                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
	  unsigned NumElems = VT.getVectorNumElements();
	  SDValue ImmN;

	  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
	  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

	  IsUnary = false;
	  // A "fake unary" shuffle is a two-operand shuffle whose operands are the
	  // same node; its mask is re-mapped into the first input further below.
	  bool IsFakeUnary = false;
	  switch(N->getOpcode()) {
	  case X86ISD::BLENDI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::SHUFP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::INSERTPS:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::EXTRQI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    // Only decodable when both the length and index operands are constants;
	    // otherwise Mask stays empty and the decode fails below.
	    if (isa<ConstantSDNode>(N->getOperand(1)) &&
	        isa<ConstantSDNode>(N->getOperand(2))) {
	      int BitLen = N->getConstantOperandVal(1);
	      int BitIdx = N->getConstantOperandVal(2);
	      DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
	      IsUnary = true;
	    }
	    break;
	  case X86ISD::INSERTQI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    // Only decodable when both the length and index operands are constants;
	    // otherwise Mask stays empty and the decode fails below.
	    if (isa<ConstantSDNode>(N->getOperand(2)) &&
	        isa<ConstantSDNode>(N->getOperand(3))) {
	      int BitLen = N->getConstantOperandVal(2);
	      int BitIdx = N->getConstantOperandVal(3);
	      DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
	      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    }
	    break;
	  case X86ISD::UNPCKH:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeUNPCKHMask(VT, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::UNPCKL:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeUNPCKLMask(VT, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::MOVHLPS:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeMOVHLPSMask(NumElems, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::MOVLHPS:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeMOVLHPSMask(NumElems, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::PALIGNR:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    // The shuffle sources are reported in reverse order: operand 1 first.
	    Ops.push_back(N->getOperand(1));
	    Ops.push_back(N->getOperand(0));
	    break;
	  case X86ISD::VSHLDQ:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands() - 1);
	    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VSRLDQ:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands() - 1);
	    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFD:
	  case X86ISD::VPERMILPI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFHW:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFLW:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VZEXT_MOVL:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeZeroMoveLowMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VBROADCAST: {
	    SDValue N0 = N->getOperand(0);
	    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
	    // add the pre-extracted value to the Ops vector.
	    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        N0.getOperand(0).getValueType() == VT &&
	        N0.getConstantOperandVal(1) == 0)
	      Ops.push_back(N0.getOperand(0));

	    // We only decode broadcasts of same-sized vectors, unless the broadcast
	    // came from an extract from the original width. If we found one, we
	    // pushed it to the Ops vector above.
	    if (N0.getValueType() == VT || !Ops.empty()) {
	      DecodeVectorBroadcast(VT, Mask);
	      IsUnary = true;
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMILPV: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    IsUnary = true;
	    SDValue MaskNode = N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    SmallVector<uint64_t, 32> RawMask;
	    // Prefer the raw constant bits; fall back to the constant pool entry.
	    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	      DecodeVPERMILPMask(VT, RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMILPMask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::PSHUFB: {
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    IsUnary = true;
	    SDValue MaskNode = N->getOperand(1);
	    SmallVector<uint64_t, 32> RawMask;
	    // Prefer the raw constant bits; fall back to the constant pool entry.
	    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	      DecodePSHUFBMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodePSHUFBMask(C, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVSS:
	  case X86ISD::MOVSD:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
	    break;
	  case X86ISD::VPERM2X128:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::MOVSLDUP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeMOVSLDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVSHDUP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeMOVSHDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVDDUP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeMOVDDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVLPD:
	  case X86ISD::MOVLPS:
	    // Not yet implemented
	    return false;
	  case X86ISD::VPERMIL2: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    SDValue MaskNode = N->getOperand(2);
	    SDValue CtrlNode = N->getOperand(3);
	    // The control operand must be a constant for the mask to be decodable.
	    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
	      unsigned CtrlImm = CtrlOp->getZExtValue();
	      SmallVector<uint64_t, 32> RawMask;
	      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
	        break;
	      }
	      if (auto *C = getTargetConstantFromNode(MaskNode)) {
	        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
	        break;
	      }
	    }
	    return false;
	  }
	  case X86ISD::VPPERM: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    SDValue MaskNode = N->getOperand(2);
	    SmallVector<uint64_t, 32> RawMask;
	    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	      DecodeVPPERMMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPPERMMask(C, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMV: {
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    IsUnary = true;
	    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
	    Ops.push_back(N->getOperand(1));
	    SDValue MaskNode = N->getOperand(0);
	    SmallVector<uint64_t, 32> RawMask;
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	      DecodeVPERMVMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMVMask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMV3: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
	    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
	    Ops.push_back(N->getOperand(0));
	    Ops.push_back(N->getOperand(2));
	    SDValue MaskNode = N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMIV3: {
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
	    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
	    Ops.push_back(N->getOperand(1));
	    Ops.push_back(N->getOperand(2));
	    SDValue MaskNode = N->getOperand(0);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  default: llvm_unreachable("unknown target shuffle node");
	  }

	  // Empty mask indicates the decode failed.
	  if (Mask.empty())
	    return false;

	  // Check if we're getting a shuffle mask with zero'd elements.
	  if (!AllowSentinelZero)
	    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
	      return false;

	  // If we have a fake unary shuffle, the shuffle mask is spread across two
	  // inputs that are actually the same node. Re-map the mask to always point
	  // into the first input.
	  if (IsFakeUnary)
	    for (int &M : Mask)
	      if (M >= (int)Mask.size())
	        M -= Mask.size();

	  // If we didn't already add operands in the opcode-specific code, default to
	  // adding 1 or 2 operands starting at 0.
	  if (Ops.empty()) {
	    Ops.push_back(N->getOperand(0));
	    if (!IsUnary || IsFakeUnary)
	      Ops.push_back(N->getOperand(1));
	  }

	  return true;
	}

	/// Check a target shuffle mask's inputs to see if we can set any values to
	/// SM_SentinelZero - this is for elements that are known to be zero
	/// (not just zeroable) from their inputs.
	///
	/// \param N    The node to decode; must be a target shuffle for the decode to
	///             succeed.
	/// \param Mask Receives the decoded mask (via getTargetShuffleMask, with
	///             sentinel zeros allowed). Entries that reference UNDEF or
	///             known-zero source elements are then rewritten to
	///             SM_SentinelUndef / SM_SentinelZero.
	/// \param Ops  Receives the shuffle operands reported by
	///             getTargetShuffleMask.
	///
	/// Returns true if the target shuffle mask was decoded.
	static bool setTargetShuffleZeroElements(SDValue N,
	SmallVectorImpl<int> &Mask,
	SmallVectorImpl<SDValue> &Ops) {
	bool IsUnary;
	if (!isTargetShuffle(N.getOpcode()))
	return false;

	MVT VT = N.getSimpleValueType();
	if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
	return false;

	// For a unary shuffle both inputs alias the first operand.
	SDValue V1 = Ops[0];
	SDValue V2 = IsUnary ? V1 : Ops[1];

	// Inspect the inputs through any bitcasts; Ops itself is left untouched.
	V1 = peekThroughBitcasts(V1);
	V2 = peekThroughBitcasts(V2);

	assert((VT.getSizeInBits() % Mask.size()) == 0 &&
	"Illegal split of shuffle value type");
	unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

	// Extract known constant input data.
	// The trailing (true, false) arguments allow wholly-UNDEF elements but not
	// partially-UNDEF ones.
	APInt UndefSrcElts[2];
	SmallVector<APInt, 32> SrcEltBits[2];
	bool IsSrcConstant[2] = {
	getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
	SrcEltBits[0], true, false),
	getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
	SrcEltBits[1], true, false)};

	for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	int M = Mask[i];

	// Already decoded as SM_SentinelZero / SM_SentinelUndef.
	if (M < 0)
	continue;

	// Determine shuffle input and normalize the mask.
	// SrcIdx indexes the per-source arrays above (assumes M < 2 * Size so that
	// it is 0 or 1 - TODO confirm); M becomes the element index within V.
	unsigned SrcIdx = M / Size;
	SDValue V = M < Size ? V1 : V2;
	M %= Size;

	// We are referencing an UNDEF input.
	if (V.isUndef()) {
	Mask[i] = SM_SentinelUndef;
	continue;
	}

	// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
	// TODO: We currently only set UNDEF for integer types - floats use the same
	// registers as vectors and many of the scalar folded loads rely on the
	// SCALAR_TO_VECTOR pattern.
	if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	(Size % V.getValueType().getVectorNumElements()) == 0) {
	// Scale is the number of mask elements covering one element of V, so Idx
	// is the element of V that mask index M falls into.
	int Scale = Size / V.getValueType().getVectorNumElements();
	int Idx = M / Scale;
	if (Idx != 0 && !VT.isFloatingPoint())
	Mask[i] = SM_SentinelUndef;
	else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
	Mask[i] = SM_SentinelZero;
	continue;
	}

	// Attempt to extract from the source's constant bits.
	if (IsSrcConstant[SrcIdx]) {
	if (UndefSrcElts[SrcIdx][M])
	Mask[i] = SM_SentinelUndef;
	else if (SrcEltBits[SrcIdx][M] == 0)
	Mask[i] = SM_SentinelZero;
	}
	}

	assert(VT.getVectorNumElements() == Mask.size() &&
	"Different mask size from vector size!");
	return true;
	}

	// Attempt to decode ops that could be represented as a shuffle mask.
	// The decoded shuffle mask may contain a different number of elements to the
	// destination value type.
	//
	// N    - the node to decode (AND/ANDNP, SCALAR_TO_VECTOR, PINSRB/PINSRW,
	//        PACKSS/PACKUS, VSHLI/VSRLI, ZERO_EXTEND_VECTOR_INREG/VZEXT).
	// Mask - cleared on entry; receives the decoded mask on success.
	// Ops  - cleared on entry; receives the shuffle inputs on success.
	// DAG  - used for the sign-bit / known-zero queries of the PACK cases.
	//
	// Returns true if N was decoded, false otherwise. On a false return Mask and
	// Ops may hold partial data.
	static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
	                               SmallVectorImpl<SDValue> &Ops,
	                               SelectionDAG &DAG) {
	  Mask.clear();
	  Ops.clear();

	  MVT VT = N.getSimpleValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumSizeInBits = VT.getSizeInBits();
	  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
	         "Expected byte aligned value types");

	  unsigned Opcode = N.getOpcode();
	  switch (Opcode) {
	  case ISD::AND:
	  case X86ISD::ANDNP: {
	    // Attempt to decode as a per-byte mask.
	    APInt UndefElts;
	    SmallVector<APInt, 32> EltBits;
	    SDValue N0 = N.getOperand(0);
	    SDValue N1 = N.getOperand(1);
	    bool IsAndN = (X86ISD::ANDNP == Opcode);
	    // The byte value of the constant operand that zeroes the result byte:
	    // 0x00 for AND, 0xFF for ANDNP (whose constant operand is operand 0).
	    uint64_t ZeroMask = IsAndN ? 255 : 0;
	    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
	      return false;
	    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
	      if (UndefElts[i]) {
	        Mask.push_back(SM_SentinelUndef);
	        continue;
	      }
	      // Only all-zeros / all-ones bytes can be expressed as a shuffle.
	      uint64_t ByteBits = EltBits[i].getZExtValue();
	      if (ByteBits != 0 && ByteBits != 255)
	        return false;
	      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
	    }
	    Ops.push_back(IsAndN ? N1 : N0);
	    return true;
	  }
	  case ISD::SCALAR_TO_VECTOR: {
	    // Match against a scalar_to_vector of an extract from a vector,
	    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
	    SDValue N0 = N.getOperand(0);
	    SDValue SrcExtract;

	    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	         N0.getOperand(0).getValueType() == VT) ||
	        (N0.getOpcode() == X86ISD::PEXTRW &&
	         N0.getOperand(0).getValueType() == MVT::v8i16) ||
	        (N0.getOpcode() == X86ISD::PEXTRB &&
	         N0.getOperand(0).getValueType() == MVT::v16i8)) {
	      SrcExtract = N0;
	    }

	    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
	      return false;

	    SDValue SrcVec = SrcExtract.getOperand(0);
	    EVT SrcVT = SrcVec.getValueType();
	    unsigned NumSrcElts = SrcVT.getVectorNumElements();
	    unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
	    // Number of source-sized elements that are implicitly zero because the
	    // extracted scalar is zero-extended into the wider destination element.
	    // Clamp to zero when the source element is at least as wide as the
	    // destination element; the unsigned subtraction would otherwise wrap
	    // around and request an enormous number of mask entries.
	    unsigned NumZeros = NumBitsPerElt > NumSrcBitsPerElt
	                            ? (NumBitsPerElt / NumSrcBitsPerElt) - 1
	                            : 0;

	    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
	    if (NumSrcElts <= SrcIdx)
	      return false;

	    // Mask layout: the extracted element, its zero extension, then UNDEF up
	    // to the source element count.
	    Ops.push_back(SrcVec);
	    Mask.push_back(SrcIdx);
	    Mask.append(NumZeros, SM_SentinelZero);
	    Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
	    return true;
	  }
	  case X86ISD::PINSRB:
	  case X86ISD::PINSRW: {
	    SDValue InVec = N.getOperand(0);
	    SDValue InScl = N.getOperand(1);
	    uint64_t InIdx = N.getConstantOperandVal(2);
	    assert(InIdx < NumElts && "Illegal insertion index");

	    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
	    if (X86::isZeroNode(InScl)) {
	      Ops.push_back(InVec);
	      for (unsigned i = 0; i != NumElts; ++i)
	        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
	      return true;
	    }

	    // Attempt to recognise a PINSR(PEXTR) shuffle pattern.
	    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
	    unsigned ExOp =
	        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
	    if (InScl.getOpcode() != ExOp)
	      return false;

	    SDValue ExVec = InScl.getOperand(0);
	    uint64_t ExIdx = InScl.getConstantOperandVal(1);
	    assert(ExIdx < NumElts && "Illegal extraction index");
	    // Two-input shuffle: identity on InVec except for the inserted element,
	    // which is taken from element ExIdx of the second input (ExVec).
	    Ops.push_back(InVec);
	    Ops.push_back(ExVec);
	    for (unsigned i = 0; i != NumElts; ++i)
	      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
	    return true;
	  }
	  case X86ISD::PACKSS:
	  case X86ISD::PACKUS: {
	    SDValue N0 = N.getOperand(0);
	    SDValue N1 = N.getOperand(1);
	    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
	           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
	           "Unexpected input value type");

	    // If we know input saturation won't happen we can treat this
	    // as a truncation shuffle.
	    if (Opcode == X86ISD::PACKSS) {
	      // Each input needs more sign bits than the result element width.
	      if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
	          (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
	        return false;
	    } else {
	      // The upper half of every input element must be known zero.
	      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
	      if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
	          (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
	        return false;
	    }

	    bool IsUnary = (N0 == N1);

	    Ops.push_back(N0);
	    if (!IsUnary)
	      Ops.push_back(N1);

	    createPackShuffleMask(VT, Mask, IsUnary);
	    return true;
	  }
	  case X86ISD::VSHLI:
	  case X86ISD::VSRLI: {
	    uint64_t ShiftVal = N.getConstantOperandVal(1);
	    // Out of range bit shifts are guaranteed to be zero.
	    if (NumBitsPerElt <= ShiftVal) {
	      Mask.append(NumElts, SM_SentinelZero);
	      return true;
	    }

	    // We can only decode 'whole byte' bit shifts as shuffles.
	    if ((ShiftVal % 8) != 0)
	      break;

	    uint64_t ByteShift = ShiftVal / 8;
	    unsigned NumBytes = NumSizeInBits / 8;
	    unsigned NumBytesPerElt = NumBitsPerElt / 8;
	    Ops.push_back(N.getOperand(0));

	    // Clear mask to all zeros and insert the shifted byte indices.
	    Mask.append(NumBytes, SM_SentinelZero);

	    if (X86ISD::VSHLI == Opcode) {
	      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	          Mask[i + j] = i + j - ByteShift;
	    } else {
	      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	          Mask[i + j - ByteShift] = i + j;
	    }
	    return true;
	  }
	  case ISD::ZERO_EXTEND_VECTOR_INREG:
	  case X86ISD::VZEXT: {
	    // TODO - add support for VPMOVZX with smaller input vector types.
	    SDValue Src = N.getOperand(0);
	    MVT SrcVT = Src.getSimpleValueType();
	    if (NumSizeInBits != SrcVT.getSizeInBits())
	      break;
	    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
	    Ops.push_back(Src);
	    return true;
	  }
	  }

	  return false;
	}

	/// Drops shuffle inputs that no mask element refers to and renumbers the
	/// remaining mask indices to match. Mask elements that select from an UNDEF
	/// input are first rewritten to SM_SentinelUndef, so UNDEF inputs always end
	/// up removed.
	///
	/// Each input owns a window of Mask.size() consecutive mask indices; the
	/// window of the input under inspection starts right after the inputs kept
	/// so far.
	static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
	                                              SmallVectorImpl<int> &Mask) {
	  const int MaskWidth = Mask.size();
	  SmallVector<SDValue, 16> UsedInputs;

	  for (const SDValue &Input : Inputs) {
	    const int WindowBegin = UsedInputs.size() * MaskWidth;
	    const int WindowEnd = WindowBegin + MaskWidth;
	    auto SelectsThisInput = [WindowBegin, WindowEnd](int Idx) {
	      return (WindowBegin <= Idx) && (Idx < WindowEnd);
	    };

	    // References to an UNDEF input become UNDEF mask elements.
	    if (Input.isUndef()) {
	      for (int &M : Mask) {
	        if (SelectsThisInput(M))
	          M = SM_SentinelUndef;
	      }
	    }

	    // Keep the input if anything still selects from it.
	    if (any_of(Mask, SelectsThisInput)) {
	      UsedInputs.push_back(Input);
	      continue;
	    }

	    // Otherwise the input is dropped: slide every index at or beyond its
	    // window down by one window.
	    for (int &M : Mask) {
	      if (M >= WindowBegin)
	        M -= MaskWidth;
	    }
	  }

	  Inputs = UsedInputs;
	}

	/// Decodes \p Op into a shuffle mask and its inputs, then prunes the inputs.
	///
	/// The decode is attempted with setTargetShuffleZeroElements first (which
	/// also fills in SM_SentinelUndef / SM_SentinelZero entries) and, if that
	/// fails, with getFauxShuffleMask. On success the unused inputs are removed
	/// and the mask is renumbered by resolveTargetShuffleInputsAndMask, which may
	/// leave a unary shuffle.
	/// Returns true if the target shuffle mask was decoded.
	static bool resolveTargetShuffleInputs(SDValue Op,
	                                       SmallVectorImpl<SDValue> &Inputs,
	                                       SmallVectorImpl<int> &Mask,
	                                       SelectionDAG &DAG) {
	  const bool Decoded = setTargetShuffleZeroElements(Op, Mask, Inputs) ||
	                       getFauxShuffleMask(Op, Mask, Inputs, DAG);
	  if (Decoded)
	    resolveTargetShuffleInputsAndMask(Inputs, Mask);
	  return Decoded;
	}

	/// Returns the scalar element that will make up the ith
	/// element of the result of the vector shuffle.
	///
	/// \param N     Node producing the vector to look into.
	/// \param Index Index of the requested element within N's result.
	/// \param DAG   Used to create UNDEF / zero constants for sentinel elements.
	/// \param Depth Current recursion depth; the search gives up at depth 6.
	///
	/// \returns the scalar value, an UNDEF or zero constant where the shuffle
	/// mask says so, or an empty SDValue if the element could not be determined.
	static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
	                                   unsigned Depth) {
	  if (Depth == 6)
	    return SDValue();  // Limit search depth.

	  SDValue V = SDValue(N, 0);
	  EVT VT = V.getValueType();
	  unsigned Opcode = V.getOpcode();

	  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
	  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
	    int Elt = SV->getMaskElt(Index);

	    if (Elt < 0)
	      return DAG.getUNDEF(VT.getVectorElementType());

	    // Indices below NumElems select from operand 0, the rest from operand 1.
	    unsigned NumElems = VT.getVectorNumElements();
	    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
	                                         : SV->getOperand(1);
	    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
	  }

	  // Recurse into target specific vector shuffles to find scalars.
	  if (isTargetShuffle(Opcode)) {
	    MVT ShufVT = V.getSimpleValueType();
	    MVT ShufSVT = ShufVT.getVectorElementType();
	    int NumElems = (int)ShufVT.getVectorNumElements();
	    SmallVector<int, 16> ShuffleMask;
	    SmallVector<SDValue, 16> ShuffleOps;
	    bool IsUnary;

	    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
	      return SDValue();

	    int Elt = ShuffleMask[Index];
	    if (Elt == SM_SentinelZero)
	      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
	                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
	    if (Elt == SM_SentinelUndef)
	      return DAG.getUNDEF(ShufSVT);

	    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
	    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
	    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
	                               Depth+1);
	  }

	  // Actual nodes that may contain scalar elements
	  if (Opcode == ISD::BITCAST) {
	    // Only look through bitcasts between vectors with the same element
	    // count, so that Index still names the same element position.
	    V = V.getOperand(0);
	    EVT SrcVT = V.getValueType();
	    unsigned NumElems = VT.getVectorNumElements();

	    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
	      return SDValue();
	  }

	  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
	    return (Index == 0) ? V.getOperand(0)
	                        : DAG.getUNDEF(VT.getVectorElementType());

	  if (V.getOpcode() == ISD::BUILD_VECTOR)
	    return V.getOperand(Index);

	  return SDValue();
	}

	// Use PINSRB/PINSRW/PINSRD to create a build vector.
	//
	// Op         - the build vector; its operands are the element values.
	// NonZeros   - bitmask; bit i set means operand i must be inserted.
	// NumNonZero - not used by this function.
	// NumZero    - when non-zero, the insertions start from a zero vector.
	//
	// Returns the chain of INSERT_VECTOR_ELT nodes, or an empty SDValue if
	// NonZeros selects no element of the vector.
	static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
	                                        unsigned NumNonZero, unsigned NumZero,
	                                        SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
	         "Illegal vector insertion");

	  SDLoc dl(Op);
	  SDValue V;
	  bool First = true;

	  for (unsigned i = 0; i < NumElts; ++i) {
	    bool IsNonZero = (NonZeros & (1 << i)) != 0;
	    if (!IsNonZero)
	      continue;

	    // If the build vector contains zeros or our first insertion is not the
	    // first index then insert into zero vector to break any register
	    // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
	    if (First) {
	      First = false;
	      if (NumZero || 0 != i)
	        V = getZeroVector(VT, Subtarget, DAG, dl);
	      else {
	        assert(0 == i && "Expected insertion into zero-index");
	        // Element 0 is materialised through a v4i32 scalar move and then
	        // bitcast back to VT, so no explicit insert is needed for it.
	        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	        V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	        V = DAG.getBitcast(VT, V);
	        continue;
	      }
	    }
	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
	                    DAG.getIntPtrConstant(i, dl));
	  }

	  return V;
	}

	/// Custom lower build_vector of v16i8.
	///
	/// \param Op         The build vector; its operands are the byte values.
	/// \param NonZeros   Bitmask; bit i set means operand i is treated as
	///                   non-zero and gets inserted.
	/// \param NumNonZero Number of non-zero elements; with more than 8 and no
	///                   SSE4.1 the lowering is declined.
	/// \param NumZero    When non-zero, the result starts from a zero vector
	///                   instead of UNDEF.
	///
	/// \returns the lowered v16i8 value, or an empty SDValue when declined.
	static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
	unsigned NumNonZero, unsigned NumZero,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	if (NumNonZero > 8 && !Subtarget.hasSSE41())
	return SDValue();

	// SSE4.1 - use PINSRB to insert each byte directly.
	if (Subtarget.hasSSE41())
	return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	Subtarget);

	SDLoc dl(Op);
	SDValue V;
	bool First = true;

	// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
	for (unsigned i = 0; i < 16; ++i) {
	bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
	// On the first non-zero byte, create the v8i16 vector to insert into.
	if (ThisIsNonZero && First) {
	if (NumZero)
	V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
	else
	V = DAG.getUNDEF(MVT::v8i16);
	First = false;
	}

	// Pairs are emitted at the odd index, once both bytes have been seen.
	if ((i & 1) != 0) {
	// FIXME: Investigate extending to i32 instead of just i16.
	// FIXME: Investigate combining the first 4 bytes as a i32 instead.
	SDValue ThisElt, LastElt;
	bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
	if (LastIsNonZero) {
	LastElt =
	DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
	}
	// Build the i16: the odd byte goes into the high 8 bits, the even byte
	// (if non-zero) is OR'd into the low 8 bits.
	if (ThisIsNonZero) {
	ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
	ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
	DAG.getConstant(8, dl, MVT::i8));
	if (LastIsNonZero)
	ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
	} else
	ThisElt = LastElt;

	// ThisElt is empty when both bytes of the pair are zero; skip the insert.
	if (ThisElt) {
	if (1 == i) {
	// The first pair replaces V with a v4i32 scalar move (zero-extended
	// when the vector contains zeros) bitcast to v8i16, instead of
	// inserting into the vector created above.
	V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
	: DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	V = DAG.getBitcast(MVT::v8i16, V);
	} else {
	V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
	DAG.getIntPtrConstant(i / 2, dl));
	}
	}
	}
	}

	// NOTE(review): V is still empty here if NonZeros has none of its low 16
	// bits set - presumably callers guarantee at least one; confirm.
	return DAG.getBitcast(MVT::v16i8, V);
	}

	/// Custom lowering for a v8i16 build_vector.
	///
	/// Declines (returns an empty SDValue) when more than four elements are
	/// non-zero and SSE4.1 is unavailable; otherwise every non-zero element is
	/// inserted individually through LowerBuildVectorAsInsert.
	static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  const bool FewEnoughInserts = NumNonZero <= 4;
	  if (FewEnoughInserts || Subtarget.hasSSE41()) {
	    // Insert each element directly.
	    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	                                    Subtarget);
	  }

	  return SDValue();
	}

	/// Custom lower build_vector of v4i32 or v4f32.
	///
	/// Every operand that is neither undef nor a zero node must be an
	/// EXTRACT_VECTOR_ELT with a constant index taken from a 128-bit vector;
	/// otherwise an empty SDValue is returned. Two lowerings are attempted:
	///  1. If each such operand at position i extracts lane i of one common
	///     source vector, emit a vector shuffle of that source with a zero
	///     vector (a "blend with zero").
	///  2. Otherwise, with SSE4.1 only, emit a single X86ISD::INSERTPS when one
	///     operand can be inserted into a vector that already provides all the
	///     remaining non-zeroable lanes in place.
	/// Returns an empty SDValue when neither form applies.
	static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Find all zeroable elements.
	std::bitset<4> Zeroable;
	for (int i=0; i < 4; ++i) {
	SDValue Elt = Op->getOperand(i);
	Zeroable[i] = (Elt.isUndef() \|\| X86::isZeroNode(Elt));
	}
	assert(Zeroable.size() - Zeroable.count() > 1 &&
	"We expect at least two non-zero elements!");

	// We only know how to deal with build_vector nodes where elements are either
	// zeroable or extract_vector_elt with constant index.
	SDValue FirstNonZero;
	// Only assigned together with FirstNonZero; the assert after this loop
	// guarantees it has been written before it is read.
	unsigned FirstNonZeroIdx;
	for (unsigned i=0; i < 4; ++i) {
	if (Zeroable[i])
	continue;
	SDValue Elt = Op->getOperand(i);
	if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Elt.getOperand(1)))
	return SDValue();
	// Make sure that this node is extracting from a 128-bit vector.
	MVT VT = Elt.getOperand(0).getSimpleValueType();
	if (!VT.is128BitVector())
	return SDValue();
	if (!FirstNonZero.getNode()) {
	FirstNonZero = Elt;
	FirstNonZeroIdx = i;
	}
	}

	assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
	// V1 is the vector the first non-zeroable operand is extracted from; VT is
	// that vector's type (not necessarily the type of Op).
	SDValue V1 = FirstNonZero.getOperand(0);
	MVT VT = V1.getSimpleValueType();

	// See if this build_vector can be lowered as a blend with zero.
	SDValue Elt;
	unsigned EltMaskIdx, EltIdx;
	int Mask[4];
	// The loop runs to completion (EltIdx == 4) only if every non-zeroable
	// operand extracts lane EltIdx of V1. On an early break, Elt, EltIdx and
	// EltMaskIdx describe the first operand that violated the pattern.
	for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
	if (Zeroable[EltIdx]) {
	// The zero vector will be on the right hand side.
	Mask[EltIdx] = EltIdx+4;
	continue;
	}

	Elt = Op->getOperand(EltIdx);
	// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
	EltMaskIdx = Elt.getConstantOperandVal(1);
	if (Elt.getOperand(0) != V1 \|\| EltMaskIdx != EltIdx)
	break;
	Mask[EltIdx] = EltIdx;
	}

	if (EltIdx == 4) {
	// Let the shuffle legalizer deal with blend operations.
	SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
	// NOTE(review): VT was taken from V1 above and V1 has not been reassigned
	// since, so this comparison appears to be always false.
	if (V1.getSimpleValueType() != VT)
	V1 = DAG.getBitcast(VT, V1);
	return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
	}

	// See if we can lower this build_vector to a INSERTPS.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// V2 is the source of the element to insert (the operand that broke the
	// blend pattern above).
	SDValue V2 = Elt.getOperand(0);
	// If that operand is also the first non-zeroable one, V1 was derived from
	// it and is not a valid base vector; clear it so the loop below picks the
	// base from the remaining operands.
	if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
	V1 = SDValue();

	// Every remaining non-zeroable operand i must extract lane i of the base
	// vector V1.
	bool CanFold = true;
	for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
	if (Zeroable[i])
	continue;

	SDValue Current = Op->getOperand(i);
	SDValue SrcVector = Current->getOperand(0);
	if (!V1.getNode())
	V1 = SrcVector;
	CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
	}

	if (!CanFold)
	return SDValue();

	assert(V1.getNode() && "Expected at least two non-zero elements!");
	if (V1.getSimpleValueType() != MVT::v4f32)
	V1 = DAG.getBitcast(MVT::v4f32, V1);
	if (V2.getSimpleValueType() != MVT::v4f32)
	V2 = DAG.getBitcast(MVT::v4f32, V2);

	// Ok, we can emit an INSERTPS instruction.
	unsigned ZMask = Zeroable.to_ulong();

	// Immediate layout: bits [7:6] = lane read from V2 (EltMaskIdx),
	// bits [5:4] = lane written in the result (EltIdx), bits [3:0] = lanes to
	// zero (ZMask).
	unsigned InsertPSMask = EltMaskIdx << 6 \| EltIdx << 4 \| ZMask;
	assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	SDLoc DL(Op);
	SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
	DAG.getIntPtrConstant(InsertPSMask, DL));
	return DAG.getBitcast(VT, Result);
	}

	/// Build a whole-register logical byte shift of a 128-bit vector.
	///
	/// \p SrcOp is reinterpreted as v16i8, shifted left (\p isLeft) or right by
	/// \p NumBits / 8 bytes using X86ISD::VSHLDQ / X86ISD::VSRLDQ, and the result
	/// is bitcast back to \p VT. \p NumBits must be a multiple of 8.
	static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
	                         SelectionDAG &DAG, const TargetLowering &TLI,
	                         const SDLoc &dl) {
	  assert(VT.is128BitVector() && "Unknown type for VShift");

	  // The shift itself is always carried out on the byte-vector view.
	  const MVT ByteVecVT = MVT::v16i8;
	  unsigned ShiftOpcode;
	  if (isLeft)
	    ShiftOpcode = X86ISD::VSHLDQ;
	  else
	    ShiftOpcode = X86ISD::VSRLDQ;

	  SDValue AsBytes = DAG.getBitcast(ByteVecVT, SrcOp);
	  MVT AmountVT = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
	  SDValue ByteCount = DAG.getConstant(NumBits / 8, dl, AmountVT);
	  SDValue Shifted =
	      DAG.getNode(ShiftOpcode, dl, ByteVecVT, AsBytes, ByteCount);
	  return DAG.getBitcast(VT, Shifted);
	}

	/// Try to turn a splat of a scalar stack load into a full-width vector load
	/// followed by a splat shuffle.
	///
	/// Only normal, non-volatile i32/f32 loads whose address is a frame index
	/// (optionally plus a constant offset) are handled. The frame object's
	/// alignment may be raised to the vector size as a side effect. Returns an
	/// empty SDValue when the transformation does not apply.
	static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
	                                      SelectionDAG &DAG) {
	  // Check if the scalar load can be widened into a vector load. And if
	  // the address is "base + cst" see if the cst can be "absorbed" into
	  // the shuffle mask.
	  LoadSDNode *Load = dyn_cast<LoadSDNode>(SrcOp);
	  if (!Load)
	    return SDValue();

	  SDValue Addr = Load->getBasePtr();
	  if (!ISD::isNormalLoad(Load))
	    return SDValue();
	  if (Load->isVolatile())
	    return SDValue();

	  EVT ScalarVT = Load->getValueType(0);
	  if (ScalarVT != MVT::i32 && ScalarVT != MVT::f32)
	    return SDValue();

	  // Decompose the address into a frame index and a constant byte offset.
	  int FrameIdx = -1;
	  int64_t ByteOffset = 0;
	  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Addr)) {
	    FrameIdx = FINode->getIndex();
	    ByteOffset = 0;
	  } else if (DAG.isBaseWithConstantOffset(Addr) &&
	             isa<FrameIndexSDNode>(Addr.getOperand(0))) {
	    FrameIdx = cast<FrameIndexSDNode>(Addr.getOperand(0))->getIndex();
	    ByteOffset = Addr.getConstantOperandVal(1);
	    Addr = Addr.getOperand(0);
	  } else {
	    return SDValue();
	  }

	  // FIXME: 256-bit vector instructions don't require a strict alignment,
	  // improve this code to support it better.
	  unsigned RequiredAlign = VT.getSizeInBits() / 8;
	  SDValue Chain = Load->getChain();

	  // Make sure the stack object alignment is at least 16 or 32.
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  if (DAG.InferPtrAlignment(Addr) < RequiredAlign) {
	    // Can't change the alignment of a fixed object. FIXME: It's possible to
	    // compute the exact stack offset and reference FI + adjust offset
	    // instead. If someone really cares about this. That's the way to
	    // implement it.
	    if (MFI.isFixedObjectIndex(FrameIdx))
	      return SDValue();
	    MFI.setObjectAlignment(FrameIdx, RequiredAlign);
	  }

	  // (Offset % 16 or 32) must be multiple of 4. Then address is then
	  // Ptr + (Offset & ~15).
	  if (ByteOffset < 0)
	    return SDValue();
	  if ((ByteOffset % RequiredAlign) & 3)
	    return SDValue();

	  int64_t AlignedOffset = ByteOffset & ~int64_t(RequiredAlign - 1);
	  if (AlignedOffset) {
	    SDLoc AddrDL(Addr);
	    EVT AddrVT = Addr.getValueType();
	    Addr = DAG.getNode(ISD::ADD, AddrDL, AddrVT, Addr,
	                       DAG.getConstant(AlignedOffset, AddrDL, AddrVT));
	  }

	  // Index of the wanted 4-byte element inside the widened load.
	  int SplatLane = (ByteOffset - AlignedOffset) >> 2;
	  unsigned LaneCount = VT.getVectorNumElements();

	  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, LaneCount);
	  SDValue WideLoad =
	      DAG.getLoad(WideVT, dl, Chain, Addr,
	                  Load->getPointerInfo().getWithOffset(AlignedOffset));

	  SmallVector<int, 8> SplatMask(LaneCount, SplatLane);
	  return DAG.getVectorShuffle(WideVT, dl, WideLoad, DAG.getUNDEF(WideVT),
	                              SplatMask);
	}

	/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
	/// elements can be replaced by a single large load which has the same value as
	/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
	///
	/// Example: <load i32 a, load i32 a+4, zero, undef> -> zextload a
	///
	/// Each element (looked at through bitcasts) must be undef, zero, or a
	/// non-extending load; anything else makes the function return an empty
	/// SDValue. Possible results are: UNDEF (all elements undef), a zero constant
	/// (only zero/undef elements), one full-width load, a full-width load shuffled
	/// with a zero vector (only when \p isAfterLegalize is false), or an
	/// X86ISD::VZEXT_LOAD of the leading 32/64 bits. An empty SDValue is returned
	/// when none of these applies.
	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
	const SDLoc &DL, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	bool isAfterLegalize) {
	unsigned NumElems = Elts.size();

	// Per-element classification; exactly one of the three masks is set for
	// each element once the loop below completes (checked by the assert).
	int LastLoadedElt = -1;
	SmallBitVector LoadMask(NumElems, false);
	SmallBitVector ZeroMask(NumElems, false);
	SmallBitVector UndefMask(NumElems, false);

	// For each element in the initializer, see if we've found a load, zero or an
	// undef.
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	if (!Elt.getNode())
	return SDValue();

	if (Elt.isUndef())
	UndefMask[i] = true;
	else if (X86::isZeroNode(Elt) \|\| ISD::isBuildVectorAllZeros(Elt.getNode()))
	ZeroMask[i] = true;
	else if (ISD::isNON_EXTLoad(Elt.getNode())) {
	LoadMask[i] = true;
	LastLoadedElt = i;
	// Each loaded element must be the correct fractional portion of the
	// requested vector load.
	if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
	return SDValue();
	} else
	return SDValue();
	}
	assert((ZeroMask \| UndefMask \| LoadMask).count() == NumElems &&
	"Incomplete element masks");

	// Handle Special Cases - all undef or undef/zero.
	if (UndefMask.count() == NumElems)
	return DAG.getUNDEF(VT);

	// FIXME: Should we return this as a BUILD_VECTOR instead?
	if ((ZeroMask \| UndefMask).count() == NumElems)
	return VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);

	// At least one element is a load from here on; the first one found is used
	// as the base that all other loads are compared against.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	int FirstLoadedElt = LoadMask.find_first();
	SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
	LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
	EVT LDBaseVT = EltBase.getValueType();

	// Consecutive loads can contain UNDEFS but not ZERO elements.
	// Consecutive loads with UNDEFs and ZEROs elements require a
	// an additional shuffle stage to clear the ZERO elements.
	bool IsConsecutiveLoad = true;
	bool IsConsecutiveLoadWithZeros = true;
	for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
	if (LoadMask[i]) {
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	LoadSDNode *LD = cast<LoadSDNode>(Elt);
	// The load at position i must sit (i - FirstLoadedElt) element-sized
	// steps after the base load.
	if (!DAG.areNonVolatileConsecutiveLoads(
	LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
	i - FirstLoadedElt)) {
	IsConsecutiveLoad = false;
	IsConsecutiveLoadWithZeros = false;
	break;
	}
	} else if (ZeroMask[i]) {
	IsConsecutiveLoad = false;
	}
	}

	// Collect every original load in the covered range so the replacement node
	// can be ordered equivalently with respect to each of them.
	SmallVector<LoadSDNode *, 8> Loads;
	for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
	if (LoadMask[i])
	Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));

	// Helper: emit one load of type VT reusing LDBase's chain, base pointer,
	// pointer info, alignment and memory-operand flags, then call
	// makeEquivalentMemoryOrdering for each collected load against the new one.
	auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
	auto MMOFlags = LDBase->getMemOperand()->getFlags();
	assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
	"Cannot merge volatile loads.");
	SDValue NewLd =
	DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
	LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
	for (auto *LD : Loads)
	DAG.makeEquivalentMemoryOrdering(LD, NewLd);
	return NewLd;
	};

	// LOAD - all consecutive load/undefs (must start/end with a load).
	// If we have found an entire vector of loads and undefs, then return a large
	// load of the entire vector width starting at the base pointer.
	// If the vector contains zeros, then attempt to shuffle those elements.
	if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
	(IsConsecutiveLoad \|\| IsConsecutiveLoadWithZeros)) {
	assert(LDBase && "Did not find base load for merging consecutive loads");
	EVT EltVT = LDBase->getValueType(0);
	// Ensure that the input vector size for the merged loads matches the
	// cumulative size of the input elements.
	if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
	return SDValue();

	if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
	return SDValue();

	// Don't create 256-bit non-temporal aligned loads without AVX2 as these
	// will lower to regular temporal loads and use the cache.
	if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
	VT.is256BitVector() && !Subtarget.hasInt256())
	return SDValue();

	if (IsConsecutiveLoad)
	return CreateLoad(VT, LDBase);

	// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
	// vector and a zero vector to clear out the zero elements.
	if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
	// Mask entries: i selects lane i of the loaded vector, i + NumElems
	// selects lane i of the zero vector, -1 leaves the lane undefined.
	SmallVector<int, 4> ClearMask(NumElems, -1);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (ZeroMask[i])
	ClearMask[i] = i + NumElems;
	else if (LoadMask[i])
	ClearMask[i] = i;
	}
	SDValue V = CreateLoad(VT, LDBase);
	SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);
	return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
	}
	}

	// Size in bits of the span running from the first to the last loaded
	// element (inclusive).
	int LoadSize =
	(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

	// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
	if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
	(LoadSize == 32 \|\| LoadSize == 64) &&
	((VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()))) {
	// View the result as a vector of LoadSize-bit elements so the loaded
	// span occupies a single element of VecVT.
	MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
	: MVT::getIntegerVT(LoadSize);
	MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
	if (TLI.isTypeLegal(VecVT)) {
	SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
	SDValue ResNode =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
	LDBase->getPointerInfo(),
	LDBase->getAlignment(),
	MachineMemOperand::MOLoad);
	for (auto *LD : Loads)
	DAG.makeEquivalentMemoryOrdering(LD, ResNode);
	return DAG.getBitcast(VT, ResNode);
	}
	}

	return SDValue();
	}

	/// Slice \p SplatValue into scalars of \p VT's element width and return them
	/// as an IR ConstantVector.
	///
	/// The vector holds SplatBitSize / element-width entries, taken from the
	/// low bits of \p SplatValue upwards. Floating-point element types are
	/// materialized as IEEE single (32-bit) or double (64-bit) constants; every
	/// other element type becomes an integer constant of the element width.
	static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
	                                   unsigned SplatBitSize, LLVMContext &C) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned EltCount = SplatBitSize / EltBits;

	  SmallVector<Constant *, 32> Elements;
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

	    if (!VT.isFloatingPoint()) {
	      Elements.push_back(
	          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
	      continue;
	    }

	    if (EltBits == 32) {
	      Elements.push_back(
	          ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
	      continue;
	    }

	    assert(EltBits == 64 && "Unsupported floating point scalar size");
	    Elements.push_back(
	        ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
	  }
	  return ConstantVector::get(ArrayRef<Constant *>(Elements));
	}

	/// Return true if \p N is used by a target shuffle, looking through a
	/// bitcast user.
	///
	/// NOTE(review): the first bitcast user encountered decides the answer for
	/// the whole query; users that come after it in the use list are not
	/// examined. Confirm this is intended before relying on it as an exhaustive
	/// search.
	static bool isUseOfShuffle(SDNode *N) {
	  for (SDNode *User : N->uses()) {
	    const unsigned UserOpc = User->getOpcode();
	    if (isTargetShuffle(UserOpc))
	      return true;
	    // Bitcasts are transparent: answer with whatever lies behind this one.
	    if (UserOpc == ISD::BITCAST)
	      return isUseOfShuffle(User);
	  }
	  return false;
	}

	/// Check whether the build_vector \p Op is a splat of a zero-extended value,
	/// i.e. the same value repeated every Delta operands with only zero or undef
	/// operands in between, where Delta is a power of two greater than one.
	///
	/// For example: (a,0,0,0,a,0,0,0,a,0,0,0,a,0,0,0) returns a.
	///
	/// On success returns the repeated value and sets:
	///   \p NumElt  - the number of times the value is repeated;
	///   \p EltType - the integer type that covers the value plus its padding
	///                operands.
	/// On failure returns an empty SDValue and leaves both outputs untouched.
	static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
	                                   unsigned &NumElt, MVT &EltType) {
	  auto IsZeroOrUndef = [](SDValue V) {
	    return V.isUndef() || isNullConstant(V);
	  };

	  SDValue Repeated = Op->getOperand(0);
	  const unsigned OperandCount = Op->getNumOperands();

	  // Find the distance to the next occurrence of the first operand; every
	  // operand before it has to be padding.
	  unsigned Stride = OperandCount;
	  for (unsigned Pos = 1; Pos < OperandCount; ++Pos) {
	    SDValue Cur = Op->getOperand(Pos);
	    if (Cur == Repeated) {
	      Stride = Pos;
	      break;
	    }
	    if (!IsZeroOrUndef(Cur))
	      return SDValue();
	  }
	  if (Stride == 1 || !isPowerOf2_32(Stride))
	    return SDValue();

	  // The rest of the operands must follow the same value/padding rhythm.
	  for (unsigned Pos = Stride; Pos < OperandCount; ++Pos) {
	    SDValue Cur = Op->getOperand(Pos);
	    const bool ExpectValue = (Pos % Stride) == 0;
	    if (ExpectValue) {
	      if (Cur != Repeated)
	        return SDValue();
	      continue;
	    }
	    if (!IsZeroOrUndef(Cur))
	      return SDValue();
	  }

	  const unsigned NarrowBits =
	      Op->getSimpleValueType(0).getScalarSizeInBits();
	  EltType = MVT::getIntegerVT(NarrowBits * Stride);
	  NumElt = OperandCount / Stride;
	  return Repeated;
	}

	/// Attempt to use the vbroadcast instruction to generate a splat value
	/// from a splat BUILD_VECTOR which uses:
	/// a. A single scalar load, or a constant.
	/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
	///
	/// \param BVOp      the BUILD_VECTOR node to lower; its type must be a 128-,
	///                  256- or 512-bit vector.
	/// \param Subtarget used to query AVX/AVX2/AVX-512 (CDI, VLX) availability.
	/// \param DAG       the DAG in which new nodes are created.
	///
	/// The VBROADCAST node is returned when a pattern is found,
	/// or SDValue() otherwise.
	static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  // VBROADCAST requires AVX.
	  // TODO: Splats could be generated for non-AVX CPUs using SSE
	  // instructions, but there's less potential gain for only 128-bit vectors.
	  if (!Subtarget.hasAVX())
	    return SDValue();

	  MVT VT = BVOp->getSimpleValueType(0);
	  SDLoc dl(BVOp);

	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Unsupported vector type for broadcast.");

	  // Ld is the splatted scalar, or an empty SDValue if BVOp is not a splat.
	  BitVector UndefElements;
	  SDValue Ld = BVOp->getSplatValue(&UndefElements);

	  // Attempt to use VBROADCASTM
	  // From this pattern:
	  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
	  // b. t1 = (build_vector t0 t0)
	  //
	  // Create (VBROADCASTM v2i1 X)
	  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
	    // EltType/NumElts start out describing VT and are overwritten by
	    // isSplatZeroExtended only when it recognizes a zero-extended splat.
	    MVT EltType = VT.getScalarType();
	    unsigned NumElts = VT.getVectorNumElements();
	    SDValue BOperand;
	    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
	    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
	        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
	         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
	      if (ZeroExtended)
	        BOperand = ZeroExtended.getOperand(0);
	      else
	        BOperand = Ld.getOperand(0).getOperand(0);
	      if (BOperand.getValueType().isVector() &&
	          BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
	        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
	                                     NumElts == 8)) || // for broadcastmb2q
	            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
	                                     NumElts == 16))) { // for broadcastmw2d
	          SDValue Brdcst =
	              DAG.getNode(X86ISD::VBROADCASTM, dl,
	                          MVT::getVectorVT(EltType, NumElts), BOperand);
	          return DAG.getBitcast(VT, Brdcst);
	        }
	      }
	    }
	  }

	  // We need a splat of a single value to use broadcast, and it doesn't
	  // make any sense if the value is only in one element of the vector.
	  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
	    APInt SplatValue, Undef;
	    unsigned SplatBitSize;
	    bool HasUndef;
	    // Check if this is a repeated constant pattern suitable for broadcasting.
	    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
	        SplatBitSize > VT.getScalarSizeInBits() &&
	        SplatBitSize < VT.getSizeInBits()) {
	      // Avoid replacing with broadcast when it's a use of a shuffle
	      // instruction to preserve the present custom lowering of shuffles.
	      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
	        return SDValue();
	      // replace BUILD_VECTOR with broadcast of the repeated constants.
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      LLVMContext *Ctx = DAG.getContext();
	      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
	      if (Subtarget.hasAVX()) {
	        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
	            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
	          // Splatted value can fit in one INTEGER constant in constant pool.
	          // Load the constant and broadcast it.
	          MVT CVT = MVT::getIntegerVT(SplatBitSize);
	          // getIntNTy takes the context by reference and yields a pointer to
	          // the uniqued integer type.
	          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
	          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
	          SDValue CP = DAG.getConstantPool(C, PVT);
	          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	          Ld = DAG.getLoad(
	              CVT, dl, DAG.getEntryNode(), CP,
	              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	              Alignment);
	          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                       MVT::getVectorVT(CVT, Repeat), Ld);
	          return DAG.getBitcast(VT, Brdcst);
	        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
	          // Splatted value can fit in one FLOAT constant in constant pool.
	          // Load the constant and broadcast it.
	          // AVX have support for 32 and 64 bit broadcast for floats only.
	          // No 64bit integer in 32bit subtarget.
	          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
	          // Lower the splat via APFloat directly, to avoid any conversion.
	          Constant *C =
	              SplatBitSize == 32
	                  ? ConstantFP::get(*Ctx,
	                                    APFloat(APFloat::IEEEsingle(), SplatValue))
	                  : ConstantFP::get(*Ctx,
	                                    APFloat(APFloat::IEEEdouble(), SplatValue));
	          SDValue CP = DAG.getConstantPool(C, PVT);
	          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	          Ld = DAG.getLoad(
	              CVT, dl, DAG.getEntryNode(), CP,
	              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	              Alignment);
	          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                       MVT::getVectorVT(CVT, Repeat), Ld);
	          return DAG.getBitcast(VT, Brdcst);
	        } else if (SplatBitSize > 64) {
	          // Load the vector of constants and broadcast it.
	          MVT CVT = VT.getScalarType();
	          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
	                                             *Ctx);
	          SDValue VCP = DAG.getConstantPool(VecC, PVT);
	          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
	          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
	          Ld = DAG.getLoad(
	              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
	              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	              Alignment);
	          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
	          return DAG.getBitcast(VT, Brdcst);
	        }
	      }
	    }
	    return SDValue();
	  }

	  bool ConstSplatVal =
	      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

	  // Make sure that all of the users of a non-constant load are from the
	  // BUILD_VECTOR node.
	  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
	    return SDValue();

	  unsigned ScalarSize = Ld.getValueSizeInBits();
	  bool IsGE256 = (VT.getSizeInBits() >= 256);

	  // When optimizing for size, generate up to 5 extra bytes for a broadcast
	  // instruction to save 8 or more bytes of constant pool data.
	  // TODO: If multiple splats are generated to load the same constant,
	  // it may be detrimental to overall size. There needs to be a way to detect
	  // that condition to know if this is truly a size win.
	  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

	  // Handle broadcasting a single constant scalar from the constant pool
	  // into a vector.
	  // On Sandybridge (no AVX2), it is still better to load a constant vector
	  // from the constant pool and not to broadcast it from a scalar.
	  // But override that restriction when optimizing for size.
	  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
	  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
	    EVT CVT = Ld.getValueType();
	    assert(!CVT.isVector() && "Must not broadcast a vector type");

	    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
	    // For size optimization, also splat v2f64 and v2i64, and for size opt
	    // with AVX2, also splat i8 and i16.
	    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
	    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
	      const Constant *C = nullptr;
	      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
	        C = CI->getConstantIntValue();
	      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
	        C = CF->getConstantFPValue();

	      assert(C && "Invalid constant type");

	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      SDValue CP =
	          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
	      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	      Ld = DAG.getLoad(
	          CVT, dl, DAG.getEntryNode(), CP,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	          Alignment);

	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	    }
	  }

	  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

	  // Handle AVX2 in-register broadcasts.
	  if (!IsLoad && Subtarget.hasInt256() &&
	      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The scalar source must be a normal load.
	  if (!IsLoad)
	    return SDValue();

	  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	      (Subtarget.hasVLX() && ScalarSize == 64))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
	  // double since there is no vbroadcastsd xmm
	  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
	    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	  }

	  // Unsupported broadcast.
	  return SDValue();
	}

	/// \brief Resolve the real source vector and lane of an EXTRACT_VECTOR_ELT
	/// that has a constant index.
	///
	/// When \p ExtractedFromVec is not a vector shuffle, the constant index is
	/// returned unchanged. When it is a shuffle and the mask entry for the
	/// extracted lane is undef or selects from the shuffle's first input,
	/// \p ExtractedFromVec is rewritten to that first input and the mask entry is
	/// returned. Otherwise \p ExtractedFromVec is left alone and the original
	/// index is returned.
	static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
	                                         SDValue ExtIdx) {
	  const int ExtractIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
	  if (!Shuffle)
	    return ExtractIdx;

	  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
	  // lowered this:
	  //   (extract_vector_elt (v8f32 %1), Constant<6>)
	  // to:
	  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
	  //                           (extract_subvector (v8f32 %0), Constant<4>),
	  //                           undef)
	  //                       Constant<0>)
	  // In this case the vector is the extract_subvector expression and the index
	  // is 2, as specified by the shuffle.
	  SDValue FirstInput = Shuffle->getOperand(0);
	  MVT FirstInputVT = FirstInput.getSimpleValueType();
	  assert(FirstInputVT.getVectorElementType() ==
	         ExtractedFromVec.getSimpleValueType().getVectorElementType());

	  const int MaskEntry = Shuffle->getMaskElt(ExtractIdx);
	  if (!isUndefOrInRange(MaskEntry, 0, FirstInputVT.getVectorNumElements()))
	    return ExtractIdx;

	  ExtractedFromVec = FirstInput;
	  return MaskEntry;
	}

	/// Lower a build_vector whose operands are mostly constant-index extracts
	/// from at most two vectors of the same type as a vector shuffle, followed by
	/// INSERT_VECTOR_ELT nodes for the few operands that are something else.
	///
	/// Returns an empty SDValue when INSERT_VECTOR_ELT is neither legal nor
	/// custom for the type, when too many operands would need inserting, when an
	/// extract index is not constant, when an extract source has a different
	/// type, when more than two source vectors are involved, or when no operand
	/// is an extract at all.
	static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // Skip if insert_vec_elt is not supported.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
	    return SDValue();

	  SDLoc DL(Op);
	  const unsigned NumElems = Op.getNumOperands();

	  SDValue VecIn1, VecIn2;
	  SmallVector<unsigned, 4> InsertIndices;
	  SmallVector<int, 8> Mask(NumElems, -1);

	  for (unsigned Lane = 0; Lane != NumElems; ++Lane) {
	    SDValue Operand = Op.getOperand(Lane);
	    const unsigned Opc = Operand.getOpcode();

	    // Undef lanes keep their -1 mask entry.
	    if (Opc == ISD::UNDEF)
	      continue;

	    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
	      // Quit if more than 1 elements need inserting.
	      if (InsertIndices.size() > 1)
	        return SDValue();

	      InsertIndices.push_back(Lane);
	      continue;
	    }

	    SDValue Source = Operand.getOperand(0);
	    SDValue ExtIdx = Operand.getOperand(1);

	    // Quit if non-constant index.
	    if (!isa<ConstantSDNode>(ExtIdx))
	      return SDValue();
	    const int Idx = getUnderlyingExtractedFromVec(Source, ExtIdx);

	    // Quit if extracted from vector of different type.
	    if (Source.getValueType() != VT)
	      return SDValue();

	    // First shuffle input: claim it if still free, then use it on a match.
	    if (!VecIn1.getNode())
	      VecIn1 = Source;
	    if (Source == VecIn1) {
	      Mask[Lane] = Idx;
	      continue;
	    }

	    // Second shuffle input: same scheme.
	    if (!VecIn2.getNode())
	      VecIn2 = Source;
	    // Quit if more than 2 vectors to shuffle
	    if (Source != VecIn2)
	      return SDValue();
	    Mask[Lane] = Idx + NumElems;
	  }

	  if (!VecIn1.getNode())
	    return SDValue();

	  if (!VecIn2.getNode())
	    VecIn2 = DAG.getUNDEF(VT);
	  SDValue Result = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

	  // Patch in the operands that did not come from an extract.
	  for (unsigned InsertAt : InsertIndices) {
	    SDValue Position = DAG.getIntPtrConstant(InsertAt, DL);
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
	                         Op.getOperand(InsertAt), Position);
	  }

	  return Result;
	}

	/// Pack a constant build_vector of i1 elements into an integer constant.
	///
	/// Bit k of the result is the low bit of operand k; undef operands
	/// contribute a zero bit. The constant's type is an integer as wide as the
	/// vector, but never narrower than 8 bits.
	static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         Op.getScalarValueSizeInBits() == 1 &&
	         "Can not convert non-constant vector");

	  uint64_t PackedBits = 0;
	  const unsigned OperandCount = Op.getNumOperands();
	  for (unsigned Lane = 0; Lane < OperandCount; ++Lane) {
	    SDValue Elt = Op.getOperand(Lane);
	    if (Elt.isUndef())
	      continue;
	    const uint64_t LowBit = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
	    PackedBits |= LowBit << Lane;
	  }

	  SDLoc dl(Op);
	  const int WidthInBits = std::max((int)Op.getValueSizeInBits(), 8);
	  MVT IntVT = MVT::getIntegerVT(WidthInBits);
	  return DAG.getConstant(PackedBits, dl, IntVT);
	}
	// Lower BUILD_VECTOR operation for vectors of i1 (mask) elements.
	//
	// All-zeros and all-ones vectors are returned unchanged. A fully constant
	// vector is packed into an integer immediate and bitcast to the mask type
	// (v64i1 on non-64-bit subtargets is split into two v32i1 halves first).
	// A splat of one value becomes a select between all-ones and all-zeros.
	// Anything else starts from the constant part and inserts the non-constant
	// elements one at a time.
	static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {

	MVT VT = Op.getSimpleValueType();
	assert((VT.getVectorElementType() == MVT::i1) &&
	"Unexpected type in LowerBUILD_VECTORvXi1!");

	SDLoc dl(Op);
	if (ISD::isBuildVectorAllZeros(Op.getNode()))
	return Op;

	if (ISD::isBuildVectorAllOnes(Op.getNode()))
	return Op;

	if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	// Split the pieces.
	SDValue Lower =
	DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
	SDValue Upper =
	DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
	// We have to manually lower both halves so getNode doesn't try to
	// reassemble the build_vector.
	Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
	Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
	}
	SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
	if (Imm.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, Imm);
	// The immediate is wider than the mask (it is at least 8 bits): view it
	// as v8i1 and take the low subvector.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	// Vector has one or more non-const elements
	// Immediate collects the low bit of every constant element; NonConstIdx
	// records the positions that must be inserted afterwards. SplatIdx is the
	// first non-undef position, against which all later ones are compared.
	uint64_t Immediate = 0;
	SmallVector<unsigned, 16> NonConstIdx;
	bool IsSplat = true;
	bool HasConstElts = false;
	int SplatIdx = -1;
	for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
	SDValue In = Op.getOperand(idx);
	if (In.isUndef())
	continue;
	if (!isa<ConstantSDNode>(In))
	NonConstIdx.push_back(idx);
	else {
	Immediate \|= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
	HasConstElts = true;
	}
	if (SplatIdx < 0)
	SplatIdx = idx;
	else if (In != Op.getOperand(SplatIdx))
	IsSplat = false;
	}

	// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
	if (IsSplat)
	return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
	DAG.getConstant(1, dl, VT),
	DAG.getConstant(0, dl, VT));

	// insert elements one by one
	// The starting vector is: the packed immediate when any constant bit is
	// set, an all-zero vector when constants exist but are all zero, and undef
	// when there are no constant elements at all.
	SDValue DstVec;
	SDValue Imm;
	if (Immediate) {
	MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
	Imm = DAG.getConstant(Immediate, dl, ImmVT);
	}
	else if (HasConstElts)
	Imm = DAG.getConstant(0, dl, VT);
	else
	Imm = DAG.getUNDEF(VT);
	if (Imm.getValueSizeInBits() == VT.getSizeInBits())
	DstVec = DAG.getBitcast(VT, Imm);
	else {
	// Same widening trick as in the all-constant path above.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
	DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
	unsigned InsertIdx = NonConstIdx[i];
	DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	Op.getOperand(InsertIdx),
	DAG.getIntPtrConstant(InsertIdx, dl));
	}
	return DstVec;
	}

	/// \brief Return true if \p N implements a horizontal binop and return the
	/// operands for the horizontal binop into V0 and V1.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// This function checks that the build_vector \p N in input implements a
	/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
	/// operation to match.
	/// For example, if \p Opcode is equal to ISD::ADD, then this function
	/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
	/// is equal to ISD::SUB, then this function checks if this is a horizontal
	/// arithmetic sub.
	///
	/// This function only analyzes elements of \p N whose indices are
	/// in range [BaseIdx, LastIdx).
	///
	/// \p V0 and \p V1 are overwritten unconditionally: both start as UNDEF of
	/// the build_vector's type, and each is replaced by the source vector of the
	/// first non-undef element found in the lower / upper half of the range
	/// respectively. Their contents are only meaningful when true is returned.
	static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
	SelectionDAG &DAG,
	unsigned BaseIdx, unsigned LastIdx,
	SDValue &V0, SDValue &V1) {
	EVT VT = N->getValueType(0);

	assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
	assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
	"Invalid Vector in input!");

	// For ADD/FADD the two extracted lanes may appear in either operand order.
	bool IsCommutable = (Opcode == ISD::ADD \|\| Opcode == ISD::FADD);
	bool CanFold = true;
	// Lane index that the next element's first extract is expected to read.
	unsigned ExpectedVExtractIdx = BaseIdx;
	unsigned NumElts = LastIdx - BaseIdx;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// Check if N implements a horizontal binop.
	for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
	SDValue Op = N->getOperand(i + BaseIdx);

	// Skip UNDEFs.
	if (Op->isUndef()) {
	// Update the expected vector extract index.
	// At the midpoint the expected index restarts from BaseIdx because the
	// upper half reads from V1.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	ExpectedVExtractIdx += 2;
	continue;
	}

	CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

	if (!CanFold)
	break;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
	CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op0.getOperand(0) == Op1.getOperand(0) &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	isa<ConstantSDNode>(Op1.getOperand(1)));
	if (!CanFold)
	break;

	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

	// Elements in the lower half of the range must read from V0, those in the
	// upper half from V1. The first non-undef element of each half fixes that
	// half's source vector, which must have the build_vector's type.
	if (i * 2 < NumElts) {
	if (V0.isUndef()) {
	V0 = Op0.getOperand(0);
	if (V0.getValueType() != VT)
	return false;
	}
	} else {
	if (V1.isUndef()) {
	V1 = Op0.getOperand(0);
	if (V1.getValueType() != VT)
	return false;
	}
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	}

	SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
	if (I0 == ExpectedVExtractIdx)
	CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
	else if (IsCommutable && I1 == ExpectedVExtractIdx) {
	// Try to match the following dag sequence:
	// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
	CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
	} else
	CanFold = false;

	// Each result element consumes two adjacent source lanes.
	ExpectedVExtractIdx += 2;
	}

	return CanFold;
	}

	/// \brief Build a 256-bit horizontal add/sub out of two 128-bit horizontal
	/// operations joined by a concat_vectors.
	///
	/// Helper of LowerToHorizontalOp(). \p V0 and \p V1 must be 256-bit vectors
	/// of the same type. Each one is split into its lower and upper 128-bit
	/// part, and two \p X86Opcode nodes are created on those parts.
	///
	/// \p Mode selects how the parts are paired:
	///   - Mode set:
	///       LO = X86Opcode V0_LO, V0_HI
	///       HI = X86Opcode V1_LO, V1_HI
	///   - Mode clear:
	///       LO = X86Opcode V0_LO, V1_LO
	///       HI = X86Opcode V0_HI, V1_HI
	///
	/// When \p isUndefLO (respectively \p isUndefHI) is set, the lower
	/// (respectively upper) 128 bits of the result are left UNDEF. A half is
	/// also left UNDEF when the inputs feeding it are undef: the whole source
	/// vector when Mode is set, or both 128-bit parts when Mode is clear.
	static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
	                                     const SDLoc &DL, SelectionDAG &DAG,
	                                     unsigned X86Opcode, bool Mode,
	                                     bool isUndefLO, bool isUndefHI) {
	  MVT WideVT = V0.getSimpleValueType();
	  assert(WideVT.is256BitVector() && WideVT == V1.getSimpleValueType() &&
	         "Invalid nodes in input!");

	  // Split both inputs into their 128-bit halves.
	  const unsigned HalfElts = WideVT.getVectorNumElements() / 2;
	  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
	  SDValue V0_HI = extract128BitVector(V0, HalfElts, DAG, DL);
	  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
	  SDValue V1_HI = extract128BitVector(V1, HalfElts, DAG, DL);
	  MVT HalfVT = V0_LO.getSimpleValueType();

	  // Both result halves default to UNDEF.
	  SDValue LO = DAG.getUNDEF(HalfVT);
	  SDValue HI = DAG.getUNDEF(HalfVT);

	  // Work out, per half, whether the result is expected to be UNDEF; no
	  // horizontal binop is emitted in that case.
	  bool SkipLO = isUndefLO;
	  bool SkipHI = isUndefHI;
	  if (Mode) {
	    SkipLO = SkipLO || V0->isUndef();
	    SkipHI = SkipHI || V1->isUndef();
	  } else {
	    SkipLO = SkipLO || (V0_LO->isUndef() && V1_LO->isUndef());
	    SkipHI = SkipHI || (V0_HI->isUndef() && V1_HI->isUndef());
	  }

	  if (!SkipLO)
	    LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, Mode ? V0_HI : V1_LO);
	  if (!SkipHI)
	    HI = DAG.getNode(X86Opcode, DL, HalfVT, Mode ? V1_LO : V0_HI, V1_HI);

	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, LO, HI);
	}

	/// Returns true iff \p BV builds a vector with the result equivalent to
	/// the result of ADDSUB operation.
	/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
	/// are written to the parameters \p Opnd0 and \p Opnd1.
	/// \p NumExtracts is reset to zero on entry (after the type/subtarget check)
	/// and counts the non-undef elements of \p BV that were matched.
	static bool isAddSub(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1,
	unsigned &NumExtracts) {

	MVT VT = BV->getSimpleValueType(0);
	// Accept only: v4f32/v2f64 with SSE3, v8f32/v4f64 with AVX, or
	// v16f32/v8f64 with AVX512.
	if ((!Subtarget.hasSSE3() \|\| (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
	(!Subtarget.hasAVX() \|\| (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
	(!Subtarget.hasAVX512() \|\| (VT != MVT::v16f32 && VT != MVT::v8f64)))
	return false;

	unsigned NumElts = VT.getVectorNumElements();
	SDValue InVec0 = DAG.getUNDEF(VT);
	SDValue InVec1 = DAG.getUNDEF(VT);

	NumExtracts = 0;

	// Odd-numbered elements in the input build vector are obtained from
	// adding two floating-point elements (ISD::FADD).
	// Even-numbered elements in the input build vector are obtained from
	// subtracting two floating-point elements (ISD::FSUB).
	unsigned ExpectedOpcode = ISD::FSUB;
	unsigned NextExpectedOpcode = ISD::FADD;
	bool AddFound = false;
	bool SubFound = false;

	for (unsigned i = 0, e = NumElts; i != e; ++i) {
	SDValue Op = BV->getOperand(i);

	// Skip 'undef' values.
	// The expected opcode still alternates so that it stays tied to the
	// parity of the element index.
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::UNDEF) {
	std::swap(ExpectedOpcode, NextExpectedOpcode);
	continue;
	}

	// Early exit if we found an unexpected opcode.
	if (Opcode != ExpectedOpcode)
	return false;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
	// Early exit if we cannot match that sequence.
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\|
	Op0.getOperand(1) != Op1.getOperand(1))
	return false;

	// Both extracts use the same constant index; it must equal the position
	// of this element in the build_vector.
	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	if (I0 != i)
	return false;

	// We found a valid add/sub node. Update the information accordingly.
	if (i & 1)
	AddFound = true;
	else
	SubFound = true;

	// Update InVec0 and InVec1.
	// Each is latched from the first matched element and must have type VT.
	if (InVec0.isUndef()) {
	InVec0 = Op0.getOperand(0);
	if (InVec0.getSimpleValueType() != VT)
	return false;
	}
	if (InVec1.isUndef()) {
	InVec1 = Op1.getOperand(0);
	if (InVec1.getSimpleValueType() != VT)
	return false;
	}

	// Make sure that operands in input to each add/sub node always
	// come from a same pair of vectors.
	if (InVec0 != Op0.getOperand(0)) {
	// FSUB is not commutable, so a mismatch on the first operand is fatal.
	if (ExpectedOpcode == ISD::FSUB)
	return false;

	// FADD is commutable. Try to commute the operands
	// and then test again.
	std::swap(Op0, Op1);
	if (InVec0 != Op0.getOperand(0))
	return false;
	}

	if (InVec1 != Op1.getOperand(0))
	return false;

	// Update the pair of expected opcodes.
	std::swap(ExpectedOpcode, NextExpectedOpcode);

	// Increment the number of extractions done.
	++NumExtracts;
	}

	// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
	// At least one FADD element and one FSUB element must also have been seen.
	if (!AddFound \|\| !SubFound \|\| InVec0.isUndef() \|\| InVec1.isUndef())
	return false;

	Opnd0 = InVec0;
	Opnd1 = InVec1;
	return true;
	}

	/// Returns true if is possible to fold MUL and an idiom that has already been
	/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
	/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
	/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
	///
	/// Prior to calling this function it should be known that there is some
	/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
	/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
	/// before replacement of such SDNode with ADDSUB operation. Thus the number
	/// of \p Opnd0 uses is expected to be equal to 2.
	/// For example, this function may be called for the following IR:
	/// %AB = fmul fast <2 x double> %A, %B
	/// %Sub = fsub fast <2 x double> %AB, %C
	/// %Add = fadd fast <2 x double> %AB, %C
	/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
	/// <2 x i32> <i32 0, i32 3>
	/// There is a def for %Addsub here, which potentially can be replaced by
	/// X86ISD::ADDSUB operation:
	/// %Addsub = X86ISD::ADDSUB %AB, %C
	/// and such ADDSUB can further be replaced with FMADDSUB:
	/// %Addsub = FMADDSUB %A, %B, %C.
	///
	/// The main reason why this method is called before the replacement of the
	/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
	/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
	/// FMADDSUB is.
	static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
	SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
	unsigned ExpectedUses) {
	if (Opnd0.getOpcode() != ISD::FMUL \|\|
	!Opnd0->hasNUsesOfValue(ExpectedUses, 0) \|\| !Subtarget.hasAnyFMA())
	return false;

	// FIXME: These checks must match the similar ones in
	// DAGCombiner::visitFADDForFMACombine. It would be good to have one
	// function that would answer if it is Ok to fuse MUL + ADD to FMADD
	// or MUL + ADDSUB to FMADDSUB.
	const TargetOptions &Options = DAG.getTarget().Options;
	bool AllowFusion =
	(Options.AllowFPOpFusion == FPOpFusion::Fast \|\| Options.UnsafeFPMath);
	if (!AllowFusion)
	return false;

	Opnd2 = Opnd1;
	Opnd1 = Opnd0.getOperand(1);
	Opnd0 = Opnd0.getOperand(0);

	return true;
	}

	/// Attempt to turn a build_vector that implements an 'addsub' (or, when the
	/// first operand is a fusable multiply, an 'fmaddsub') into the matching
	/// X86ISD::ADDSUB / X86ISD::FMADDSUB node. Returns an empty SDValue when the
	/// idiom is not recognized or cannot be emitted.
	static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDValue LHS, RHS;
	  unsigned MatchedElts;
	  if (!isAddSub(BV, Subtarget, DAG, LHS, RHS, MatchedElts))
	    return SDValue();

	  const MVT ResVT = BV->getSimpleValueType(0);
	  SDLoc Loc(BV);

	  // First preference: fold the multiply into an X86ISD::FMADDSUB.
	  // TODO: According to coverage reports, the FMADDSUB transform is not
	  // triggered by any tests.
	  SDValue Addend;
	  const bool FuseMul =
	      isFMAddSubOrFMSubAdd(Subtarget, DAG, LHS, RHS, Addend, MatchedElts);
	  if (FuseMul)
	    return DAG.getNode(X86ISD::FMADDSUB, Loc, ResVT, LHS, RHS, Addend);

	  // A plain X86ISD::ADDSUB is never produced for 512-bit types, even though
	  // the idiom was recognized: no known X86 target has a 512-bit ADDSUB
	  // instruction. Recognizing the 512-bit idiom only serves the FMADDSUB
	  // path above.
	  return ResVT.is512BitVector()
	             ? SDValue()
	             : DAG.getNode(X86ISD::ADDSUB, Loc, ResVT, LHS, RHS);
	}

	/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
	static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = BV->getSimpleValueType(0);
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumUndefsLO = 0;
	unsigned NumUndefsHI = 0;
	unsigned Half = NumElts/2;

	// Count the number of UNDEF operands in the build_vector in input.
	for (unsigned i = 0, e = Half; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsLO++;

	for (unsigned i = Half, e = NumElts; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsHI++;

	// Early exit if this is either a build_vector of all UNDEFs or all the
	// operands but one are UNDEF.
	if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
	return SDValue();

	SDLoc DL(BV);
	SDValue InVec0, InVec1;
	if ((VT == MVT::v4f32 \|\| VT == MVT::v2f64) && Subtarget.hasSSE3()) {
	// Try to match an SSE3 float HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if ((VT == MVT::v4i32 \|\| VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
	// Try to match an SSSE3 integer HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
	}

	if (!Subtarget.hasAVX())
	return SDValue();

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64)) {
	// Try to match an AVX horizontal add/sub of packed single/double
	// precision floating point values from 256-bit vectors.
	SDValue InVec2, InVec3;
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if (VT == MVT::v8i32 \|\| VT == MVT::v16i16) {
	// Try to match an AVX2 horizontal add/sub of signed integers.
	SDValue InVec2, InVec3;
	unsigned X86Opcode;
	bool CanFold = true;

	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HSUB;
	else
	CanFold = false;

	if (CanFold) {
	// Fold this build_vector into a single horizontal add/sub.
	// Do this only if the target has AVX2.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

	// Do not try to expand this build_vector into a pair of horizontal
	// add/sub if we can emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into a pair of horizontal binop followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
	isUndefLO, isUndefHI);
	}
	}

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64 \|\| VT == MVT::v8i32 \|\|
	VT == MVT::v16i16) && Subtarget.hasAVX()) {
	unsigned X86Opcode;
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HSUB;
	else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHADD;
	else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHSUB;
	else
	return SDValue();

	// Don't try to expand this build_vector into a pair of horizontal add/sub
	// if we can simply emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into two horizontal add/sub followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
	isUndefLO, isUndefHI);
	}

	return SDValue();
	}

	/// Vectorize a BUILD_VECTOR whose elements are all the same scalar bit
	/// operation (AND, OR or XOR) with a constant right-hand operand: build one
	/// vector from the left-hand operands, one from the constants, and apply the
	/// bit operation once on the two vectors.
	/// NOTE: It is not in our interest to turn this into a general purpose
	/// vectorizer, but the later legalization + scalarization stages create
	/// enough scalar bit operations that basic support is needed.
	static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
	                                       SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  const MVT VecVT = Op->getSimpleValueType(0);
	  const unsigned EltCount = VecVT.getVectorNumElements();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Only AND/XOR/OR are handled.
	  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
	  const unsigned BitOpc = Op->getOperand(0).getOpcode();
	  if (BitOpc != ISD::AND && BitOpc != ISD::XOR && BitOpc != ISD::OR)
	    return SDValue();

	  // Every element has to use that very same opcode.
	  // TODO: Should we allow UNDEFS and if so how many?
	  for (unsigned Idx = 1; Idx != EltCount; ++Idx) {
	    if (Op->getOperand(Idx).getOpcode() != BitOpc)
	      return SDValue();
	  }

	  // Leave splats alone - we'd replace one constant with an entire vector.
	  if (Op->getSplatValue())
	    return SDValue();

	  // The vector form of the operation must be legal or promotable.
	  if (!TLI.isOperationLegalOrPromote(BitOpc, VecVT))
	    return SDValue();

	  // Gather the per-element operands. The constant is expected on the
	  // right-hand side, where canonicalization places it.
	  SmallVector<SDValue, 4> Variables, Constants;
	  for (SDValue Scalar : Op->ops()) {
	    SDValue Cst = Scalar.getOperand(1);
	    if (!isa<ConstantSDNode>(Cst))
	      return SDValue();
	    Variables.push_back(Scalar.getOperand(0));
	    Constants.push_back(Cst);
	  }

	  SDValue VarVec = DAG.getBuildVector(VecVT, DL, Variables);
	  SDValue CstVec = DAG.getBuildVector(VecVT, DL, Constants);
	  return DAG.getNode(BitOpc, DL, VecVT, VarVec, CstVec);
	}

	/// Produce a vector constant that needs no load. SSE/AVX offer only the bare
	/// minimum for this, so only all-zeros and all-ones build_vectors are
	/// handled; anything else yields an empty SDValue.
	static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  const MVT VT = Op.getSimpleValueType();
	  SDNode *BVNode = Op.getNode();

	  // All-zeros vectors can be matched by pxor and xorps.
	  if (ISD::isBuildVectorAllZeros(BVNode)) {
	    // The i32-element forms are kept as they are; every other type goes
	    // through getZeroVector(). Canonicalizing on i32 elements 1) ensures the
	    // zero vectors are CSE'd and 2) ensures that i64 scalars are eliminated
	    // on x86-32 hosts.
	    const bool KeepAsIs =
	        VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32;
	    return KeepAsIs ? Op : getZeroVector(VT, Subtarget, DAG, DL);
	  }

	  // All-ones vectors need SSE2. They can be matched by pcmpeqd on 128-bit
	  // vectors or broken into v4i32 operations on 256-bit vectors; AVX2 can use
	  // vpcmpeqd on 256-bit vectors.
	  if (!Subtarget.hasSSE2() || !ISD::isBuildVectorAllOnes(BVNode))
	    return SDValue();

	  const bool KeepAsIs = VT == MVT::v4i32 || VT == MVT::v16i32 ||
	                        (VT == MVT::v8i32 && Subtarget.hasInt256());
	  return KeepAsIs ? Op : getOnesVector(VT, DAG, DL);
	}

	// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
	// reasoned to be a permutation of a vector by indices in a non-constant vector.
	// (build_vector (extract_elt V, (extract_elt I, 0)),
	// (extract_elt V, (extract_elt I, 1)),
	// ...
	// ->
	// (vpermv I, V)
	//
	// TODO: Handle undefs
	// TODO: Utilize pshufb and zero mask blending to support more efficient
	// construction of vectors with constant-0 elements.
	// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
	// when no native operation available.
	static SDValue
	LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Look for VPERMV and PSHUFB opportunities.
	// Each supported result type is gated on a subtarget feature; any other
	// type, or a missing feature, yields an empty SDValue.
	MVT VT = V.getSimpleValueType();
	switch (VT.SimpleTy) {
	default:
	return SDValue();
	case MVT::v16i8:
	// NOTE(review): v16i8 is lowered to X86ISD::PSHUFB below. PSHUFB is an
	// SSSE3 instruction, while this guard tests hasSSE3() - confirm whether
	// hasSSSE3() was intended.
	if (!Subtarget.hasSSE3())
	return SDValue();
	break;
	case MVT::v8f32:
	case MVT::v8i32:
	if (!Subtarget.hasAVX2())
	return SDValue();
	break;
	case MVT::v4i64:
	case MVT::v4f64:
	if (!Subtarget.hasVLX())
	return SDValue();
	break;
	case MVT::v16f32:
	case MVT::v8f64:
	case MVT::v16i32:
	case MVT::v8i64:
	if (!Subtarget.hasAVX512())
	return SDValue();
	break;
	case MVT::v32i16:
	if (!Subtarget.hasBWI())
	return SDValue();
	break;
	case MVT::v8i16:
	case MVT::v16i16:
	if (!Subtarget.hasVLX() \|\| !Subtarget.hasBWI())
	return SDValue();
	break;
	case MVT::v64i8:
	if (!Subtarget.hasVBMI())
	return SDValue();
	break;
	case MVT::v32i8:
	if (!Subtarget.hasVLX() \|\| !Subtarget.hasVBMI())
	return SDValue();
	break;
	}
	SDValue SrcVec, IndicesVec;
	// Check for a match of the permute source vector and permute index elements.
	// This is done by checking that the i-th build_vector operand is of the form:
	// (extract_elt SrcVec, (extract_elt IndicesVec, i)).
	for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
	SDValue Op = V.getOperand(Idx);
	if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();

	// If this is the first extract encountered in V, set the source vector,
	// otherwise verify the extract is from the previously defined source
	// vector.
	if (!SrcVec)
	SrcVec = Op.getOperand(0);
	else if (SrcVec != Op.getOperand(0))
	return SDValue();
	SDValue ExtractedIndex = Op->getOperand(1);
	// Peek through extends.
	// Only a single ZERO_EXTEND or SIGN_EXTEND is looked through.
	if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND \|\|
	ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
	ExtractedIndex = ExtractedIndex.getOperand(0);
	if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();

	// If this is the first extract from the index vector candidate, set the
	// indices vector, otherwise verify the extract is from the previously
	// defined indices vector.
	if (!IndicesVec)
	IndicesVec = ExtractedIndex.getOperand(0);
	else if (IndicesVec != ExtractedIndex.getOperand(0))
	return SDValue();

	// The index element must be taken from constant position Idx of the
	// indices vector, i.e. the same position as the build_vector operand.
	auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
	if (!PermIdx \|\| PermIdx->getZExtValue() != Idx)
	return SDValue();
	}
	// The indices are used as an integer vector with the same element count
	// and element width as VT; for floating-point VT the matching integer
	// vector type is built explicitly.
	MVT IndicesVT = VT;
	if (VT.isFloatingPoint())
	IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
	VT.getVectorNumElements());
	IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
	// The lines marked '-' / '+' below are the change carried by this diff hunk.
	// The removed form passed (IndicesVec, SrcVec) to both opcodes. The added
	// form first widens a SrcVec narrower than IndicesVT by inserting it at
	// index 0 of an UNDEF vector of type VT, then passes (SrcVec, IndicesVec)
	// to PSHUFB and keeps (IndicesVec, SrcVec) for VPERMV.
	- return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
	- SDLoc(V), VT, IndicesVec, SrcVec);
	+ if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
	+ SrcVec =
	+ DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
	+ SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
	+ }
	+ if (VT == MVT::v16i8)
	+ return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
	+ return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
	}

	SDValue
	X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
	SDLoc dl(Op);

	MVT VT = Op.getSimpleValueType();
	MVT ExtVT = VT.getVectorElementType();
	unsigned NumElems = Op.getNumOperands();

	// Generate vectors for predicate vectors.
	if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
	return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

	if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
	return VectorConstant;

	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
	// TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
	// transform here.
	if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
	return AddSub;
	if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
	return HorizontalOp;
	if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
	return Broadcast;
	if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
	return BitOp;

	unsigned EVTBits = ExtVT.getSizeInBits();

	unsigned NumZero = 0;
	unsigned NumNonZero = 0;
	uint64_t NonZeros = 0;
	bool IsAllConstants = true;
	SmallSet<SDValue, 8> Values;
	unsigned NumConstants = NumElems;
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (Elt.isUndef())
	continue;
	Values.insert(Elt);
	if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
	IsAllConstants = false;
	NumConstants--;
	}
	if (X86::isZeroNode(Elt))
	NumZero++;
	else {
	assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
	NonZeros \|= ((uint64_t)1 << i);
	NumNonZero++;
	}
	}

	// All undef vector. Return an UNDEF. All zero vectors were handled above.
	if (NumNonZero == 0)
	return DAG.getUNDEF(VT);

	// If we are inserting one variable into a vector of non-zero constants, try
	// to avoid loading each constant element as a scalar. Load the constants as a
	// vector and then insert the variable scalar element. If insertion is not
	// supported, we assume that we will fall back to a shuffle to get the scalar
	// blended with the constants. Insertion into a zero vector is handled as a
	// special-case somewhere below here.
	LLVMContext &Context = *DAG.getContext();
	if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
	(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) \|\|
	isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
	// Create an all-constant vector. The variable element in the old
	// build vector is replaced by undef in the constant vector. Save the
	// variable scalar element and its index for use in the insertelement.
	Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
	SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
	SDValue VarElt;
	SDValue InsIndex;
	for (unsigned i = 0; i != NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (auto *C = dyn_cast<ConstantSDNode>(Elt))
	ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
	else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
	ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
	else if (!Elt.isUndef()) {
	assert(!VarElt.getNode() && !InsIndex.getNode() &&
	"Expected one variable element in this vector");
	VarElt = Elt;
	InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
	}
	}
	Constant *CV = ConstantVector::get(ConstVecOps);
	SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

	// The constants we just created may not be legal (eg, floating point). We
	// must lower the vector right here because we can not guarantee that we'll
	// legalize it before loading it. This is also why we could not just create
	// a new build vector here. If the build vector contains illegal constants,
	// it could get split back up into a series of insert elements.
	// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
	SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
	MachineFunction &MF = DAG.getMachineFunction();
	MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
	SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
	return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
	}

	// Special case for single non-zero, non-undef, element.
	if (NumNonZero == 1) {
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);

	// If this is an insertion of an i64 value on x86-32, and if the top bits of
	// the value are obviously zero, truncate the value to i32 and do the
	// insertion that way. Only do this if the value is non-constant or if the
	// value is a constant being inserted into element 0. It is cheaper to do
	// a constant pool load than it is to do a movd + shuffle.
	if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
	(!IsAllConstants \|\| Idx == 0)) {
	if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
	// Handle SSE only.
	assert(VT == MVT::v2i64 && "Expected an SSE value type!");
	MVT VecVT = MVT::v4i32;

	// Truncate the value (which may itself be a constant) to i32, and
	// convert it to a vector with movd (S2V+shuffle to zero extend).
	Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
	return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
	Item, Idx * 2, true, Subtarget, DAG));
	}
	}

	// If we have a constant or non-constant insertion into the low element of
	// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
	// the rest of the elements. This will be matched as movd/movq/movss/movsd
	// depending on what the source datatype is.
	if (Idx == 0) {
	if (NumZero == 0)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

	if (ExtVT == MVT::i32 \|\| ExtVT == MVT::f32 \|\| ExtVT == MVT::f64 \|\|
	(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
	assert((VT.is128BitVector() \|\| VT.is256BitVector() \|\|
	VT.is512BitVector()) &&
	"Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
	return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}

	// We can't directly insert an i8 or i16 into a vector, so zero extend
	// it to i32 first.
	if (ExtVT == MVT::i16 \|\| ExtVT == MVT::i8) {
	Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
	if (VT.getSizeInBits() >= 256) {
	MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
	if (Subtarget.hasAVX()) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	} else {
	// Without AVX, we need to extend to a 128-bit vector and then
	// insert into the 256-bit vector.
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
	Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
	}
	} else {
	assert(VT.is128BitVector() && "Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}
	return DAG.getBitcast(VT, Item);
	}
	}

	// Is it a vector logical left shift?
	if (NumElems == 2 && Idx == 1 &&
	X86::isZeroNode(Op.getOperand(0)) &&
	!X86::isZeroNode(Op.getOperand(1))) {
	unsigned NumBits = VT.getSizeInBits();
	return getVShift(true, VT,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	VT, Op.getOperand(1)),
	NumBits/2, DAG, *this, dl);
	}

	if (IsAllConstants) // Otherwise, it's better to do a constpool load.
	return SDValue();

	// Otherwise, if this is a vector with i32 or f32 elements, and the element
	// is a non-constant being inserted into an element other than the low one,
	// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
	// movd/movss) to move this into the low element, then shuffle it into
	// place.
	if (EVTBits == 32) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
	}
	}

	// Splat is obviously ok. Let legalizer expand it to a shuffle.
	if (Values.size() == 1) {
	if (EVTBits == 32) {
	// Instead of a shuffle like this:
	// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
	// Check if it's possible to issue this instead.
	// shuffle (vload ptr)), undef, <1, 1, 1, 1>
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);
	if (Op.getNode()->isOnlyUserOf(Item.getNode()))
	return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
	}
	return SDValue();
	}

	// A vector full of immediates; various special cases are already
	// handled, so this is best done with a single constant-pool load.
	if (IsAllConstants)
	return SDValue();

	if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
	return V;

	// See if we can use a vector load to get all of the elements.
	if (VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()) {
	SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
	return LD;
	}

	// For AVX-length vectors, build the individual 128-bit pieces and use
	// shuffles to put them in place.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);

	// Build both the lower and upper subvector.
	SDValue Lower =
	DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
	SDValue Upper = DAG.getBuildVector(
	HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

	// Recreate the wider vector with the lower and upper part.
	if (VT.is256BitVector())
	return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	}

	// Let legalizer expand 2-wide build_vectors.
	if (EVTBits == 64) {
	if (NumNonZero == 1) {
	// One half is zero or undef.
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
	Op.getOperand(Idx));
	return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
	}
	return SDValue();
	}

	// If element VT is < 32 bits, convert it to inserts into a zero vector.
	if (EVTBits == 8 && NumElems == 16)
	if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	if (EVTBits == 16 && NumElems == 8)
	if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
	if (EVTBits == 32 && NumElems == 4)
	if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
	return V;

	// If element VT is == 32 bits, turn it into a number of shuffles.
	if (NumElems == 4 && NumZero > 0) {
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < 4; ++i) {
	bool isZero = !(NonZeros & (1ULL << i));
	if (isZero)
	Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
	else
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	}

	for (unsigned i = 0; i < 2; ++i) {
	switch ((NonZeros >> (i*2)) & 0x3) {
	default: llvm_unreachable("Unexpected NonZero count");
	case 0:
	Ops[i] = Ops[i*2]; // Must be a zero vector.
	break;
	case 1:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2+1], Ops[i2]);
	break;
	case 2:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	case 3:
	Ops[i] = getUnpackl(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	}
	}

	bool Reverse1 = (NonZeros & 0x3) == 2;
	bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
	int MaskVec[] = {
	Reverse1 ? 1 : 0,
	Reverse1 ? 0 : 1,
	static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
	static_cast<int>(Reverse2 ? NumElems : NumElems+1)
	};
	return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
	}

	assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

	// Check for a build vector from mostly shuffle plus few inserting.
	if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
	return Sh;

	// For SSE 4.1, use insertps to put the high elements into the low element.
	if (Subtarget.hasSSE41()) {
	SDValue Result;
	if (!Op.getOperand(0).isUndef())
	Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
	else
	Result = DAG.getUNDEF(VT);

	for (unsigned i = 1; i < NumElems; ++i) {
	if (Op.getOperand(i).isUndef()) continue;
	Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	}
	return Result;
	}

	// Otherwise, expand into a number of unpckl*, start by extending each of
	// our (non-undef) elements to the full vector width with the element in the
	// bottom slot of the vector (which generates no code for SSE).
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (!Op.getOperand(i).isUndef())
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	else
	Ops[i] = DAG.getUNDEF(VT);
	}

	// Next, we iteratively mix elements, e.g. for v4f32:
	// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
	// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
	// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
	for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
	// Generate scaled UNPCKL shuffle mask.
	SmallVector<int, 16> Mask;
	for(unsigned i = 0; i != Scale; ++i)
	Mask.push_back(i);
	for (unsigned i = 0; i != Scale; ++i)
	Mask.push_back(NumElems+i);
	Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

	for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
	Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2i], Ops[(2i)+1], Mask);
	}
	return Ops[0];
	}

	/// Lower a 256-bit or 512-bit CONCAT_VECTORS node by chaining subvector
	/// concatenations.
	///
	/// A 256-bit result is formed from operands 0 and 1 with concat128BitVectors.
	/// A 512-bit result is formed either directly from operands 0 and 1 with
	/// concat256BitVectors, or - when the node has four operands - by first
	/// pairing them into two half-width vectors and then joining the halves.
	static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  const bool Is256Bit = ResVT.is256BitVector();

	  assert((Is256Bit || ResVT.is512BitVector()) &&
	         "Value type must be 256-/512-bit wide");

	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  unsigned NumElems = ResVT.getVectorNumElements();
	  if (Is256Bit)
	    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

	  // 512-bit result built directly from two operands.
	  if (Op.getNumOperands() != 4)
	    return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

	  // 512-bit result built from four operands: join them pairwise into two
	  // half-width vectors, then join the halves.
	  MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), NumElems / 2);
	  SDValue V3 = Op.getOperand(2);
	  SDValue V4 = Op.getOperand(3);
	  SDValue LoHalf = concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl);
	  SDValue HiHalf = concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl);
	  return concat256BitVectors(LoHalf, HiHalf, ResVT, NumElems, DAG, dl);
	}

	/// Return true when every operand of the CONCAT_VECTORS node \p Op other than
	/// operand 0 is an all-zeros build vector, i.e. the node has the shape
	/// (CONCAT_VECTORS X, 0, 0, ..., 0).
	static bool isExpandWithZeros(const SDValue &Op) {
	  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
	         "Expand with zeros only possible in CONCAT_VECTORS nodes!");

	  // Walk the trailing operands and stop at the first one that is not zero.
	  unsigned OpIdx = 1;
	  while (OpIdx < Op.getNumOperands() &&
	         ISD::isBuildVectorAllZeros(Op.getOperand(OpIdx).getNode()))
	    ++OpIdx;

	  // We only ran off the end if no non-zero operand was found.
	  return OpIdx >= Op.getNumOperands();
	}

	/// Detect a vXi1 type promotion (widening by concatenating/inserting into
	/// zeros) of a node that already zeroes the upper bits of its k-register
	/// result.
	///
	/// Starting at the CONCAT_VECTORS node \p Op, this walks through
	///  - INSERT_SUBVECTOR nodes that insert at index 0 into an all-zeros vector,
	///  - CONCAT_VECTORS nodes whose operands after the first are all zeros,
	/// until a node with a different opcode is reached.
	///
	/// \returns that innermost node if its opcode satisfies
	/// isMaskedZeroUpperBitsvXi1, or if it is an AND with at least one operand
	/// whose opcode does; otherwise returns an empty SDValue.
	static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
	  unsigned Opc = Op.getOpcode();

	  assert(Opc == ISD::CONCAT_VECTORS &&
	         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected node to check for type promotion!");

	  // As long as we are concatenating zeros to the upper part of a previous
	  // node result, climb up the tree until a node with a different opcode is
	  // encountered.
	  for (; Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS;
	       Opc = Op.getOpcode()) {
	    if (Opc == ISD::INSERT_SUBVECTOR) {
	      const bool InsertsIntoZerosAtIdx0 =
	          ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
	          Op.getConstantOperandVal(2) == 0;
	      if (!InsertsIntoZerosAtIdx0)
	        return SDValue();
	      Op = Op.getOperand(1);
	      continue;
	    }

	    // Opc == ISD::CONCAT_VECTORS
	    if (!isExpandWithZeros(Op))
	      return SDValue();
	    Op = Op.getOperand(0);
	  }

	  auto ZeroesUpperBits = [](SDValue V) {
	    return isMaskedZeroUpperBitsvXi1(V.getOpcode());
	  };

	  // Accept the first inserted node if it zeroes the upper bits itself...
	  if (ZeroesUpperBits(Op))
	    return Op;

	  // ...or if it is an 'and' of a node that zeroes the upper bits (its masked
	  // version).
	  if (Op.getOpcode() == ISD::AND &&
	      (ZeroesUpperBits(Op.getOperand(0)) || ZeroesUpperBits(Op.getOperand(1))))
	    return Op;

	  return SDValue();
	}

	/// Lower a CONCAT_VECTORS node whose result is a vector of i1.
	///
	/// Strategy, in order:
	///  1. If the node is a zero-extension of a value that already has zeroed
	///     upper bits (see isTypePromotionOfi1ZeroUpBits), emit a single
	///     INSERT_SUBVECTOR into a zero vector.
	///  2. If at most one operand is neither undef nor all-zeros, insert that
	///     operand (if any) into a zero or undef vector.
	///  3. If there are more than two operands, split into two half-width
	///     CONCAT_VECTORS nodes and concatenate those.
	///  4. Otherwise (two non-zero operands): return the node unchanged when the
	///     result has at least 16 elements, else emit two INSERT_SUBVECTORs.
	static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  unsigned NumOperands = Op.getNumOperands();
	  unsigned NumElems = ResVT.getVectorNumElements();

	  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
	         "Unexpected number of operands in CONCAT_VECTORS");

	  // If this node promotes - by concatenating zeroes - the type of the result
	  // of a node with instruction that zeroes all upper (irrelevant) bits of the
	  // output register, mark it as legal and catch the pattern in instruction
	  // selection to avoid emitting extra instructions (for zeroing upper bits).
	  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
	    SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
	    SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
	                       ZeroC);
	  }

	  // Classify the operands: undef operands are ignored, all-zeros operands
	  // are counted, and everything else is recorded as a bit in NonZeros.
	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  uint64_t NonZeros = 0;
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    SDValue SubVec = Op.getOperand(i);
	    if (SubVec.isUndef())
	      continue;
	    if (ISD::isBuildVectorAllZeros(SubVec.getNode())) {
	      ++NumZero;
	      continue;
	    }
	    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	    NonZeros |= (uint64_t)1 << i;
	    ++NumNonZero;
	  }

	  // If there are zero or one non-zeros we can handle this very simply.
	  if (NumNonZero <= 1) {
	    // Use a zero base only if some operand actually requires zeros.
	    SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
	                          : DAG.getUNDEF(ResVT);
	    if (!NumNonZero)
	      return Vec;
	    unsigned Idx = countTrailingZeros(NonZeros);
	    SDValue SubVec = Op.getOperand(Idx);
	    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
	                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
	  }

	  // More than two operands: concatenate each half, then join the halves.
	  if (NumOperands > 2) {
	    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), NumElems / 2);
	    ArrayRef<SDUse> Ops = Op->ops();
	    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(0, NumOperands / 2));
	    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(NumOperands / 2));
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	  }

	  assert(NumNonZero == 2 && "Simple cases not handled?");

	  if (NumElems >= 16)
	    return Op; // The operation is legal with KUNPCK

	  // Insert operand 0 into the low half and operand 1 into the high half.
	  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
	                            DAG.getUNDEF(ResVT), Op.getOperand(0),
	                            DAG.getIntPtrConstant(0, dl));
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
	                     DAG.getIntPtrConstant(NumElems / 2, dl));
	}

	/// Entry point for lowering CONCAT_VECTORS.
	///
	/// Vectors of i1 are dispatched to LowerCONCAT_VECTORSvXi1. Everything else
	/// must be a 256-bit result with two operands or a 512-bit result with two
	/// or four operands, and is handled by LowerAVXCONCAT_VECTORS.
	static SDValue LowerCONCAT_VECTORS(SDValue Op,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

	  assert(((VT.is256BitVector() && Op.getNumOperands() == 2) ||
	          (VT.is512BitVector() &&
	           (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))) &&
	         "Unexpected CONCAT_VECTORS type or operand count");

	  // AVX can use the vinsertf128 instruction to create 256-bit vectors
	  // from two other 128-bit ones.

	  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
	  return LowerAVXCONCAT_VECTORS(Op, DAG);
	}

	//===----------------------------------------------------------------------===//
	// Vector shuffle lowering
	//
	// This is an experimental code path for lowering vector shuffles on x86. It is
	// designed to handle arbitrary vector shuffles and blends, gracefully
	// degrading performance as necessary. It works hard to recognize idiomatic
	// shuffles and lower them to optimal instruction patterns without leaving
	// a framework that allows reasonably efficient handling of all vector shuffle
	// patterns.
	//===----------------------------------------------------------------------===//

	/// \brief Report whether a single-input shuffle mask moves nothing.
	///
	/// \p Mask is expected to be a single-input mask of the kind used by the X86
	/// shuffle instructions (not a fully general ShuffleVectorSDNode mask). An
	/// element counts as a no-op when it is undef (negative) or selects its own
	/// position; the mask is a no-op when every element does.
	static bool isNoopShuffleMask(ArrayRef<int> Mask) {
	  int Pos = 0;
	  for (int M : Mask) {
	    assert(M >= -1 && "Out of bound mask element!");
	    const bool StaysInPlace = M < 0 || M == Pos;
	    if (!StaysInPlace)
	      return false;
	    ++Pos;
	  }
	  return true;
	}

	/// \brief Return true if any defined mask element reads from a different
	/// 128-bit lane than the one it is written to.
	///
	/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
	/// and we routinely test for these. Indices referring to the second input are
	/// reduced modulo the mask size before their lane is computed.
	static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  const int NumElts = Mask.size();
	  for (int DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
	    const int M = Mask[DstIdx];
	    if (M < 0)
	      continue; // Undef elements never cross lanes.
	    const int SrcLane = (M % NumElts) / EltsPerLane;
	    const int DstLane = DstIdx / EltsPerLane;
	    if (SrcLane != DstLane)
	      return true;
	  }
	  return false;
	}

	/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
	///
	/// This checks a shuffle mask to see if it is performing the same
	/// lane-relative shuffle in each sub-lane of \p LaneSizeInBits bits. This
	/// trivially implies that it is also not lane-crossing. It may however
	/// involve a blend from the same lane of a second vector.
	///
	/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
	/// non-trivial to compute in the face of undef lanes. The representation is
	/// suitable for use with existing 128-bit shuffles as entries from the second
	/// vector have been remapped to [LaneSize, 2*LaneSize).
	///
	/// \returns false as soon as a lane-crossing or non-repeating element is
	/// found; \p RepeatedMask is then only partially filled.
	static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                  ArrayRef<int> Mask,
	                                  SmallVectorImpl<int> &RepeatedMask) {
	  // Kept as a signed int (like isRepeatedTargetShuffleMask) so the index
	  // arithmetic below does not mix signed and unsigned operands.
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    const int M = Mask[i];
	    assert(M == SM_SentinelUndef || M >= 0);
	    if (M < 0)
	      continue;
	    if ((M % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM = M % LaneSize;
	    if (M >= Size)
	      LocalM += LaneSize;

	    int &Slot = RepeatedMask[i % LaneSize];
	    if (Slot < 0)
	      // This is the first non-undef entry in this slot of a lane.
	      Slot = LocalM;
	    else if (Slot != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Convenience wrapper around isRepeatedShuffleMask for 128-bit lanes: checks
	/// that \p Mask performs the same shuffle in every 128-bit lane and stores the
	/// per-lane pattern in \p RepeatedMask.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneSizeInBits = 128;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
	}

	/// Convenience wrapper around isRepeatedShuffleMask for 256-bit lanes: checks
	/// that \p Mask performs the same shuffle in every 256-bit lane and stores the
	/// per-lane pattern in \p RepeatedMask.
	static bool
	is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneSizeInBits = 256;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
	}

	/// Test whether a target shuffle mask is equivalent within each sub-lane.
	/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
	///
	/// A zero element is only compatible with a slot that is still undef or
	/// already zero, and it marks that slot as zero in \p RepeatedMask. Indices
	/// into the second vector are remapped to [LaneSize, 2*LaneSize).
	///
	/// \returns false as soon as a lane-crossing or non-repeating element is
	/// found; \p RepeatedMask is then only partially filled.
	static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                        ArrayRef<int> Mask,
	                                        SmallVectorImpl<int> &RepeatedMask) {
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    const int M = Mask[i];
	    assert(isUndefOrZero(M) || (M >= 0));
	    if (M == SM_SentinelUndef)
	      continue;

	    int &Slot = RepeatedMask[i % LaneSize];
	    if (M == SM_SentinelZero) {
	      // A zero can only repeat over undef or another zero.
	      if (!isUndefOrZero(Slot))
	        return false;
	      Slot = SM_SentinelZero;
	      continue;
	    }
	    if ((M % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM = M % LaneSize;
	    if (M >= Size)
	      LocalM += LaneSize;

	    if (Slot == SM_SentinelUndef)
	      // This is the first non-undef entry in this slot of a lane.
	      Slot = LocalM;
	    else if (Slot != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
	/// expected indices.
	///
	/// This is a fast way to test a shuffle mask against a fixed pattern:
	///
	///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
	///
	/// It returns true if the mask is exactly as wide as \p ExpectedMask and each
	/// element of the mask is either -1 (signifying undef), equal to the expected
	/// value, or - when the inputs involved are BUILD_VECTOR nodes - selects the
	/// same build-vector operand as the expected index does.
	static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
	                                ArrayRef<int> ExpectedMask) {
	  if (Mask.size() != ExpectedMask.size())
	    return false;

	  int Size = Mask.size();

	  // If the values are build vectors, we can look through them to find
	  // equivalent inputs that make the shuffles equivalent.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    if (Mask[i] < 0 || Mask[i] == ExpectedMask[i])
	      continue;

	    // The indices differ; they can still be equivalent if both refer to the
	    // same scalar operand of the underlying build vector(s).
	    auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
	    auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
	    if (!MaskBV || !ExpectedBV)
	      return false;
	    if (MaskBV->getOperand(Mask[i] % Size) !=
	        ExpectedBV->getOperand(ExpectedMask[i] % Size))
	      return false;
	  }

	  return true;
	}

	/// Compare a target shuffle mask against an explicit expected pattern.
	///
	/// Both masks must have the same number of elements. An SM_SentinelUndef (-1)
	/// element in \p Mask matches anything in \p ExpectedMask. Any other element
	/// must be identical in both masks; SM_SentinelZero is the only other
	/// negative value tolerated in \p Mask.
	static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
	                                      ArrayRef<int> ExpectedMask) {
	  const int NumElts = Mask.size();
	  if (NumElts != (int)ExpectedMask.size())
	    return false;

	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M == SM_SentinelUndef)
	      continue; // Undef is a wildcard.

	    const bool IsBadSentinel = M < 0 && M != SM_SentinelZero;
	    if (IsBadSentinel || M != ExpectedMask[Idx])
	      return false;
	  }

	  return true;
	}

	// Combine a general DAG shuffle mask with a per-element "zeroable" bit mask
	// into a target shuffle mask: undef elements stay SM_SentinelUndef, defined
	// elements flagged in Zeroable become SM_SentinelZero, and all remaining
	// elements keep their original index.
	static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
	                                                    const APInt &Zeroable) {
	  const int NumElts = Mask.size();
	  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

	  // Start from all-undef and overwrite only the defined elements.
	  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M != SM_SentinelUndef) {
	      assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
	      if (Zeroable[Idx])
	        TargetMask[Idx] = SM_SentinelZero;
	      else
	        TargetMask[Idx] = M;
	    }
	  }
	  return TargetMask;
	}

	// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
	// instructions.
	//
	// Only v8i32/v8f32 masks are considered. The mask is compared against the
	// binary unpack-lo and unpack-hi patterns generated for MVT::v8i16.
	static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
	  if (VT != MVT::v8i32 && VT != MVT::v8f32)
	    return false;

	  SmallVector<int, 8> Unpcklwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
	                          /* Unary = */ false);
	  if (isTargetShuffleEquivalent(Mask, Unpcklwd))
	    return true;

	  SmallVector<int, 8> Unpckhwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
	                          /* Unary = */ false);
	  return isTargetShuffleEquivalent(Mask, Unpckhwd);
	}

	/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
	///
	/// This helper function produces an 8-bit shuffle immediate corresponding to
	/// the ubiquitous shuffle encoding scheme used in x86 instructions for
	/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
	/// example.
	///
	/// Element i of \p Mask occupies bits [2*i, 2*i+1] of the result.
	///
	/// NB: We rely heavily on "undef" masks preserving the input lane, so an
	/// undef element i is encoded as index i.
	static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
	  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

	  unsigned Imm = 0;
	  for (int i = 0; i != 4; ++i) {
	    assert(Mask[i] >= -1 && Mask[i] < 4 && "Out of bound mask element!");
	    Imm |= (Mask[i] < 0 ? i : Mask[i]) << (2 * i);
	  }
	  return Imm;
	}

	/// Build an i8 constant node holding the 4-lane shuffle immediate that
	/// getV4X86ShuffleImm computes for \p Mask.
	static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
	                                          SelectionDAG &DAG) {
	  const unsigned Imm = getV4X86ShuffleImm(Mask);
	  return DAG.getConstant(Imm, DL, MVT::i8);
	}

	/// \brief Compute whether each element of a shuffle is zeroable.
	///
	/// A "zeroable" vector shuffle element is one which can be lowered to zero.
	/// Either it is an undef element in the shuffle mask, the element of the input
	/// referenced is undef, or the element of the input referenced is known to be
	/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
	/// as many lanes with this technique as possible to simplify the remaining
	/// shuffle.
	///
	/// Both inputs are inspected after looking through bitcasts. Apart from
	/// all-zeros inputs, only BUILD_VECTOR inputs are searched for undef/zero
	/// elements, including when the build vector has a different (evenly
	/// dividing) element count than the mask.
	///
	/// \returns an APInt with one bit per mask element; a set bit means the
	/// element is zeroable.
	static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
	                                            SDValue V1, SDValue V2) {
	  APInt Zeroable(Mask.size(), 0);
	  V1 = peekThroughBitcasts(V1);
	  V2 = peekThroughBitcasts(V2);

	  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

	  int VectorSizeInBits = V1.getValueSizeInBits();
	  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
	  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

	  // Returns true if the ScalarSizeInBits-wide slice number SubIdx (counted
	  // from the least significant bits) of the constant Val is zero.
	  auto IsSliceZero = [ScalarSizeInBits](APInt Val, int SubIdx) {
	    Val.lshrInPlace(SubIdx * ScalarSizeInBits);
	    return Val.getLoBits(ScalarSizeInBits) == 0;
	  };

	  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	    int M = Mask[i];
	    // Handle the easy cases.
	    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
	      Zeroable.setBit(i);
	      continue;
	    }

	    // Determine shuffle input and normalize the mask.
	    SDValue V = M < Size ? V1 : V2;
	    M %= Size;

	    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
	    if (V.getOpcode() != ISD::BUILD_VECTOR)
	      continue;

	    const int NumBVOps = V.getNumOperands();

	    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
	    // the (larger) source element must be UNDEF/ZERO.
	    if ((Size % NumBVOps) == 0) {
	      int Scale = Size / NumBVOps;
	      SDValue Op = V.getOperand(M / Scale);
	      if (Op.isUndef() || X86::isZeroNode(Op)) {
	        Zeroable.setBit(i);
	      } else if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
	        if (IsSliceZero(Cst->getAPIntValue(), M % Scale))
	          Zeroable.setBit(i);
	      } else if (auto *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
	        if (IsSliceZero(CstFP->getValueAPF().bitcastToAPInt(), M % Scale))
	          Zeroable.setBit(i);
	      }
	      continue;
	    }

	    // If the BUILD_VECTOR has more elements then all the (smaller) source
	    // elements must be UNDEF or ZERO.
	    if ((NumBVOps % Size) == 0) {
	      int Scale = NumBVOps / Size;
	      bool AllZeroable = true;
	      for (int j = 0; j < Scale; ++j) {
	        SDValue Op = V.getOperand((M * Scale) + j);
	        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
	      }
	      if (AllZeroable)
	        Zeroable.setBit(i);
	    }
	  }

	  return Zeroable;
	}

	// Checks for a shuffle whose result has the form
	//   0*a[0]0*a[1]...0*a[n], n >= 0
	// i.e. runs of zeroable elements interleaved with source elements a[] taken
	// in ascending, consecutive order.
	// Each bit of Zeroable corresponds to the Mask element at the same index, as
	// described in computeZeroableShuffleElements.
	//
	// Returns false if the mask contains an undef element, if the first
	// non-zeroable element is neither index 0 nor the element count of
	// VectorType, or if any later non-zeroable element does not continue the
	// consecutive sequence. When a non-zeroable element is seen, IsZeroSideLeft
	// is set to true if the sequence starts at VectorType's element count and to
	// false if it starts at 0.
	static bool isNonZeroElementsInOrder(const APInt &Zeroable,
	                                     ArrayRef<int> Mask, const EVT &VectorType,
	                                     bool &IsZeroSideLeft) {
	  // Index the next non-zeroable element must have; negative until the first
	  // such element has been seen.
	  int Expected = -1;
	  const int NumMaskElts = Mask.size();
	  for (int Idx = 0; Idx < NumMaskElts; ++Idx) {
	    const int M = Mask[Idx];
	    assert(M >= -1 && "Out of bound mask element!");
	    // Undef elements are not accepted.
	    if (M < 0)
	      return false;
	    if (Zeroable[Idx])
	      continue;

	    // The first non-zeroable element decides where the sequence starts.
	    if (Expected < 0) {
	      Expected = (M != 0) ? VectorType.getVectorNumElements() : 0;
	      IsZeroSideLeft = Expected != 0;
	    }

	    // Bail out if the non-zeroable elements are not consecutive.
	    if (M != Expected)
	      return false;
	    ++Expected;
	  }
	  return true;
	}

	/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
	///
	/// A byte-level control vector is built from \p Mask: undef elements become
	/// undef bytes, zeroable elements become 0x80 (sign bit set, which PSHUFB
	/// treats as "write zero"), and every other element is expanded to the byte
	/// offsets of the selected element within its 128-bit lane.
	///
	/// \returns an empty SDValue if the non-zeroable elements come from both
	/// inputs or if any of them would have to cross a 128-bit lane.
	static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
	                                            ArrayRef<int> Mask, SDValue V1,
	                                            SDValue V2,
	                                            const APInt &Zeroable,
	                                            const X86Subtarget &Subtarget,
	                                            SelectionDAG &DAG) {
	  int Size = Mask.size();
	  int LaneSize = 128 / VT.getScalarSizeInBits();
	  const int NumBytes = VT.getSizeInBits() / 8;
	  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

	  assert(((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
	          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
	          (Subtarget.hasBWI() && VT.is512BitVector())) &&
	         "PSHUFB is not available for this vector width");

	  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
	  // Sign bit set in i8 mask means zero element.
	  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

	  SDValue V;
	  for (int i = 0; i < NumBytes; ++i) {
	    // Index of the vector element that byte i belongs to.
	    const int EltIdx = i / NumEltBytes;
	    int M = Mask[EltIdx];
	    if (M < 0) {
	      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
	      continue;
	    }
	    if (Zeroable[EltIdx]) {
	      PSHUFBMask[i] = ZeroMask;
	      continue;
	    }

	    // We can only use a single input of V1 or V2.
	    SDValue SrcV = (M >= Size ? V2 : V1);
	    if (V && V != SrcV)
	      return SDValue();
	    V = SrcV;
	    M %= Size;

	    // PSHUFB can't cross lanes, ensure this doesn't happen.
	    if ((M / LaneSize) != (EltIdx / LaneSize))
	      return SDValue();

	    // Convert the lane-relative element index into a lane-relative byte
	    // index for this particular byte of the element.
	    M = M % LaneSize;
	    M = M * NumEltBytes + (i % NumEltBytes);
	    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
	  }
	  assert(V && "Failed to find a source input");

	  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
	  return DAG.getBitcast(
	      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
	                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
	}

	// Forward declaration so that lowerVectorShuffleToEXPAND below can call
	// getMaskNode; there it is passed an integer constant together with a vXi1
	// vector type. NOTE(review): the definition is not in this part of the
	// file - confirm its exact contract there.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	const SDLoc &dl);

	// X86 has dedicated shuffle that can be lowered to VEXPAND.
	//
	// The shuffle qualifies when its non-zeroable elements are consecutive
	// elements of a single input (see isNonZeroElementsInOrder). The result is
	// a select between an X86ISD::EXPAND of that input and a zero vector, driven
	// by a vXi1 mask that has a bit set for every non-zeroable element.
	//
	// Returns an empty SDValue when the shuffle does not have that shape.
	static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
	                                          const APInt &Zeroable,
	                                          ArrayRef<int> Mask, SDValue &V1,
	                                          SDValue &V2, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  bool IsLeftZeroSide = true;
	  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
	                                IsLeftZeroSide))
	    return SDValue();

	  // Validate the element count before it is used to pick the mask types.
	  unsigned NumElts = VT.getVectorNumElements();
	  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
	         "Unexpected number of vector elements");

	  // One mask bit per element; set for every element that is NOT zeroable.
	  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
	  // The integer holding the mask is at least 8 bits wide.
	  MVT IntegerType = MVT::getIntegerVT(std::max((int)NumElts, 8));
	  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
	  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
	                              Subtarget, DAG, DL);
	  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
	  // isNonZeroElementsInOrder reports whether the sequence started at the
	  // second input's index range; pick the matching input.
	  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
	  return DAG.getSelect(DL, VT, VMask,
	                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
	                       ZeroVector);
	}

	/// Try to match a target shuffle mask against the UNPCKL/UNPCKH patterns.
	///
	/// On success \p UnpackOpcode is set to X86ISD::UNPCKL or X86ISD::UNPCKH and
	/// \p V1 / \p V2 are rewritten to the operands the unpack node should use
	/// (possibly undef, zero vectors, duplicated, or swapped). The following
	/// matches are attempted in order:
	///  1. the plain unpack-lo / unpack-hi pattern for \p VT (unary or binary
	///     according to \p IsUnary),
	///  2. for unary shuffles where all even or all odd elements are undef/zero,
	///     an unpack of V1 with a zero vector,
	///  3. for binary shuffles, the commuted unpack-lo / unpack-hi pattern.
	///
	/// \returns true if a match was found; \p V1, \p V2 and \p UnpackOpcode are
	/// only modified in that case.
	static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
	                                        unsigned &UnpackOpcode, bool IsUnary,
	                                        ArrayRef<int> TargetMask, SDLoc &DL,
	                                        SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  int NumElts = VT.getVectorNumElements();

	  // Classify the even (1) and odd (2) mask elements: are they all undef,
	  // and are they all undef-or-zero?
	  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
	  for (int i = 0; i != NumElts; i += 2) {
	    int M1 = TargetMask[i + 0];
	    int M2 = TargetMask[i + 1];
	    Undef1 &= (SM_SentinelUndef == M1);
	    Undef2 &= (SM_SentinelUndef == M2);
	    Zero1 &= isUndefOrZero(M1);
	    Zero2 &= isUndefOrZero(M2);
	  }
	  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
	         "Zeroable shuffle detected");

	  // Rewrites V1/V2 for a direct match. V2 is assigned first because it may
	  // need the original V1.
	  auto UseDirectOperands = [&]() {
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	  };

	  // Attempt to match the target mask against the unpack lo/hi mask patterns.
	  SmallVector<int, 64> Unpckl, Unpckh;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	    UnpackOpcode = X86ISD::UNPCKL;
	    UseDirectOperands();
	    return true;
	  }

	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	    UnpackOpcode = X86ISD::UNPCKH;
	    UseDirectOperands();
	    return true;
	  }

	  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
	  if (IsUnary && (Zero1 || Zero2)) {
	    // Don't bother if we can blend instead.
	    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
	        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
	      return false;

	    bool MatchLo = true, MatchHi = true;
	    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
	      int M = TargetMask[i];
	      const bool IsEvenSlot = (i & 1) == 0;

	      // Ignore if the input is known to be zero or the index is undef.
	      if ((IsEvenSlot && Zero1) || (!IsEvenSlot && Zero2) ||
	          (M == SM_SentinelUndef))
	        continue;

	      MatchLo &= (M == Unpckl[i]);
	      MatchHi &= (M == Unpckh[i]);
	    }

	    if (MatchLo || MatchHi) {
	      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	      // V2 must be computed before V1 is overwritten.
	      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      return true;
	    }
	  }

	  // If a binary shuffle, commute and try again.
	  if (!IsUnary) {
	    ShuffleVectorSDNode::commuteMask(Unpckl);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	      UnpackOpcode = X86ISD::UNPCKL;
	      std::swap(V1, V2);
	      return true;
	    }

	    ShuffleVectorSDNode::commuteMask(Unpckh);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	      UnpackOpcode = X86ISD::UNPCKH;
	      std::swap(V1, V2);
	      return true;
	    }
	  }

	  return false;
	}

	// X86 has dedicated unpack instructions that can handle specific blend
	// operations: UNPCKH and UNPCKL.
	//
	// Tries, in order, the binary unpack-lo and unpack-hi patterns for VT and
	// then the same two patterns with the inputs commuted. Returns the matching
	// X86ISD::UNPCKL / X86ISD::UNPCKH node, or an empty SDValue if Mask matches
	// none of them.
	static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  SmallVector<int, 8> Unpckl;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

	  SmallVector<int, 8> Unpckh;
	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

	  // Commute and try again.
	  ShuffleVectorSDNode::commuteMask(Unpckl);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

	  ShuffleVectorSDNode::commuteMask(Unpckh);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

	  return SDValue();
	}

	// X86 has dedicated pack instructions that can handle specific truncation
	// operations: PACKSS and PACKUS.
	//
	// Tries to match TargetMask against the binary and then the unary pack
	// shuffle pattern for VT. A pattern only counts as a match if the inputs,
	// viewed as vectors of double-width integers, are safe to pack:
	//  - PACKSS: each non-undef input has more than BitSize sign bits;
	//  - PACKUS (needs SSE4.1 unless the wide element is i16): the upper BitSize
	//    bits of each non-undef input are known zero.
	//
	// On success returns true and sets V1/V2 to the inputs bitcast to the wide
	// vector type, SrcVT to that type and PackOpcode to the chosen opcode. On
	// failure returns false and leaves the outputs untouched.
	static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
	                                       SDValue &V2, unsigned &PackOpcode,
	                                       ArrayRef<int> TargetMask,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned BitSize = VT.getScalarSizeInBits();
	  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
	  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

	  auto MatchPACK = [&](SDValue N1, SDValue N2) -> bool {
	    SDValue VV1 = DAG.getBitcast(PackVT, N1);
	    SDValue VV2 = DAG.getBitcast(PackVT, N2);

	    // Publish the result through the output parameters.
	    auto Commit = [&](unsigned Opcode) -> bool {
	      V1 = VV1;
	      V2 = VV2;
	      SrcVT = PackVT;
	      PackOpcode = Opcode;
	      return true;
	    };

	    // Signed saturation is lossless when the value fits in BitSize bits.
	    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
	        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize))
	      return Commit(X86ISD::PACKSS);

	    // Unsigned saturation is lossless when the upper half is known zero.
	    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
	      APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
	      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
	          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask)))
	        return Commit(X86ISD::PACKUS);
	    }

	    return false;
	  };

	  // Try binary shuffle.
	  SmallVector<int, 32> BinaryMask;
	  createPackShuffleMask(VT, BinaryMask, false);
	  if (isTargetShuffleEquivalent(TargetMask, BinaryMask) && MatchPACK(V1, V2))
	    return true;

	  // Try unary shuffle.
	  SmallVector<int, 32> UnaryMask;
	  createPackShuffleMask(VT, UnaryMask, true);
	  if (isTargetShuffleEquivalent(TargetMask, UnaryMask) && MatchPACK(V1, V1))
	    return true;

	  return false;
	}

	/// Try to lower a shuffle as a single PACKSS/PACKUS node.
	///
	/// Delegates the pattern match to matchVectorShuffleWithPACK and, on success,
	/// emits the chosen pack opcode on both inputs bitcast to the matched source
	/// type. Returns an empty SDValue when no pack pattern matches.
	static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
	                                          ArrayRef<int> Mask, SDValue V1,
	                                          SDValue V2, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  MVT PackVT;
	  unsigned PackOpcode;
	  const bool Matched = matchVectorShuffleWithPACK(VT, PackVT, V1, V2,
	                                                  PackOpcode, Mask, DAG,
	                                                  Subtarget);
	  if (!Matched)
	    return SDValue();

	  SDValue PackLHS = DAG.getBitcast(PackVT, V1);
	  SDValue PackRHS = DAG.getBitcast(PackVT, V2);
	  return DAG.getNode(PackOpcode, DL, VT, PackLHS, PackRHS);
	}

	/// \brief Try to lower a shuffle as an AND with a constant bit mask.
	///
	/// This applies when every element is either zeroable or taken in place from
	/// one and the same input: the result is that input ANDed with a vector that
	/// is all-ones in the kept elements and zero elsewhere. Returns an empty
	/// SDValue if an element is moved, if both inputs are needed, or if every
	/// element is zeroable.
	static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
	  MVT EltVT = VT.getVectorElementType();
	  SDValue Zero = DAG.getConstant(0, DL, EltVT);
	  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

	  // Start with an all-zero mask and open up the elements that are kept.
	  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
	  SDValue V;
	  const int Size = Mask.size();
	  for (int Idx = 0; Idx < Size; ++Idx) {
	    if (Zeroable[Idx])
	      continue;
	    if (Mask[Idx] % Size != Idx)
	      return SDValue(); // Not a blend.

	    SDValue Src = Mask[Idx] < Size ? V1 : V2;
	    if (V && V != Src)
	      return SDValue(); // Can only let one input through the mask.
	    V = Src;

	    VMaskOps[Idx] = AllOnes;
	  }
	  if (!V)
	    return SDValue(); // No non-zeroable elements!

	  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
	  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
	}

	/// \brief Try to emit a blend instruction for a shuffle using bit math.
	///
	/// This is used as a fallback approach when first class blend instructions are
	/// unavailable. Currently it is only suitable for integer vectors, but could
	/// be generalized for floating point vectors if desirable.
	///
	/// The result is computed as (V1 & M) | (~M & V2), where M is a constant
	/// vector that is all-ones for elements taken from V1 (or undef) and zero for
	/// elements taken from V2. Returns an empty SDValue if any element is moved
	/// to a different position.
	static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            SelectionDAG &DAG) {
	  assert(VT.isInteger() && "Only supports integer vector types!");
	  MVT EltVT = VT.getVectorElementType();
	  SDValue Zero = DAG.getConstant(0, DL, EltVT);
	  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

	  SmallVector<SDValue, 16> MaskOps;
	  int NumElts = Mask.size();
	  for (int Elt = 0; Elt != NumElts; ++Elt) {
	    int M = Mask[Elt];
	    bool InPlace = M < 0 || M == Elt || M == Elt + NumElts;
	    if (!InPlace)
	      return SDValue(); // Shuffled input!
	    // Undef elements (negative) also compare below NumElts and select V1.
	    if (M < NumElts)
	      MaskOps.push_back(AllOnes);
	    else
	      MaskOps.push_back(Zero);
	  }

	  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
	  SDValue MaskedV1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);

	  // We have to cast V2 around: ANDNP is emitted on a vector of i64 elements.
	  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
	  SDValue MaskedV2 =
	      DAG.getNode(X86ISD::ANDNP, DL, MaskVT, DAG.getBitcast(MaskVT, V1Mask),
	                  DAG.getBitcast(MaskVT, V2));
	  MaskedV2 = DAG.getBitcast(VT, MaskedV2);
	  return DAG.getNode(ISD::OR, DL, VT, MaskedV1, MaskedV2);
	}

	// Forward declaration so that lowerVectorShuffleAsBlend (below) can call this
	// helper; the definition is elsewhere. The blend lowering passes V2 as Op,
	// a constant integer built from the blend mask as Mask, and V1 as
	// PreservedSrc.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	SDValue PreservedSrc,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	/// Try to match \p TargetMask as an element-wise blend of \p V1 and \p V2.
	///
	/// Element i matches if it is undef, selects element i of V1 (value i) or
	/// selects element i of V2 (value i + Size). A zero sentinel is accepted only
	/// if V1 or V2 is undef or an all-zeros build vector; the mask element is
	/// then rewritten in place to read from that input and the corresponding
	/// Force*Zero flag is set.
	///
	/// \param TargetMask  mask to match; zero sentinels may be rewritten in place,
	///                    including on paths that later return false.
	/// \param ForceV1Zero set when a zero element was redirected to V1.
	/// \param ForceV2Zero set when a zero element was redirected to V2.
	/// \param BlendMask   receives one bit per element; bit i is set when element
	///                    i is taken from V2.
	/// \returns true if every element could be expressed as a blend.
	static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
	                                      MutableArrayRef<int> TargetMask,
	                                      bool &ForceV1Zero, bool &ForceV2Zero,
	                                      uint64_t &BlendMask) {
	  bool V1IsZeroOrUndef =
	      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZeroOrUndef =
	      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

	  BlendMask = 0;
	  ForceV1Zero = false, ForceV2Zero = false;
	  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

	  // Attempt to generate the binary blend mask. If an input is zero then
	  // we can use any lane.
	  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
	  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
	    int M = TargetMask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    // Element i of V1 stays in place: blend bit remains clear.
	    if (M == i)
	      continue;
	    // Element i of V2 stays in place: set the blend bit.
	    if (M == i + Size) {
	      BlendMask |= 1ull << i;
	      continue;
	    }
	    if (M == SM_SentinelZero) {
	      // Prefer taking the zero from V1; fall back to V2.
	      if (V1IsZeroOrUndef) {
	        ForceV1Zero = true;
	        TargetMask[i] = i;
	        continue;
	      }
	      if (V2IsZeroOrUndef) {
	        ForceV2Zero = true;
	        BlendMask |= 1ull << i;
	        TargetMask[i] = i + Size;
	        continue;
	      }
	    }
	    // Anything else moves an element across positions: not a blend.
	    return false;
	  }
	  return true;
	}

	/// Widen a per-element blend mask so each element covers \p Scale bits.
	///
	/// For every bit i (0 <= i < Size) set in \p BlendMask, the result has the
	/// \p Scale consecutive bits starting at bit i * Scale set. Bits of
	/// \p BlendMask at or above \p Size are ignored.
	///
	/// \param BlendMask one bit per original element.
	/// \param Size      number of original elements to consider.
	/// \param Scale     number of result bits per original element; must be
	///                  small enough that the shifts stay below 64 bits.
	/// \returns the scaled mask.
	static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
	                                            int Scale) {
	  uint64_t ScaledMask = 0;
	  for (int i = 0; i != Size; ++i)
	    if (BlendMask & (1ull << i))
	      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
	  return ScaledMask;
	}

	/// \brief Try to emit a blend instruction for a shuffle.
	///
	/// This doesn't do any checks for the availability of instructions for blending
	/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
	/// be matched in the backend with the type given. What it does check for is
	/// that the shuffle mask is a blend, or convertible into a blend with zero.
	///
	/// \param Original the shuffle mask as given by the caller; it is combined
	///                 with \p Zeroable into a target shuffle mask before
	///                 matching.
	/// \returns the lowered node, or an empty SDValue when the mask is not a
	///          blend.
	static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Original,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

	  uint64_t BlendMask = 0;
	  bool ForceV1Zero = false, ForceV2Zero = false;
	  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
	                                 BlendMask))
	    return SDValue();

	  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	  if (ForceV1Zero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  if (ForceV2Zero)
	    V2 = getZeroVector(VT, Subtarget, DAG, DL);

	  switch (VT.SimpleTy) {
	  case MVT::v2f64:
	  case MVT::v4f32:
	  case MVT::v4f64:
	  case MVT::v8f32:
	    // Floating point types blend directly with one immediate bit per element.
	    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
	                       DAG.getConstant(BlendMask, DL, MVT::i8));

	  case MVT::v4i64:
	  case MVT::v8i32:
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    LLVM_FALLTHROUGH;
	  case MVT::v2i64:
	  case MVT::v4i32:
	    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
	    // that instruction.
	    if (Subtarget.hasAVX2()) {
	      // Scale the blend by the number of 32-bit dwords per element.
	      int Scale = VT.getScalarSizeInBits() / 32;
	      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
	      V1 = DAG.getBitcast(BlendVT, V1);
	      V2 = DAG.getBitcast(BlendVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
	                          DAG.getConstant(BlendMask, DL, MVT::i8)));
	    }
	    LLVM_FALLTHROUGH;
	  case MVT::v8i16: {
	    // For integer shuffles we need to expand the mask and cast the inputs to
	    // v8i16s prior to blending.
	    int Scale = 8 / VT.getVectorNumElements();
	    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	    V1 = DAG.getBitcast(MVT::v8i16, V1);
	    V2 = DAG.getBitcast(MVT::v8i16, V2);
	    return DAG.getBitcast(VT,
	                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
	                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
	  }

	  case MVT::v16i16: {
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    SmallVector<int, 8> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
	      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	      // Rebuild the immediate from the 8-element per-lane mask.
	      BlendMask = 0;
	      for (int i = 0; i < 8; ++i)
	        if (RepeatedMask[i] >= 8)
	          BlendMask |= 1ull << i;
	      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	                         DAG.getConstant(BlendMask, DL, MVT::i8));
	    }
	    LLVM_FALLTHROUGH;
	  }
	  case MVT::v16i8:
	  case MVT::v32i8: {
	    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
	           "256-bit byte-blends require AVX2 support!");

	    // With BWI and VLX, hand the blend mask to getVectorMaskingNode as an
	    // integer constant with one bit per element (at least 8 bits wide).
	    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	      MVT IntegerType =
	          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	    }

	    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
	    if (SDValue Masked =
	            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	      return Masked;

	    // Scale the blend by the number of bytes per element.
	    int Scale = VT.getScalarSizeInBits() / 8;

	    // This form of blend is always done on bytes. Compute the byte vector
	    // type.
	    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

	    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
	    // mix of LLVM's code generator and the x86 backend. We tell the code
	    // generator that boolean values in the elements of an x86 vector register
	    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
	    // mapping a select to operand #1, and 'false' mapping to operand #2. The
	    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
	    // of the element (the remaining are ignored) and 0 in that high bit would
	    // mean operand #1 while 1 in the high bit would mean operand #2. So while
	    // the LLVM model for boolean values in vector elements gets the relevant
	    // bit set, it is set backwards and over constrained relative to x86's
	    // actual model.
	    SmallVector<SDValue, 32> VSELECTMask;
	    for (int i = 0, Size = Mask.size(); i < Size; ++i)
	      for (int j = 0; j < Scale; ++j)
	        VSELECTMask.push_back(
	            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
	                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
	                                          MVT::i8));

	    V1 = DAG.getBitcast(BlendVT, V1);
	    V2 = DAG.getBitcast(BlendVT, V2);
	    return DAG.getBitcast(
	        VT,
	        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
	                      V1, V2));
	  }
	  case MVT::v16f32:
	  case MVT::v8f64:
	  case MVT::v8i64:
	  case MVT::v16i32:
	  case MVT::v32i16:
	  case MVT::v64i8: {
	    // 512-bit types: pass the blend mask to getVectorMaskingNode as an
	    // integer constant with one bit per element (at least 8 bits wide).
	    MVT IntegerType =
	        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	  }
	  default:
	    llvm_unreachable("Not a supported integer vector type!");
	  }
	}

	/// \brief Try to lower as a blend of elements from two inputs followed by
	/// a single-input permutation.
	///
	/// This matches the pattern where we can blend elements from two inputs and
	/// then reduce the shuffle to a single-input permutation.
	///
	/// The blend keeps every referenced source element in its original position
	/// (index modulo the vector size); this fails if the same position is needed
	/// from both inputs. The permute then moves each blended element to its final
	/// place. Returns an empty SDValue when no such blend exists.
	static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
	                                                   SDValue V1, SDValue V2,
	                                                   ArrayRef<int> Mask,
	                                                   SelectionDAG &DAG) {
	  // We build up the blend mask while checking whether a blend is a viable way
	  // to reduce the shuffle.
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

	  int NumElts = Mask.size();
	  for (int Elt = 0; Elt != NumElts; ++Elt) {
	    int M = Mask[Elt];
	    if (M < 0)
	      continue;

	    assert(M < NumElts * 2 && "Shuffle input is out of bounds.");

	    // Position of the source element within whichever input it comes from.
	    int Slot = M % NumElts;
	    int &BlendElt = BlendMask[Slot];
	    if (BlendElt >= 0 && BlendElt != M)
	      return SDValue(); // Can't blend in the needed input!
	    BlendElt = M;

	    PermuteMask[Elt] = Slot;
	  }

	  SDValue Blended = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	  return DAG.getVectorShuffle(VT, DL, Blended, DAG.getUNDEF(VT), PermuteMask);
	}

	/// \brief Generic routine to decompose a shuffle and blend into independent
	/// blends and permutes.
	///
	/// This matches the extremely common pattern for handling combined
	/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
	/// operations. It will try to pick the best arrangement of shuffles and
	/// blends.
	///
	/// Two arrangements are considered: blend first and then permute the result
	/// (via lowerVectorShuffleAsBlendAndPermute), or permute each input
	/// separately and then blend the two permuted vectors.
	static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
	                                                          MVT VT, SDValue V1,
	                                                          SDValue V2,
	                                                          ArrayRef<int> Mask,
	                                                          SelectionDAG &DAG) {
	  // Shuffle the input elements into the desired positions in V1 and V2 and
	  // blend them together.
	  SmallVector<int, 32> V1Mask(Mask.size(), -1);
	  SmallVector<int, 32> V2Mask(Mask.size(), -1);
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);

	  int NumElts = Mask.size();
	  for (int Elt = 0; Elt != NumElts; ++Elt) {
	    int M = Mask[Elt];
	    if (M < 0)
	      continue; // Undef: leave all three masks undef here.

	    if (M < NumElts) {
	      V1Mask[Elt] = M;
	      BlendMask[Elt] = Elt;
	    } else {
	      V2Mask[Elt] = M - NumElts;
	      BlendMask[Elt] = Elt + NumElts;
	    }
	  }

	  // Try to lower with the simpler initial blend strategy unless one of the
	  // input shuffles would be a no-op. We prefer to shuffle inputs as the
	  // shuffle may be able to fold with a load or other benefit. However, when
	  // we'll have to do 2x as many shuffles in order to achieve this, blending
	  // first is a better strategy.
	  bool AnyInputIsNoop = isNoopShuffleMask(V1Mask) || isNoopShuffleMask(V2Mask);
	  if (!AnyInputIsNoop) {
	    SDValue BlendPerm =
	        lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
	    if (BlendPerm)
	      return BlendPerm;
	  }

	  SDValue PermutedV1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	  SDValue PermutedV2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	  return DAG.getVectorShuffle(VT, DL, PermutedV1, PermutedV2, BlendMask);
	}

	/// \brief Try to match a vector shuffle as a rotation.
	///
	/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
	///
	/// \param V1   in: first shuffle input; out: the value supplying the low
	///             part of the rotation (only written on success).
	/// \param V2   in: second shuffle input; out: the value supplying the high
	///             part of the rotation (only written on success).
	/// \param Mask shuffle mask; entries are undef or in [0, 2 * NumElts).
	/// \returns the rotation amount in elements, or -1 if the mask is not a
	///          (non-identity) rotation.
	static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask) {
	  int NumElts = Mask.size();

	  // We need to detect various ways of spelling a rotation:
	  // [11, 12, 13, 14, 15, 0, 1, 2]
	  // [-1, 12, 13, 14, -1, -1, 1, -1]
	  // [-1, -1, -1, -1, -1, -1, 1, 2]
	  // [ 3, 4, 5, 6, 7, 8, 9, 10]
	  // [-1, 4, 5, 6, -1, -1, 9, -1]
	  // [-1, 4, 5, 6, -1, -1, -1, -1]
	  int Rotation = 0;
	  SDValue Lo, Hi;
	  for (int i = 0; i < NumElts; ++i) {
	    int M = Mask[i];
	    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
	           "Unexpected mask index.");
	    if (M < 0)
	      continue;

	    // Determine where a rotated vector would have started.
	    int StartIdx = i - (M % NumElts);
	    if (StartIdx == 0)
	      // The identity rotation isn't interesting, stop.
	      return -1;

	    // If we found the tail of a vector the rotation must be the missing
	    // front. If we found the head of a vector, it must be how much of the
	    // head.
	    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

	    if (Rotation == 0)
	      Rotation = CandidateRotation;
	    else if (Rotation != CandidateRotation)
	      // The rotations don't match, so we can't match this mask.
	      return -1;

	    // Compute which value this mask is pointing at.
	    SDValue MaskV = M < NumElts ? V1 : V2;

	    // Compute which of the two target values this index should be assigned
	    // to. This reflects whether the high elements are remaining or the low
	    // elements are remaining.
	    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

	    // Either set up this value if we've not encountered it before, or check
	    // that it remains consistent.
	    if (!TargetV)
	      TargetV = MaskV;
	    else if (TargetV != MaskV)
	      // This may be a rotation, but it pulls from the inputs in some
	      // unsupported interleaving.
	      return -1;
	  }

	  // Check that we successfully analyzed the mask, and normalize the results.
	  assert(Rotation != 0 && "Failed to locate a viable rotation!");
	  assert((Lo || Hi) && "Failed to find a rotated input vector!");
	  // If only one half was referenced, use the same value for both halves.
	  if (!Lo)
	    Lo = Hi;
	  else if (!Hi)
	    Hi = Lo;

	  V1 = Lo;
	  V2 = Hi;

	  return Rotation;
	}

	/// \brief Try to lower a vector shuffle as a byte rotation.
	///
	/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
	/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
	/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
	/// try to generically lower a vector shuffle through such a pattern. It
	/// does not check for the profitability of lowering either as PALIGNR or
	/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
	/// This matches shuffle vectors that look like:
	///
	/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns the rotation in bytes, or -1 if the mask cannot be lowered this
	/// way. V1 and V2 are forwarded by reference to matchVectorShuffleAsRotate.
	static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
	                                          ArrayRef<int> Mask) {
	  // Don't accept any shuffles with zero elements.
	  for (int M : Mask)
	    if (M == SM_SentinelZero)
	      return -1;

	  // PALIGNR works on 128-bit lanes.
	  SmallVector<int, 16> LaneMask;
	  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	    return -1;

	  int EltRotation = matchVectorShuffleAsRotate(V1, V2, LaneMask);
	  if (EltRotation <= 0)
	    return -1;

	  // PALIGNR rotates bytes, so we need to scale the
	  // rotation based on how many bytes are in the vector lane.
	  int EltsPerLane = LaneMask.size();
	  int BytesPerElt = 16 / EltsPerLane;
	  return EltRotation * BytesPerElt;
	}

	/// Lower a shuffle as a byte rotation of the concatenation of two inputs.
	///
	/// Uses matchVectorShuffleAsByteRotate to find the byte rotation. With SSSE3
	/// a single X86ISD::PALIGNR is emitted; otherwise (128-bit only) the rotation
	/// is built from a VSHLDQ / VSRLDQ pair combined with OR. Returns an empty
	/// SDValue when the mask is not a byte rotation.
	static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
	                                              SDValue V1, SDValue V2,
	                                              ArrayRef<int> Mask,
	                                              const X86Subtarget &Subtarget,
	                                              SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

	  // Lo/Hi are updated by the matcher to the values feeding each half.
	  SDValue Lo = V1, Hi = V2;
	  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
	  if (ByteRotation <= 0)
	    return SDValue();

	  // Cast the inputs to i8 vector of correct length to match PALIGNR or
	  // PSLLDQ/PSRLDQ.
	  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	  Lo = DAG.getBitcast(ByteVT, Lo);
	  Hi = DAG.getBitcast(ByteVT, Hi);

	  // SSSE3 targets can use the palignr instruction.
	  if (Subtarget.hasSSSE3()) {
	    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
	           "512-bit PALIGNR requires BWI instructions");
	    return DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
	                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
	  }

	  assert(VT.is128BitVector() &&
	         "Rotate-based lowering only supports 128-bit lowering!");
	  assert(Mask.size() <= 16 &&
	         "Can shuffle at most 16 bytes in a 128-bit vector!");
	  assert(ByteVT == MVT::v16i8 &&
	         "SSE2 rotate lowering only needed for v16i8!");

	  // Default SSE2 implementation
	  int LoByteShift = 16 - ByteRotation;
	  int HiByteShift = ByteRotation;

	  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
	                                DAG.getConstant(LoByteShift, DL, MVT::i8));
	  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
	                                DAG.getConstant(HiByteShift, DL, MVT::i8));
	  return DAG.getBitcast(VT,
	                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
	}

	/// \brief Try to lower a vector shuffle as a dword/qword rotation.
	///
	/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
	/// rotation of the concatenation of two vectors; this routine will
	/// try to generically lower a vector shuffle through such a pattern.
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns an X86ISD::VALIGN node, or an empty SDValue when the mask is not a
	/// rotation.
	static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
	                                          SDValue V1, SDValue V2,
	                                          ArrayRef<int> Mask,
	                                          const X86Subtarget &Subtarget,
	                                          SelectionDAG &DAG) {
	  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
	         "Only 32-bit and 64-bit elements are supported!");

	  // 128/256-bit vectors are only supported with VLX.
	  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
	         && "VLX required for 128/256-bit vectors");

	  // Lo/Hi are updated by the matcher to the values feeding each half.
	  SDValue Lo = V1, Hi = V2;
	  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
	  if (Rotation <= 0)
	    return SDValue();

	  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
	                     DAG.getConstant(Rotation, DL, MVT::i8));
	}

	/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
	///
	/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
	/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
	/// matches elements from one of the input vectors shuffled to the left or
	/// right with zeroable elements 'shifted in'. It handles both the strictly
	/// bit-wise element shifts and the byte shift across an entire 128-bit double
	/// quad word lane.
	///
	/// PSLL : (little-endian) left bit shift.
	/// [ zz, 0, zz, 2 ]
	/// [ -1, 4, zz, -1 ]
	/// PSRL : (little-endian) right bit shift.
	/// [ 1, zz, 3, zz]
	/// [ -1, -1, 7, zz]
	/// PSLLDQ : (little-endian) left byte shift
	/// [ zz, 0, 1, 2, 3, 4, 5, 6]
	/// [ zz, zz, -1, -1, 2, 3, 4, -1]
	/// [ zz, zz, zz, zz, zz, zz, -1, 1]
	/// PSRLDQ : (little-endian) right byte shift
	/// [ 5, 6, 7, zz, zz, zz, zz, zz]
	/// [ -1, 5, 6, 7, zz, zz, zz, zz]
	/// [ 1, 2, -1, -1, -1, -1, zz, zz]
	///
	/// On a match, \p ShiftVT and \p Opcode are set to the vector type and the
	/// X86ISD shift opcode to use, and the shift amount is returned: in bits for
	/// the element shifts (VSHLI/VSRLI) and in bytes for the byte shifts
	/// (VSHLDQ/VSRLDQ). Returns -1 when nothing matches. \p MaskOffset is added
	/// to the expected source indices, so passing the mask size matches against
	/// the second shuffle input.
	static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
	unsigned ScalarSizeInBits,
	ArrayRef<int> Mask, int MaskOffset,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget) {
	int Size = Mask.size();
	unsigned SizeInBits = Size * ScalarSizeInBits;

	// Returns true if, in every group of Scale elements, the Shift elements
	// that would be shifted in (the first ones of the group for a left shift,
	// the last ones for a right shift) are all zeroable.
	auto CheckZeros = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i < Size; i += Scale)
	for (int j = 0; j < Shift; ++j)
	if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
	return false;

	return true;
	};

	// Checks that the remaining Scale - Shift elements of every group are
	// sequential (or undef) source elements. On success fills in Opcode and
	// ShiftVT and returns the shift amount; returns -1 on a mismatch.
	auto MatchShift = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i != Size; i += Scale) {
	unsigned Pos = Left ? i + Shift : i;
	unsigned Low = Left ? i : i + Shift;
	unsigned Len = Scale - Shift;
	if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
	return -1;
	}

	// Groups wider than 64 bits use the byte-shift opcodes, whose amount is
	// expressed in bytes rather than bits.
	int ShiftEltBits = ScalarSizeInBits * Scale;
	bool ByteShift = ShiftEltBits > 64;
	Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
	: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
	int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

	// Normalize the scale for byte shifts to still produce an i64 element
	// type.
	Scale = ByteShift ? Scale / 2 : Scale;

	// We need to round trip through the appropriate type for the shift.
	MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
	ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
	: MVT::getVectorVT(ShiftSVT, Size / Scale);
	return (int)ShiftAmt;
	};

	// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
	// keep doubling the size of the integer elements up to that. We can
	// then shift the elements of the integer vector by whole multiples of
	// their width within the elements of the larger integer vector. Test each
	// multiple to see if we can find a match with the moved element indices
	// and that the shifted in elements are all zeroable.
	// For 512-bit vectors without BWI the group width is capped at 64 bits, so
	// the byte-shift forms are never selected there.
	unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
	for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
	for (int Shift = 1; Shift != Scale; ++Shift)
	for (bool Left : {true, false})
	if (CheckZeros(Shift, Scale, Left)) {
	int ShiftAmt = MatchShift(Shift, Scale, Left);
	if (0 < ShiftAmt)
	return ShiftAmt;
	}

	// no match
	return -1;
	}

	/// Lower a shuffle as a logical shift of one of its inputs.
	///
	/// The mask is first matched against a shift of V1 (mask offset 0); if that
	/// fails it is matched against a shift of V2 (mask offset equal to the mask
	/// size). On a match the chosen input is bitcast to the shift type, shifted
	/// with the matched opcode and amount, and bitcast back to \p VT. Returns an
	/// empty SDValue if neither input matches.
	static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Mask,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  int NumElts = Mask.size();
	  assert(NumElts == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  MVT ShiftVT;
	  unsigned Opcode;
	  unsigned EltBits = VT.getScalarSizeInBits();

	  // Try to match shuffle against V1 shift.
	  SDValue Src = V1;
	  int ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask, 0,
	                                           Zeroable, Subtarget);

	  // If V1 failed, try to match shuffle against V2 shift.
	  if (ShiftAmt < 0) {
	    Src = V2;
	    ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                         NumElts, Zeroable, Subtarget);
	  }

	  if (ShiftAmt < 0)
	    return SDValue();

	  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
	         "Illegal integer vector type");
	  SDValue Cast = DAG.getBitcast(ShiftVT, Src);
	  SDValue Amt = DAG.getConstant(ShiftAmt, DL, MVT::i8);
	  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Cast, Amt);
	  return DAG.getBitcast(VT, Shifted);
	}

	// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
	// Remainder of lower half result is zero and upper half is all undef.
	//
	// On success V1 is replaced with the matched source, BitLen/BitIdx receive
	// the extraction length and start index in bits (each masked to 6 bits), and
	// true is returned. V2 is only read.
	static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask, uint64_t &BitLen,
	                                      uint64_t &BitIdx, const APInt &Zeroable) {
	  int Size = Mask.size();
	  int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
	  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

	  // Upper half must be undefined.
	  if (!isUndefInRange(Mask, HalfSize, HalfSize))
	    return false;

	  // Determine the extraction length from the part of the
	  // lower half that isn't zeroable.
	  int Len = HalfSize;
	  for (; Len > 0; --Len)
	    if (!Zeroable[Len - 1])
	      break;
	  assert(Len > 0 && "Zeroable shuffle mask");

	  // Attempt to match first Len sequential elements from the lower half.
	  SDValue Src;
	  int Idx = -1;
	  for (int i = 0; i != Len; ++i) {
	    int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    SDValue &V = (M < Size ? V1 : V2);
	    M = M % Size;

	    // The extracted elements must start at a valid index and all mask
	    // elements must be in the lower half.
	    if (i > M || M >= HalfSize)
	      return false;

	    // The first defined element fixes the source and start index; later
	    // ones must agree with both.
	    if (Idx < 0 || (Src == V && Idx == (M - i))) {
	      Src = V;
	      Idx = M - i;
	      continue;
	    }
	    return false;
	  }

	  if (!Src || Idx < 0)
	    return false;

	  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
	  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	  V1 = Src;
	  return true;
	}

	// INSERTQ: Extract lowest Len elements from lower half of second source and
	// insert over first source, starting at Idx.
	// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
	//
	// On success V1 is replaced with the base value (left empty if only undef
	// elements surround the insertion), V2 with the inserted value, BitLen and
	// BitIdx receive the insertion length and index in bits (each masked to 6
	// bits), and true is returned.
	static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
	                                        ArrayRef<int> Mask, uint64_t &BitLen,
	                                        uint64_t &BitIdx) {
	  int Size = Mask.size();
	  int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  // Upper half must be undefined.
	  if (!isUndefInRange(Mask, HalfSize, HalfSize))
	    return false;

	  // Try every insertion point in the lower half.
	  for (int Idx = 0; Idx != HalfSize; ++Idx) {
	    SDValue Base;

	    // Attempt to match first source from mask before insertion point.
	    if (isUndefInRange(Mask, 0, Idx)) {
	      /* EMPTY */
	    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
	      Base = V1;
	    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
	      Base = V2;
	    } else {
	      continue;
	    }

	    // Extend the extraction length looking to match both the insertion of
	    // the second source and the remaining elements of the first.
	    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
	      SDValue Insert;
	      int Len = Hi - Idx;

	      // Match insertion.
	      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
	        Insert = V1;
	      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
	        Insert = V2;
	      } else {
	        continue;
	      }

	      // Match the remaining elements of the lower half.
	      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
	        /* EMPTY */
	      } else if ((!Base || (Base == V1)) &&
	                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
	        Base = V1;
	      } else if ((!Base || (Base == V2)) &&
	                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
	                                            Size + Hi)) {
	        Base = V2;
	      } else {
	        continue;
	      }

	      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	      V1 = Base;
	      V2 = Insert;
	      return true;
	    }
	  }

	  return false;
	}

	/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
	///
	/// The EXTRQ pattern is tried first and lowered to X86ISD::EXTRQI; otherwise
	/// the INSERTQ pattern is tried and lowered to X86ISD::INSERTQI, substituting
	/// undef for any operand the matcher left empty. Returns an empty SDValue if
	/// neither pattern matches.
	static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  uint64_t BitLen, BitIdx;

	  bool IsExtract =
	      matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable);
	  if (IsExtract)
	    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
	                       DAG.getConstant(BitLen, DL, MVT::i8),
	                       DAG.getConstant(BitIdx, DL, MVT::i8));

	  bool IsInsert = matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx);
	  if (!IsInsert)
	    return SDValue();

	  return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
	                     V2 ? V2 : DAG.getUNDEF(VT),
	                     DAG.getConstant(BitLen, DL, MVT::i8),
	                     DAG.getConstant(BitIdx, DL, MVT::i8));
	}

	/// \brief Lower a vector shuffle as a zero or any extension.
	///
	/// Given a specific number of elements, element bit width, and extension
	/// stride, produce either a zero or any extension based on the available
	/// features of the subtarget. The extended elements are consecutive and can
	/// start from an offset element index in the input; to avoid excess
	/// shuffling the offset must either be in the bottom lane or at the start
	/// of a higher lane. All extended elements must be from the same lane.
	///
	/// \param Scale  extension factor (> 1); Scale * element bits must be <= 64.
	/// \param Offset index of the first input element to extend.
	/// \param AnyExt true if the extended bits may be undef rather than zero.
	/// \param InputV the vector whose elements are extended.
	/// \returns the extended value bitcast back to \p VT, or an empty SDValue in
	///          the one case this routine declines (SSE4.1, 128-bit, Scale == 2
	///          with a non-zero offset).
	static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
	    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(Scale > 1 && "Need a scale to extend.");
	  int EltBits = VT.getScalarSizeInBits();
	  int NumElements = VT.getVectorNumElements();
	  int NumEltsPerLane = 128 / EltBits;
	  int OffsetLane = Offset / NumEltsPerLane;
	  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
	         "Only 8, 16, and 32 bit elements can be extended.");
	  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
	  assert(0 <= Offset && "Extension offset must be positive.");
	  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
	         "Extension offset must be in the first lane or start an upper lane.");

	  // Check that an index is in same lane as the base offset.
	  auto SafeOffset = [&](int Idx) {
	    return OffsetLane == (Idx / NumEltsPerLane);
	  };

	  // Shift along an input so that the offset base moves to the first element.
	  auto ShuffleOffset = [&](SDValue V) {
	    if (!Offset)
	      return V;

	    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	    for (int i = 0; i * Scale < NumElements; ++i) {
	      int SrcIdx = i + Offset;
	      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
	    }
	    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
	  };

	  // Found a valid zext mask! Try various lowering strategies based on the
	  // input type and available ISA extensions.
	  if (Subtarget.hasSSE41()) {
	    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
	    // PUNPCK will catch this in a later shuffle match.
	    if (Offset && Scale == 2 && VT.is128BitVector())
	      return SDValue();
	    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
	                                 NumElements / Scale);
	    InputV = ShuffleOffset(InputV);
	    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
	    return DAG.getBitcast(VT, InputV);
	  }

	  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

	  // For any extends we can cheat for larger element sizes and use shuffle
	  // instructions that can fold with a load and/or copy.
	  if (AnyExt && EltBits == 32) {
	    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
	                         -1};
	    return DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	                        DAG.getBitcast(MVT::v4i32, InputV),
	                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	  }
	  if (AnyExt && EltBits == 16 && Scale > 2) {
	    int PSHUFDMask[4] = {Offset / 2, -1,
	                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
	    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	                         DAG.getBitcast(MVT::v4i32, InputV),
	                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	    int PSHUFWMask[4] = {1, -1, -1, -1};
	    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
	    return DAG.getBitcast(
	        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
	                        DAG.getBitcast(MVT::v8i16, InputV),
	                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
	  }

	  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
	  // to 64-bits.
	  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
	    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
	    assert(VT.is128BitVector() && "Unexpected vector width!");

	    int LoIdx = Offset * EltBits;
	    SDValue Lo = DAG.getBitcast(
	        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	                                DAG.getConstant(EltBits, DL, MVT::i8),
	                                DAG.getConstant(LoIdx, DL, MVT::i8)));

	    // A single extract suffices if the upper half of the result is undef or
	    // the next source element lies outside the offset's lane.
	    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
	        !SafeOffset(Offset + 1))
	      return DAG.getBitcast(VT, Lo);

	    int HiIdx = (Offset + 1) * EltBits;
	    SDValue Hi = DAG.getBitcast(
	        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	                                DAG.getConstant(EltBits, DL, MVT::i8),
	                                DAG.getConstant(HiIdx, DL, MVT::i8)));
	    return DAG.getBitcast(VT,
	                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
	  }

	  // If this would require more than 2 unpack instructions to expand, use
	  // pshufb when available. We can only use more than 2 unpack instructions
	  // when zero extending i8 elements which also makes it easier to use pshufb.
	  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
	    assert(NumElements == 16 && "Unexpected byte vector width!");
	    SDValue PSHUFBMask[16];
	    for (int i = 0; i < 16; ++i) {
	      int Idx = Offset + (i / Scale);
	      // Only the first byte of each extended element reads the source; every
	      // other byte gets the 0x80 selector.
	      PSHUFBMask[i] = DAG.getConstant(
	          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
	    }
	    InputV = DAG.getBitcast(MVT::v16i8, InputV);
	    return DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
	                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
	  }

	  // If we are extending from an offset, ensure we start on a boundary that
	  // we can unpack from.
	  int AlignToUnpack = Offset % (NumElements / Scale);
	  if (AlignToUnpack) {
	    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	    for (int i = AlignToUnpack; i < NumElements; ++i)
	      ShMask[i - AlignToUnpack] = i;
	    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
	    Offset -= AlignToUnpack;
	  }

	  // Otherwise emit a sequence of unpacks.
	  // Each iteration doubles the element width and halves the remaining scale.
	  do {
	    unsigned UnpackLoHi = X86ISD::UNPCKL;
	    if (Offset >= (NumElements / 2)) {
	      UnpackLoHi = X86ISD::UNPCKH;
	      Offset -= (NumElements / 2);
	    }

	    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
	    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
	                         : getZeroVector(InputVT, Subtarget, DAG, DL);
	    InputV = DAG.getBitcast(InputVT, InputV);
	    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
	    Scale /= 2;
	    EltBits *= 2;
	    NumElements /= 2;
	  } while (Scale > 1);
	  return DAG.getBitcast(VT, InputV);
	}

	/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
	///
	/// This routine will try to do everything in its power to cleverly lower
	/// a shuffle which happens to match the pattern of a zero extend. It doesn't
	/// check for the profitability of this lowering, it tries to aggressively
	/// match this pattern. It will use all of the micro-architectural details it
	/// can to emit an efficient lowering. It handles both blends with all-zero
	/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
	/// masking out later).
	///
	/// The reason we have dedicated lowering for zext-style shuffles is that they
	/// are both incredibly common and often quite performance sensitive.
	///
	/// Candidate extension scales are tried from the widest one (every extended
	/// element is 64 bits wide) down to a scale of 2. If none of them matches and
	/// \p VT is 128 bits wide, a last attempt is made to express the shuffle as a
	/// VZEXT_MOVL of the low 64 bits of one of the inputs.
	///
	/// \param DL       Location attached to the nodes created here.
	/// \param VT       Type of the shuffle; its elements must be at most 32 bits.
	/// \param V1       First input; selected by mask values below the element
	///                 count.
	/// \param V2       Second input; selected by the remaining mask values.
	/// \param Mask     Shuffle mask with one entry per element of \p VT; negative
	///                 entries are treated as "don't care".
	/// \param Zeroable Per-result-element flags that are tested for every
	///                 position that would hold extension bits.
	/// \returns the lowered value, or an empty SDValue when nothing matched.
	static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Bits = VT.getSizeInBits();
	int NumLanes = Bits / 128;
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = NumElements / NumLanes;
	assert(VT.getScalarSizeInBits() <= 32 &&
	"Exceeds 32-bit integer zero extension limit");
	assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
	
	// Define a helper function to check a particular ext-scale and lower to it if
	// valid.
	auto Lower = [&](int Scale) -> SDValue {
	// InputV is the single source vector all base elements must come from,
	// Offset is the source index feeding result element 0 (fixed by the first
	// defined base element), and Matches counts the defined base elements.
	SDValue InputV;
	bool AnyExt = true;
	int Offset = 0;
	int Matches = 0;
	for (int i = 0; i < NumElements; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue; // Valid anywhere but doesn't tell us anything.
	if (i % Scale != 0) {
	// Each of the extended elements need to be zeroable.
	if (!Zeroable[i])
	return SDValue();
	
	// We no longer are in the anyext case.
	AnyExt = false;
	continue;
	}
	
	// Each of the base elements needs to be consecutive indices into the
	// same input vector.
	SDValue V = M < NumElements ? V1 : V2;
	M = M % NumElements;
	if (!InputV) {
	InputV = V;
	Offset = M - (i / Scale);
	} else if (InputV != V)
	return SDValue(); // Flip-flopping inputs.
	
	// Offset must start in the lowest 128-bit lane or at the start of an
	// upper lane.
	// FIXME: Is it ever worth allowing a negative base offset?
	if (!((0 <= Offset && Offset < NumEltsPerLane) \|\|
	(Offset % NumEltsPerLane) == 0))
	return SDValue();
	
	// If we are offsetting, all referenced entries must come from the same
	// lane.
	if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
	return SDValue();
	
	if ((M % NumElements) != (Offset + (i / Scale)))
	return SDValue(); // Non-consecutive strided elements.
	Matches++;
	}
	
	// If we fail to find an input, we have a zero-shuffle which should always
	// have already been handled.
	// FIXME: Maybe handle this here in case during blending we end up with one?
	if (!InputV)
	return SDValue();
	
	// If we are offsetting, don't extend if we only match a single input, we
	// can always do better by using a basic PSHUF or PUNPCK.
	if (Offset != 0 && Matches < 2)
	return SDValue();
	
	return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
	};
	
	// The widest scale possible for extending is to a 64-bit integer.
	assert(Bits % 64 == 0 &&
	"The number of bits in a vector must be divisible by 64 on x86!");
	int NumExtElements = Bits / 64;
	
	// Each iteration, try extending the elements half as much, but into twice as
	// many elements.
	for (; NumExtElements < NumElements; NumExtElements *= 2) {
	assert(NumElements % NumExtElements == 0 &&
	"The input vector size must be divisible by the extended size.");
	if (SDValue V = Lower(NumElements / NumExtElements))
	return V;
	}
	
	// General extends failed, but 128-bit vectors may be able to use MOVQ.
	if (Bits != 128)
	return SDValue();
	
	// Returns one of the source operands if the shuffle can be reduced to a
	// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
	auto CanZExtLowHalf = [&]() {
	// The whole upper half of the result has to be zeroable.
	for (int i = NumElements / 2; i != NumElements; ++i)
	if (!Zeroable[i])
	return SDValue();
	// The lower half must be the in-order low half of V1 (indices from 0) or
	// of V2 (indices from NumElements), allowing undef entries.
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
	return V1;
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
	return V2;
	return SDValue();
	};
	
	if (SDValue V = CanZExtLowHalf()) {
	V = DAG.getBitcast(MVT::v2i64, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
	return DAG.getBitcast(VT, V);
	}
	
	// No viable ext lowering found.
	return SDValue();
	}

	/// \brief Find the scalar that defines element \p Idx of vector \p V, if any.
	///
	/// Bitcasts on \p V are looked through first. A scalar is only produced when
	/// the value underneath is still a vector with the same element width and is
	/// either a BUILD_VECTOR or (for index 0) a SCALAR_TO_VECTOR whose operand is
	/// exactly as wide as the requested element. The operand is returned bitcast
	/// to the element type of \p V; otherwise an empty SDValue is returned.
	static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
	                                              SelectionDAG &DAG) {
	  const MVT OrigVT = V.getSimpleValueType();
	  const MVT ScalarVT = OrigVT.getVectorElementType();
	
	  V = peekThroughBitcasts(V);
	
	  // A bitcast that changed the element width means element Idx no longer
	  // corresponds to a single element of the underlying value.
	  const MVT SrcVT = V.getSimpleValueType();
	  const bool SameEltWidth =
	      SrcVT.isVector() &&
	      SrcVT.getScalarSizeInBits() == OrigVT.getScalarSizeInBits();
	  if (!SameEltWidth)
	    return SDValue();
	
	  const unsigned Opc = V.getOpcode();
	  const bool HasScalarOperand =
	      Opc == ISD::BUILD_VECTOR || (Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR);
	  if (!HasScalarOperand)
	    return SDValue();
	
	  // The operand must already have the destination width.
	  // FIXME: Add support for scalar truncation where possible.
	  SDValue Scalar = V.getOperand(Idx);
	  if (ScalarVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
	    return SDValue();
	
	  return DAG.getBitcast(ScalarVT, Scalar);
	}

	/// \brief Return true if \p V, ignoring bitcasts, is a non-extending load.
	///
	/// Shuffle lowering cares about this because the set of usable instructions
	/// differs significantly depending on whether an operand is a load.
	static bool isShuffleFoldableLoad(SDValue V) {
	  SDValue Src = peekThroughBitcasts(V);
	  return ISD::isNON_EXTLoad(Src.getNode());
	}

	/// \brief Try to lower insertion of a single element into a zero vector.
	///
	/// This is a common pattern that we have especially efficient patterns to lower
	/// across all subtarget feature sets.
	///
	/// The first mask entry that refers to \p V2 identifies the inserted element.
	/// If every other result element is zeroable, the element is moved into the
	/// low position with VZEXT_MOVL and then shuffled or byte-shifted to its
	/// destination. Otherwise only the floating point "replace element 0 of an
	/// untouched 128-bit V1" case is handled, using MOVSS/MOVSD.
	///
	/// \param DL       Location attached to the nodes created here.
	/// \param VT       Type of the shuffle.
	/// \param V1       Vector being inserted into.
	/// \param V2       Vector providing the inserted element.
	/// \param Mask     Shuffle mask; entries >= Mask.size() refer to \p V2.
	/// \param Zeroable Per-result-element flags; set when the element may be zero.
	/// \returns the lowered value, or an empty SDValue if the pattern does not
	///          apply (including when the mask has no \p V2 entry at all).
	static SDValue lowerVectorShuffleAsElementInsertion(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, const X86Subtarget &Subtarget,
	    SelectionDAG &DAG) {
	  MVT ExtVT = VT;
	  MVT EltVT = VT.getVectorElementType();
	
	  int V2Index =
	      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
	      Mask.begin();
	
	  // Without a V2 entry there is nothing to insert. Bail out here so that
	  // Mask[V2Index] below never reads one past the end of the mask.
	  if (V2Index == (int)Mask.size())
	    return SDValue();
	
	  bool IsV1Zeroable = true;
	  for (int i = 0, Size = Mask.size(); i < Size; ++i)
	    if (i != V2Index && !Zeroable[i]) {
	      IsV1Zeroable = false;
	      break;
	    }
	
	  // Check for a single input from a SCALAR_TO_VECTOR node.
	  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
	  // all the smarts here sunk into that routine. However, the current
	  // lowering of BUILD_VECTOR makes that nearly impossible until the old
	  // vector shuffle lowering is dead.
	  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
	                                               DAG);
	  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
	    // We need to zext the scalar if it is smaller than an i32.
	    V2S = DAG.getBitcast(EltVT, V2S);
	    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
	      // Using zext to expand a narrow element won't work for non-zero
	      // insertions.
	      if (!IsV1Zeroable)
	        return SDValue();
	
	      // Zero-extend directly to i32.
	      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
	      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
	    }
	    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
	  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
	             EltVT == MVT::i16) {
	    // Either not inserting from the low element of the input or the input
	    // element size is too small to use VZEXT_MOVL to clear the high bits.
	    return SDValue();
	  }
	
	  if (!IsV1Zeroable) {
	    // If V1 can't be treated as a zero vector we have fewer options to lower
	    // this. We can't support integer vectors or non-zero targets cheaply, and
	    // the V1 elements can't be permuted in any way.
	    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
	    if (!VT.isFloatingPoint() || V2Index != 0)
	      return SDValue();
	    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
	    V1Mask[V2Index] = -1;
	    if (!isNoopShuffleMask(V1Mask))
	      return SDValue();
	    if (!VT.is128BitVector())
	      return SDValue();
	
	    // Otherwise, use MOVSD or MOVSS.
	    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
	           "Only two types of floating point element types to handle!");
	    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
	                       ExtVT, V1, V2);
	  }
	
	  // This lowering only works for the low element with floating point vectors.
	  if (VT.isFloatingPoint() && V2Index != 0)
	    return SDValue();
	
	  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
	  if (ExtVT != VT)
	    V2 = DAG.getBitcast(VT, V2);
	
	  if (V2Index != 0) {
	    // If we have 4 or fewer lanes we can cheaply shuffle the element into
	    // the desired position. Otherwise it is more efficient to do a vector
	    // shift left. We know that we can do a vector shift left because all
	    // the inputs are zero.
	    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
	      // Every lane reads element 1 of the VZEXT_MOVL result except the
	      // destination lane, which reads the inserted element 0.
	      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
	      V2Shuffle[V2Index] = 0;
	      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
	    } else {
	      // Shift by whole bytes: V2Index elements of EltVT.getSizeInBits() / 8
	      // bytes each.
	      V2 = DAG.getBitcast(MVT::v16i8, V2);
	      V2 = DAG.getNode(
	          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
	          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
	                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
	                              DAG.getDataLayout(), VT)));
	      V2 = DAG.getBitcast(VT, V2);
	    }
	  }
	  return V2;
	}

	/// Try to lower the broadcast of one integer element that is a truncated
	/// piece of a wider scalar feeding the scalar_to_vector/build_vector node
	/// \p V0.
	///
	/// \p BroadcastIdx counts elements of \p VT. The wider scalar containing it is
	/// shifted right when the piece is not its least significant part, truncated
	/// to the element type of \p VT and broadcast with VBROADCAST. An empty
	/// SDValue is returned when \p V0 has non-integer or not-wider elements, or is
	/// not a suitable scalar_to_vector/build_vector.
	///
	/// This assumes we have AVX2.
	static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
	                                                  SDValue V0, int BroadcastIdx,
	                                                  const X86Subtarget &Subtarget,
	                                                  SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "We can only lower integer broadcasts with AVX2!");
	
	  EVT DstEltVT = VT.getVectorElementType();
	  EVT SrcVT = V0.getValueType();
	
	  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
	  assert(SrcVT.isVector() && "Unexpected non-vector vector-sized value!");
	
	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  if (!SrcEltVT.isInteger())
	    return SDValue();
	
	  const unsigned DstBits = DstEltVT.getSizeInBits();
	  const unsigned SrcBits = SrcEltVT.getSizeInBits();
	
	  // Only a strictly wider source element makes this a truncation.
	  if (SrcBits <= DstBits)
	    return SDValue();
	
	  assert(((SrcBits % DstBits) == 0) &&
	         "Scalar type sizes must all be powers of 2 on x86!");
	
	  // Ratio tells how many result-sized pieces each source element holds;
	  // SrcIdx is the source element that contains the broadcast piece.
	  const unsigned Ratio = SrcBits / DstBits;
	  const unsigned SrcIdx = BroadcastIdx / Ratio;
	
	  const unsigned Opc = V0.getOpcode();
	  const bool IsBuildVector = Opc == ISD::BUILD_VECTOR;
	  const bool IsLowScalarToVector =
	      Opc == ISD::SCALAR_TO_VECTOR && SrcIdx == 0;
	  if (!IsBuildVector && !IsLowScalarToVector)
	    return SDValue();
	
	  SDValue Wide = V0.getOperand(SrcIdx);
	
	  // If we're extracting non-least-significant bits, shift so we can truncate.
	  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
	  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
	  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
	  const int PieceIdx = BroadcastIdx % Ratio;
	  if (PieceIdx) {
	    SDValue ShAmt =
	        DAG.getConstant(PieceIdx * DstBits, DL, Wide.getValueType());
	    Wide = DAG.getNode(ISD::SRL, DL, Wide.getValueType(), Wide, ShAmt);
	  }
	
	  SDValue Narrow = DAG.getNode(ISD::TRUNCATE, DL, DstEltVT, Wide);
	  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Narrow);
	}

	/// \brief Try to lower broadcast of a single element.
	///
	/// For convenience, this code also bundles all of the subtarget feature set
	/// filtering. While a little annoying to re-dispatch on type here, there isn't
	/// a convenient way to factor it out.
	///
	/// The mask must be equivalent to a splat of one element of \p V1. The routine
	/// then walks up through bitcasts, CONCAT_VECTORS and INSERT_SUBVECTOR nodes,
	/// tracking the broadcast index, to find the narrowest value that provides the
	/// element, and emits MOVDDUP or VBROADCAST from a scalar, a narrowed load or
	/// the low 128 bits of a register.
	///
	/// \param DL        Location attached to the nodes created here.
	/// \param VT        Type of the shuffle.
	/// \param V1        First shuffle input; the broadcast element comes from it.
	/// \param V2        Second shuffle input; only used for the mask equivalence
	///                  check.
	/// \param Mask      Shuffle mask with one entry per element of \p VT.
	/// \param Subtarget Feature set used to pick the opcode and legality.
	/// \returns the lowered value bitcast to \p VT, or an empty SDValue if the
	///          shuffle is not a broadcast that can be lowered here.
	static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             const X86Subtarget &Subtarget,
	                                             SelectionDAG &DAG) {
	  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
	        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
	        (Subtarget.hasAVX2() && VT.isInteger())))
	    return SDValue();
	
	  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
	  // we can only broadcast from a register with AVX2.
	  unsigned NumElts = Mask.size();
	  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
	                        ? X86ISD::MOVDDUP
	                        : X86ISD::VBROADCAST;
	  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
	
	  // Check that the mask is a broadcast.
	  int BroadcastIdx = -1;
	  for (int i = 0; i != (int)NumElts; ++i) {
	    SmallVector<int, 8> BroadcastMask(NumElts, i);
	    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
	      BroadcastIdx = i;
	      break;
	    }
	  }
	
	  if (BroadcastIdx < 0)
	    return SDValue();
	  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
	                                            "a sorted mask where the broadcast "
	                                            "comes from V1.");
	
	  // Go up the chain of (vector) values to find a scalar load that we can
	  // combine with the broadcast.
	  SDValue V = V1;
	  for (;;) {
	    switch (V.getOpcode()) {
	    case ISD::BITCAST: {
	      // Peek through bitcasts as long as BroadcastIdx can be adjusted.
	      SDValue VSrc = V.getOperand(0);
	      unsigned NumEltBits = V.getScalarValueSizeInBits();
	      unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
	      if ((NumEltBits % NumSrcBits) == 0)
	        BroadcastIdx *= (NumEltBits / NumSrcBits);
	      else if ((NumSrcBits % NumEltBits) == 0 &&
	               (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
	        BroadcastIdx /= (NumSrcBits / NumEltBits);
	      else
	        break;
	      V = VSrc;
	      continue;
	    }
	    case ISD::CONCAT_VECTORS: {
	      // BroadcastIdx counts elements of V itself. Once a bitcast or an
	      // INSERT_SUBVECTOR operand has been looked through, V need not have
	      // Mask.size() elements any more, so take the per-operand element count
	      // from the concatenated operands rather than from the mask.
	      int OperandSize =
	          (int)V.getOperand(0).getValueType().getVectorNumElements();
	      V = V.getOperand(BroadcastIdx / OperandSize);
	      BroadcastIdx %= OperandSize;
	      continue;
	    }
	    case ISD::INSERT_SUBVECTOR: {
	      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
	      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
	      if (!ConstantIdx)
	        break;
	
	      // Continue with the inserted subvector if it covers the broadcast
	      // element, otherwise with the vector that was inserted into.
	      int BeginIdx = (int)ConstantIdx->getZExtValue();
	      int EndIdx =
	          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
	      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
	        BroadcastIdx -= BeginIdx;
	        V = VInner;
	      } else {
	        V = VOuter;
	      }
	      continue;
	    }
	    }
	    // Reached when the switch was left without `continue`: stop walking.
	    break;
	  }
	
	  // Ensure the source vector and BroadcastIdx are for a suitable type.
	  if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
	    unsigned NumEltBits = VT.getScalarSizeInBits();
	    unsigned NumSrcBits = V.getScalarValueSizeInBits();
	    if ((NumSrcBits % NumEltBits) == 0)
	      BroadcastIdx *= (NumSrcBits / NumEltBits);
	    else if ((NumEltBits % NumSrcBits) == 0 &&
	             (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
	      BroadcastIdx /= (NumEltBits / NumSrcBits);
	    else
	      return SDValue();
	
	    unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
	    MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
	    V = DAG.getBitcast(SrcVT, V);
	  }
	
	  // Check if this is a broadcast of a scalar. We special case lowering
	  // for scalars so that we can more effectively fold with loads.
	  // First, look through bitcast: if the original value has a larger element
	  // type than the shuffle, the broadcast element is in essence truncated.
	  // Make that explicit to ease folding.
	  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
	    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
	            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
	      return TruncBroadcast;
	
	  MVT BroadcastVT = VT;
	
	  // Peek through any bitcast (only useful for loads).
	  SDValue BC = peekThroughBitcasts(V);
	
	  // Also check the simpler case, where we can directly reuse the scalar.
	  if (V.getOpcode() == ISD::BUILD_VECTOR ||
	      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
	    V = V.getOperand(BroadcastIdx);
	
	    // If we can't broadcast from a register, check that the input is a load.
	    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
	      return SDValue();
	  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
	    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
	      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
	      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
	                   ? X86ISD::MOVDDUP
	                   : Opcode;
	    }
	
	    // If we are broadcasting a load that is only used by the shuffle
	    // then we can reduce the vector load to the broadcasted scalar load.
	    LoadSDNode *Ld = cast<LoadSDNode>(BC);
	    SDValue BaseAddr = Ld->getOperand(1);
	    EVT SVT = BroadcastVT.getScalarType();
	    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
	    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
	                    DAG.getMachineFunction().getMachineMemOperand(
	                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	    DAG.makeEquivalentMemoryOrdering(Ld, V);
	  } else if (!BroadcastFromReg) {
	    // We can't broadcast from a vector register.
	    return SDValue();
	  } else if (BroadcastIdx != 0) {
	    // We can only broadcast from the zero-element of a vector register,
	    // but it can be advantageous to broadcast from the zero-element of a
	    // subvector.
	    if (!VT.is256BitVector() && !VT.is512BitVector())
	      return SDValue();
	
	    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
	    if (VT == MVT::v4f64 || VT == MVT::v4i64)
	      return SDValue();
	
	    // Only broadcast the zero-element of a 128-bit subvector.
	    unsigned EltSize = VT.getScalarSizeInBits();
	    if (((BroadcastIdx * EltSize) % 128) != 0)
	      return SDValue();
	
	    // The shuffle input might have been a bitcast we looked through; look at
	    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
	    // later bitcast it to BroadcastVT.
	    assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
	           "Unexpected vector size");
	    V = extract128BitVector(V, BroadcastIdx, DAG, DL);
	  }
	
	  // MOVDDUP takes a vector operand, so wrap a scalar source in a v2f64.
	  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
	    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	                    DAG.getBitcast(MVT::f64, V));
	
	  // Bitcast back to the same scalar type as BroadcastVT.
	  MVT SrcVT = V.getSimpleValueType();
	  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
	    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    if (SrcVT.isVector()) {
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
	    } else {
	      SrcVT = BroadcastVT.getScalarType();
	    }
	    V = DAG.getBitcast(SrcVT, V);
	  }
	
	  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
	    V = DAG.getBitcast(MVT::f64, V);
	    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
	    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
	  }
	
	  // We only support broadcasting from 128-bit vectors to minimize the
	  // number of patterns we need to deal with in isel. So extract down to
	  // 128-bits, removing as many bitcasts as possible.
	  if (SrcVT.getSizeInBits() > 128) {
	    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
	                                 128 / SrcVT.getScalarSizeInBits());
	    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
	    V = DAG.getBitcast(ExtVT, V);
	  }
	
	  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
	}

	// Decide whether a 4-element shuffle can be performed by a single INSERTPS.
	// INSERTPS is only used when the V1 elements already sit in their final
	// positions, because otherwise two SHUFPS instructions always work and are
	// much smaller to encode than a SHUFPS plus an INSERTPS. It is also usable
	// when a single V1 element is out of place and all V2 elements are zeroable.
	//
	// On success V1, V2 and InsertPSMask are overwritten with the operands and
	// immediate to use and true is returned; on failure they are left untouched.
	static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
	                                         unsigned &InsertPSMask,
	                                         const APInt &Zeroable,
	                                         ArrayRef<int> Mask,
	                                         SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	
	  // Try to express the shuffle as "Base with one element inserted", where the
	  // inserted element comes from either Base or Other. Indices 0-3 of Lanes
	  // refer to Base and 4-7 to Other.
	  auto TryMatch = [&](SDValue Base, SDValue Other,
	                      ArrayRef<int> Lanes) -> bool {
	    unsigned ZeroBits = 0;       // Lanes the instruction should zero.
	    int DstLane = -1;            // Destination of the one inserted element.
	    bool InsertFromBase = false; // Inserted element is a misplaced Base lane.
	    bool BaseKeptInPlace = false;
	
	    for (int Lane = 0; Lane != 4; ++Lane) {
	      // Zeroable lanes (which include undefs) go into the zero mask.
	      if (Zeroable[Lane]) {
	        ZeroBits |= 1 << Lane;
	        continue;
	      }
	
	      // A Base element that is already where it belongs.
	      if (Lanes[Lane] == Lane) {
	        BaseKeptInPlace = true;
	        continue;
	      }
	
	      // Only one element that is neither zeroable nor in place is allowed.
	      if (DstLane >= 0)
	        return false;
	
	      DstLane = Lane;
	      InsertFromBase = Lanes[Lane] < 4;
	    }
	
	    // Nothing to insert: not worth an INSERTPS.
	    if (DstLane < 0)
	      return false;
	
	    // The source index is relative to the vector the element is taken from,
	    // not to the concatenation of both inputs.
	    unsigned SrcLane;
	    if (InsertFromBase) {
	      // The misplaced Base element is inserted from Base itself, so the
	      // original second operand is not needed at all.
	      SrcLane = Lanes[DstLane];
	      Other = Base;
	    } else {
	      SrcLane = Lanes[DstLane] - 4;
	    }
	
	    // With no Base element kept in place the result depends only on the zero
	    // mask and the insertion, so drop the dependency on Base.
	    if (!BaseKeptInPlace)
	      Base = DAG.getUNDEF(MVT::v4f32);
	
	    // Publish the operands and the immediate.
	    V1 = Base;
	    V2 = Other;
	    InsertPSMask = SrcLane << 6 | DstLane << 4 | ZeroBits;
	    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	    return true;
	  };
	
	  if (TryMatch(V1, V2, Mask))
	    return true;
	
	  // Swap the roles of the two inputs and try once more.
	  SmallVector<int, 4> SwappedMask(Mask.begin(), Mask.end());
	  ShuffleVectorSDNode::commuteMask(SwappedMask);
	  return TryMatch(V2, V1, SwappedMask);
	}

	/// Lower a v4f32 shuffle to a single INSERTPS node when
	/// matchVectorShuffleAsInsertPS accepts it; otherwise return an empty SDValue.
	static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            const APInt &Zeroable,
	                                            SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	
	  // The matcher rewrites V1/V2 and fills in the immediate on success.
	  unsigned Imm;
	  const bool Matched =
	      matchVectorShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG);
	  if (Matched) {
	    SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
	    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
	  }
	
	  return SDValue();
	}

	/// \brief Try to lower a shuffle as a permute of the inputs followed by an
	/// UNPCK instruction.
	///
	/// This specifically targets cases where we end up with alternating between
	/// the two inputs, and so can permute them into something that feeds a single
	/// UNPCK instruction. Note that this routine only targets integer vectors
	/// because for floating point vectors we have a generalized SHUFPS lowering
	/// strategy that handles everything that doesn't exactly match an unpack,
	/// making this clever lowering unnecessary.
	///
	/// If no permute-then-unpack form fits and every used element comes from the
	/// same half of the inputs, the inputs are unpacked first and the result is
	/// permuted instead.
	///
	/// \param DL   Location attached to the nodes created here.
	/// \param VT   Type of the shuffle; must be a 128-bit integer vector.
	/// \param V1   First input; must feed the even unpack slots.
	/// \param V2   Second input; must not be undef.
	/// \param Mask Shuffle mask; entries >= Mask.size() refer to \p V2 and
	///             negative entries are skipped.
	/// \returns the lowered value, or an empty SDValue if no form applies.
	static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(!VT.isFloatingPoint() &&
	"This routine only supports integer vectors.");
	assert(VT.is128BitVector() &&
	"This routine only works on 128-bit vectors.");
	assert(!V2.isUndef() &&
	"This routine should only be used when blending two inputs.");
	assert(Mask.size() >= 2 && "Single element masks are invalid.");
	
	int Size = Mask.size();
	
	// Count how many mask entries read from the low and from the high half of
	// their input (either input; the index is reduced modulo Size).
	int NumLoInputs =
	count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
	int NumHiInputs =
	count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
	
	// Unpack from whichever half supplies at least as many elements.
	bool UnpackLo = NumLoInputs >= NumHiInputs;
	
	// Try to permute each input so that one unpack of ScalarSize-bit elements,
	// each made of Scale original elements, yields the requested shuffle.
	auto TryUnpack = [&](int ScalarSize, int Scale) {
	SmallVector<int, 16> V1Mask((unsigned)Size, -1);
	SmallVector<int, 16> V2Mask((unsigned)Size, -1);
	
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;
	
	// Each element of the unpack contains Scale elements from this mask.
	int UnpackIdx = i / Scale;
	
	// We only handle the case where V1 feeds the first slots of the unpack.
	// We rely on canonicalization to ensure this is the case.
	if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
	return SDValue();
	
	// Setup the mask for this input. The indexing is tricky as we have to
	// handle the unpack stride.
	SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
	VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
	Mask[i] % Size;
	}
	
	// If we will have to shuffle both inputs to use the unpack, check whether
	// we can just unpack first and shuffle the result. If so, skip this unpack.
	if ((NumLoInputs == 0 \|\| NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
	!isNoopShuffleMask(V2Mask))
	return SDValue();
	
	// Shuffle the inputs into place.
	// Note: this overwrites the enclosing function's V1/V2 (captured by
	// reference), which is harmless because the lambda returns right after.
	V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	
	// Cast the inputs to the type we will use to unpack them.
	MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
	V1 = DAG.getBitcast(UnpackVT, V1);
	V2 = DAG.getBitcast(UnpackVT, V2);
	
	// Unpack the inputs and cast the result back to the desired type.
	return DAG.getBitcast(
	VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	UnpackVT, V1, V2));
	};
	
	// We try each unpack from the largest to the smallest to try and find one
	// that fits this mask.
	int OrigScalarSize = VT.getScalarSizeInBits();
	for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
	if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
	return Unpack;
	
	// If none of the unpack-rooted lowerings worked (or were profitable) try an
	// initial unpack.
	if (NumLoInputs == 0 \|\| NumHiInputs == 0) {
	assert((NumLoInputs > 0 \|\| NumHiInputs > 0) &&
	"We have to have some inputs!");
	// Index of the first element of the half that all inputs come from.
	int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
	
	// FIXME: We could consider the total complexity of the permute of each
	// possible unpacking. Or at the least we should consider how many
	// half-crossings are created.
	// FIXME: We could consider commuting the unpacks.
	
	SmallVector<int, 32> PermMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;
	
	assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
	
	// After the unpack, element k of the chosen half of V1 sits at position
	// 2*k and the matching V2 element at 2*k + 1.
	PermMask[i] =
	2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
	}
	return DAG.getVectorShuffle(
	VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
	DL, VT, V1, V2),
	DAG.getUNDEF(VT), PermMask);
	}
	
	return SDValue();
	}

	/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
	///
	/// This is the basis function for the 2-lane 64-bit shuffles as we have full
	/// support for floating point shuffles but not integer shuffles. These
	/// instructions will incur a domain crossing penalty on some chips though so
	/// it is better to avoid lowering through this for integer vectors where
	/// possible.
	///
	/// Lowerings are attempted in a fixed order and the first that succeeds is
	/// returned; the final SHUFP fallback always produces a node.
	///
	/// \param DL       Location attached to the nodes created here.
	/// \param Mask     Two-entry shuffle mask; values 2 and 3 refer to \p V2.
	/// \param Zeroable Per-result-element flags forwarded to the helper lowerings.
	/// \param V1       First v2f64 input.
	/// \param V2       Second v2f64 input, possibly undef.
	static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
	
	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;
	
	// Straight shuffle of a single input vector. Simulate this by using the
	// single input as both of the "inputs" to this instruction.
	// Bit i of the immediate is set when result element i reads element 1.
	unsigned SHUFPDMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1);
	
	if (Subtarget.hasAVX()) {
	// If we have AVX, we can use VPERMILPD which will allow folding a load
	// into the shuffle.
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}
	
	// An undef mask entry lets the corresponding SHUFP operand be undef too.
	return DAG.getNode(
	X86ISD::SHUFP, DL, MVT::v2f64,
	Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}
	assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
	assert(Mask[1] >= 2 && "Non-canonicalized blend!");
	
	// If we have a single input, insert that into V1 if we can do so cheaply.
	if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR with 2 flips which input a defined mask entry refers to.
	int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
	Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;
	}
	
	// Try to use one of the special instruction patterns to handle two common
	// blend patterns if a zero-blend above didn't work.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) \|\|
	isShuffleEquivalent(V1, V2, Mask, {1, 3}))
	if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
	// We can either use a special instruction to load over the low double or
	// to move just the low double.
	return DAG.getNode(
	isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
	DL, MVT::v2f64, V2,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
	
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;
	
	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
	return V;
	
	// Generic fallback: bit 0 picks the V1 element, bit 1 picks the V2 element
	// (Mask[1] is rebased by 2 to index within V2).
	unsigned SHUFPDMask = (Mask[0] == 1) \| (((Mask[1] - 2) == 1) << 1);
	return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	/// \brief Lower a shuffle of two 64-bit integer lanes.
	///
	/// A sequence of lowering strategies is attempted in a fixed order and the
	/// first one that produces a node wins. If none applies, the operands are
	/// bitcast to v2f64 and a floating point vector shuffle is emitted, with the
	/// result bitcast back to v2i64.
	///
	/// \p Mask must contain exactly two entries. When \p V2 is undef the shuffle is
	/// handled as a single-input shuffle; otherwise entry 0 must select from \p V1,
	/// entry 1 must select from \p V2, and neither may be undef (all asserted).
	static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	  // Single-input case: either a broadcast or one PSHUFD.
	  if (V2.isUndef()) {
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(
	            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // The shuffle is emitted as a v4i32 PSHUFD, so every 64-bit mask entry
	    // becomes a pair of adjacent 32-bit indices. An undef entry (-1) is
	    // clamped to element 0.
	    SDValue Src = DAG.getBitcast(MVT::v4i32, V1);
	    int WidenedMask[4];
	    for (int Lane = 0; Lane != 2; ++Lane) {
	      int Base = std::max(Mask[Lane], 0) * 2;
	      WidenedMask[2 * Lane] = Base;
	      WidenedMask[2 * Lane + 1] = Base + 1;
	    }
	    SDValue Imm = getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG);
	    SDValue Shuffled = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, Src, Imm);
	    return DAG.getBitcast(MVT::v2i64, Shuffled);
	  }

	  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	  // First choice: a shift instruction.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(
	          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Next: inserting a single element, which is often cheap when that
	  // element was loaded as a scalar.
	  if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // A v2 mask cannot be reliably sorted one way or the other, so retry the
	  // insertion with the operands swapped. Flipping bit 1 of each index makes
	  // it refer to the same element of the other input.
	  int CommutedMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
	  if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	          DL, MVT::v2i64, V2, V1, CommutedMask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // Every blend-based path below must be guarded by this one predicate.
	  const bool HasBlend = Subtarget.hasSSE41();
	  if (HasBlend) {
	    if (SDValue Blended = lowerVectorShuffleAsBlend(
	            DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Rotations are only attempted from SSSE3 on; before that it is more
	  // profitable to use shuffles/unpacks.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue Rotated = lowerVectorShuffleAsRotate(
	              DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	        return Rotated;
	    }

	    if (SDValue Rotated = lowerVectorShuffleAsByteRotate(
	            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	      return Rotated;
	  }

	  // With direct blend support, decomposing into a permute plus blend is
	  // faster than crossing into the floating point domain.
	  if (HasBlend)
	    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
	                                                      Mask, DAG);

	  // Last resort: do the shuffle as v2f64 (SHUFPD). This likely costs about two
	  // cycles of stall for integer vectors on Nehalem and older chips, but every
	  // alternative is still more cycles and newer chips do not have the problem.
	  SDValue FPV1 = DAG.getBitcast(MVT::v2f64, V1);
	  SDValue FPV2 = DAG.getBitcast(MVT::v2f64, V2);
	  SDValue FPShuffle = DAG.getVectorShuffle(MVT::v2f64, DL, FPV1, FPV2, Mask);
	  return DAG.getBitcast(MVT::v2i64, FPShuffle);
	}

	/// \brief Report whether a 4-element mask can be done by one SHUFPS.
	///
	/// Callers use this to skip more specialized lowerings when the plain SHUFPS
	/// lowering is already efficient. Mask entries are -1 (undef) or 0..7, where
	/// values below 4 and values 4..7 refer to different inputs.
	///
	/// \returns true unless some half of the mask (entries 0-1 or entries 2-3)
	/// has two defined entries that come from different inputs.
	static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
	  // Only the 128-bit (four lane) form of SHUFPS is modelled here.
	  assert(Mask.size() == 4 && "Unsupported mask size!");
	  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
	  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
	  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
	  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

	  // A half is a problem only when both of its entries are defined and they
	  // sit on opposite sides of the 4 boundary; an undef entry never conflicts.
	  auto MixesInputs = [](int A, int B) {
	    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
	  };

	  return !MixesInputs(Mask[0], Mask[1]) && !MixesInputs(Mask[2], Mask[3]);
	}

	/// \brief Lower a vector shuffle using the SHUFPS instruction.
	///
	/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
	/// It makes no assumptions about whether this is the best lowering, it simply
	/// uses it.
	///
	/// \p Mask is read at indices 0..3. Entries >= 4 are counted as V2 elements;
	/// entries below 4 are V1 elements or undef (negative). Depending on how many
	/// V2 elements there are, either one X86ISD::SHUFP node is emitted, or a
	/// preparatory SHUFP "blend" followed by the final SHUFP.
	///
	/// \returns the final X86ISD::SHUFP node of type \p VT.
	static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2, SelectionDAG &DAG) {
	// LowV and HighV become the first and second operand of the final SHUFP;
	// NewMask holds the indices that are encoded into its immediate.
	SDValue LowV = V1, HighV = V2;
	int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	// Only the counts 1 and 2 need fix-ups. For any other count the code falls
	// through to the final SHUFP with LowV, HighV and NewMask as initialized above.
	if (NumV2Elements == 1) {
	int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

	// Compute the index adjacent to V2Index and in the same half by toggling
	// the low bit.
	int V2AdjIndex = V2Index ^ 1;

	if (Mask[V2AdjIndex] < 0) {
	// Handles all the cases where we have a single V2 element and an undef.
	// This will only ever happen in the high lanes because we commute the
	// vector otherwise.
	if (V2Index < 2)
	std::swap(LowV, HighV);
	// Rebase the V2 index from the 4..7 range to 0..3.
	NewMask[V2Index] -= 4;
	} else {
	// Handle the case where the V2 element ends up adjacent to a V1 element.
	// To make this work, blend them together as the first step.
	int V1Index = V2AdjIndex;
	// Operands are (V2, V1): the first two BlendMask entries index V2, the
	// last two index V1. The 0 entries are filler; only lanes 0 and 2 of the
	// blended value are referenced by NewMask below.
	int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
	V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now proceed to reconstruct the final blend as we have the necessary
	// high or low half formed.
	if (V2Index < 2) {
	// The blended value supplies the low half; the high half is V1.
	LowV = V2;
	HighV = V1;
	} else {
	// The blended value supplies the high half; LowV is still V1.
	HighV = V2;
	}
	NewMask[V1Index] = 2; // We put the V1 element in V2[2].
	NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
	}
	} else if (NumV2Elements == 2) {
	if (Mask[0] < 4 && Mask[1] < 4) {
	// Handle the easy case where we have V1 in the low lanes and V2 in the
	// high lanes.
	NewMask[2] -= 4;
	NewMask[3] -= 4;
	} else if (Mask[2] < 4 && Mask[3] < 4) {
	// We also handle the reversed case because this utility may get called
	// when we detect a SHUFPS pattern but can't easily commute the shuffle to
	// arrange things in the right direction.
	NewMask[0] -= 4;
	NewMask[1] -= 4;
	HighV = V1;
	LowV = V2;
	} else {
	// We have a mixture of V1 and V2 in both low and high lanes. Rather than
	// trying to place elements directly, just blend them and set up the final
	// shuffle to place them.
	//
	// Reaching this branch with exactly two V2 elements means each half of
	// the mask holds exactly one of them; the other entry of that half is
	// below 4 (a V1 element or undef).

	// The first two blend mask elements are for V1, the second two are for
	// V2.
	// In order: the low half's V1 entry, the high half's V1 entry, the low
	// half's V2 entry, the high half's V2 entry (V2 entries rebased by -4).
	int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
	Mask[2] < 4 ? Mask[2] : Mask[3],
	(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
	(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
	V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now we do a normal shuffle of V1 by giving V1 as both operands to
	// a blend.
	LowV = HighV = V1;
	// Lanes 0 and 2 of the blended value carry the low half's V1 and V2
	// entries, lanes 1 and 3 the high half's. Pick them in the order the
	// original mask asked for.
	NewMask[0] = Mask[0] < 4 ? 0 : 2;
	NewMask[1] = Mask[0] < 4 ? 2 : 0;
	NewMask[2] = Mask[2] < 4 ? 1 : 3;
	NewMask[3] = Mask[2] < 4 ? 3 : 1;
	}
	}
	return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
	getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
	}

	/// \brief Lower a shuffle of four 32-bit floating point lanes.
	///
	/// Only floating point unit instructions are used, to minimize domain
	/// crossing penalties; they are sufficient to implement every v4f32 shuffle.
	/// Strategies are tried in a fixed order and the first match is returned; the
	/// generic SHUFPS lowering is the final fallback.
	static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // Count the mask entries that select from V2 (indices 4 and above).
	  int NumV2Lanes = 0;
	  for (int M : Mask)
	    if (M >= 4)
	      ++NumV2Lanes;

	  // ---- Single-input shuffles -------------------------------------------
	  if (NumV2Lanes == 0) {
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(
	            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // SSE3 has dedicated instructions that duplicate the even or the odd
	    // elements.
	    if (Subtarget.hasSSE3()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
	      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
	        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
	    }

	    // With AVX, VPERMILPS is preferred since a load can be folded into it.
	    if (Subtarget.hasAVX()) {
	      SDValue Imm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, Imm);
	    }

	    // MOVLHPS/MOVHLPS can simulate these unary shuffles. They are only valid
	    // in SSE1; otherwise the shuffle is widened to v2f64 and never gets here.
	    if (!Subtarget.hasSSE2()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
	        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
	      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
	        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
	    }

	    // Generic unary case: SHUFPS with the same vector as both operands.
	    SDValue Imm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
	    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, Imm);
	  }

	  // ---- Two-input shuffles ----------------------------------------------

	  // Element insertion is only attempted here when the single V2 element
	  // targets lane 0, which is the fast case. More complex single-element
	  // blends have custom lowerings further down, used if both this and the
	  // blend fail to match.
	  if (NumV2Lanes == 1 && Mask[0] >= 4) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  if (Subtarget.hasSSE41()) {
	    if (SDValue Blended = lowerVectorShuffleAsBlend(
	            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Blended;

	    // INSERTPS, when it can complete the shuffle efficiently.
	    if (SDValue Inserted =
	            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
	      return Inserted;

	    // Blend-and-permute is only attempted when a single SHUFPS will not do.
	    if (!isSingleSHUFPSMask(Mask)) {
	      if (SDValue BlendThenPermute = lowerVectorShuffleAsBlendAndPermute(
	              DL, MVT::v4f32, V1, V2, Mask, DAG))
	        return BlendThenPermute;
	    }
	  }

	  // Low/high mov instructions. They are only valid in SSE1; otherwise the
	  // shuffle is widened to v2f64 and never gets here.
	  if (!Subtarget.hasSSE2()) {
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
	      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
	    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
	      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Nothing more specific matched: use the generic SHUFPS strategy.
	  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of four 32-bit integer lanes.
	///
	/// Integer-domain shuffles are tried where possible. Strategies are attempted
	/// in a fixed order and the first match is returned; if none applies the
	/// operands are bitcast to v4f32, shuffled there, and the result is bitcast
	/// back to v4i32.
	static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // A zero/any-extend is strictly faster than any alternative and in many
	  // cases lets memory operands be folded into the shuffle, so try it first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extended;

	  // Count the mask entries that select from V2 (indices 4 and above).
	  int NumV2Lanes = 0;
	  for (int M : Mask)
	    if (M >= 4)
	      ++NumV2Lanes;

	  // ---- Single-input shuffles -------------------------------------------
	  if (NumV2Lanes == 0) {
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(
	            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // From SSE2 onward a single PSHUFD handles any unary shuffle. The mask
	    // is coerced to an UNPCK-compatible pattern when it is equivalent to one,
	    // but the UNPCK instruction itself is deliberately not used: doing so
	    // prevents folding a load into the instruction or making a copy.
	    const int UnpackLoMask[] = {0, 0, 1, 1};
	    const int UnpackHiMask[] = {2, 2, 3, 3};
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
	      Mask = UnpackLoMask;
	    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
	      Mask = UnpackHiMask;

	    SDValue Imm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
	    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, Imm);
	  }

	  // ---- Two-input shuffles ----------------------------------------------

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(
	          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Some single-element blends have special lowerings.
	  if (NumV2Lanes == 1) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Every blend-based path below must be guarded by this one predicate.
	  const bool HasBlend = Subtarget.hasSSE41();
	  if (HasBlend) {
	    if (SDValue Blended = lowerVectorShuffleAsBlend(
	            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2,
	                                                   Mask, Zeroable, DAG))
	    return Masked;

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Rotations are only attempted from SSSE3 on; before that it is more
	  // profitable to use shuffles/unpacks.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue Rotated = lowerVectorShuffleAsRotate(
	              DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
	        return Rotated;
	    }

	    if (SDValue Rotated = lowerVectorShuffleAsByteRotate(
	            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
	      return Rotated;
	  }

	  // A single SHUFPS is assumed to beat any multi-instruction sequence, even
	  // on CPUs with a domain-crossing penalty (a later pass can fix that up if
	  // some CPU is harmed). The alternatives below are therefore considered
	  // only when one SHUFPS is not enough.
	  if (!isSingleSHUFPSMask(Mask)) {
	    // With direct blend support, decomposing into a permute plus blend is
	    // faster than the domain cross.
	    if (HasBlend)
	      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
	                                                        Mask, DAG);

	    // Otherwise try permuting the inputs so an unpack can finish the job.
	    if (SDValue Unpacked = lowerVectorShuffleAsPermuteAndUnpack(
	            DL, MVT::v4i32, V1, V2, Mask, DAG))
	      return Unpacked;
	  }

	  // Fall back to SHUFPS, which can blend from two vectors. SHUFPS is used
	  // even to build up the inputs, which bypasses the domain shift penalties
	  // that using PSHUFD directly would incur on Nehalem and older chips. For
	  // newer chips this is not relevant.
	  SDValue FPV1 = DAG.getBitcast(MVT::v4f32, V1);
	  SDValue FPV2 = DAG.getBitcast(MVT::v4f32, V2);
	  SDValue FPShuffle = DAG.getVectorShuffle(MVT::v4f32, DL, FPV1, FPV2, Mask);
	  return DAG.getBitcast(MVT::v4i32, FPShuffle);
	}

	/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
	/// shuffle lowering, and the most complex part.
	///
	/// The lowering strategy is to try to form pairs of input lanes which are
	/// targeted at the same half of the final vector, and then use a dword shuffle
	/// to place them onto the right half, and finally unpack the paired lanes into
	/// their final position.
	///
	/// The exact breakdown of how to form these dword pairs and align them on the
	/// correct sides is really tricky. See the comments within the function for
	/// more of the details.
	///
	/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
	/// lane must shuffle the exact same way. In fact, you must pass a v8 Mask to
	/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
	/// vector, form the analogous 128-bit 8-element Mask.
	static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
	const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
	MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

	assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
	MutableArrayRef<int> LoMask = Mask.slice(0, 4);
	MutableArrayRef<int> HiMask = Mask.slice(4, 4);

	// Attempt to directly match PSHUFLW or PSHUFHW.
	if (isUndefOrInRange(LoMask, 0, 4) &&
	isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
	}
	if (isUndefOrInRange(HiMask, 4, 8) &&
	isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	for (int i = 0; i != 4; ++i)
	HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
	return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
	}

	SmallVector<int, 4> LoInputs;
	copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
	std::sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
	std::sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
	int NumLToL =
	std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
	int NumHToL = LoInputs.size() - NumLToL;
	int NumLToH =
	std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
	int NumHToH = HiInputs.size() - NumLToH;
	MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
	MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
	MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
	MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

	// If we are shuffling values from one half - check how many different DWORD
	// pairs we need to create. If only 1 or 2 then we can perform this as a
	// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
	auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
	ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
	V = DAG.getNode(ShufWOp, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
	V = DAG.getBitcast(PSHUFDVT, V);
	V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	return DAG.getBitcast(VT, V);
	};

	if ((NumHToL + NumHToH) == 0 \|\| (NumLToL + NumLToH) == 0) {
	int PSHUFDMask[4] = { -1, -1, -1, -1 };
	SmallVector<std::pair<int, int>, 4> DWordPairs;
	int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

	// Collect the different DWORD pairs.
	for (int DWord = 0; DWord != 4; ++DWord) {
	int M0 = Mask[2 * DWord + 0];
	int M1 = Mask[2 * DWord + 1];
	M0 = (M0 >= 0 ? M0 % 4 : M0);
	M1 = (M1 >= 0 ? M1 % 4 : M1);
	if (M0 < 0 && M1 < 0)
	continue;

	bool Match = false;
	for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
	auto &DWordPair = DWordPairs[j];
	if ((M0 < 0 \|\| isUndefOrEqual(DWordPair.first, M0)) &&
	(M1 < 0 \|\| isUndefOrEqual(DWordPair.second, M1))) {
	DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
	DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
	PSHUFDMask[DWord] = DOffset + j;
	Match = true;
	break;
	}
	}
	if (!Match) {
	PSHUFDMask[DWord] = DOffset + DWordPairs.size();
	DWordPairs.push_back(std::make_pair(M0, M1));
	}
	}

	if (DWordPairs.size() <= 2) {
	DWordPairs.resize(2, std::make_pair(-1, -1));
	int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
	DWordPairs[1].first, DWordPairs[1].second};
	if ((NumHToL + NumHToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
	if ((NumLToL + NumLToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
	}
	}

	// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
	// such inputs we can swap two of the dwords across the half mark and end up
	// with <=2 inputs to each half in each half. Once there, we can fall through
	// to the generic code below. For example:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
	//
	// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
	// and an existing 2-into-2 on the other half. In this case we may have to
	// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
	// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
	// Fortunately, we don't have to handle anything but a 2-into-2 pattern
	// because any other situation (including a 3-into-1 or 1-into-3 in the other
	// half than the one we target for fixing) will be fixed when we re-enter this
	// path. We will also combine away any sequence of PSHUFD instructions that
	// result into a single instruction. Here is an example of the tricky case:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
	//
	// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
	//
	// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
	//
	// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
	//
	// The result is fine to be handled by the generic logic.
	auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
	ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
	int AOffset, int BOffset) {
	assert((AToAInputs.size() == 3 \|\| AToAInputs.size() == 1) &&
	"Must call this with A having 3 or 1 inputs from the A half.");
	assert((BToAInputs.size() == 1 \|\| BToAInputs.size() == 3) &&
	"Must call this with B having 1 or 3 inputs from the B half.");
	assert(AToAInputs.size() + BToAInputs.size() == 4 &&
	"Must call this with either 3:1 or 1:3 inputs (summing to 4).");

	bool ThreeAInputs = AToAInputs.size() == 3;

	// Compute the index of dword with only one word among the three inputs in
	// a half by taking the sum of the half with three inputs and subtracting
	// the sum of the actual three inputs. The difference is the remaining
	// slot.
	int ADWord, BDWord;
	int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
	int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
	int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
	ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
	int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
	int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
	int TripleNonInputIdx =
	TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
	TripleDWord = TripleNonInputIdx / 2;

	// We use xor with one to compute the adjacent DWord to whichever one the
	// OneInput is in.
	OneInputDWord = (OneInput / 2) ^ 1;

	// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
	// and BToA inputs. If there is also such a problem with the BToB and AToB
	// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
	// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
	// is essential that we don't create a 3<-1 as then we might oscillate.
	if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
	// Compute how many inputs will be flipped by swapping these DWords. We
	// need
	// to balance this to ensure we don't form a 3-1 shuffle in the other
	// half.
	int NumFlippedAToBInputs =
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
	int NumFlippedBToBInputs =
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
	if ((NumFlippedAToBInputs == 1 &&
	(NumFlippedBToBInputs == 0 \|\| NumFlippedBToBInputs == 2)) \|\|
	(NumFlippedBToBInputs == 1 &&
	(NumFlippedAToBInputs == 0 \|\| NumFlippedAToBInputs == 2))) {
	// We choose whether to fix the A half or B half based on whether that
	// half has zero flipped inputs. At zero, we may not be able to fix it
	// with that half. We also bias towards fixing the B half because that
	// will more commonly be the high half, and we have to bias one way.
	auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
	ArrayRef<int> Inputs) {
	int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
	bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
	// Determine whether the free index is in the flipped dword or the
	// unflipped dword based on where the pinned index is. We use this bit
	// in an xor to conditionally select the adjacent dword.
	int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
	bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	if (IsFixIdxInput == IsFixFreeIdxInput)
	FixFreeIdx += 1;
	IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	assert(IsFixIdxInput != IsFixFreeIdxInput &&
	"We need to be changing the number of flipped inputs!");
	int PSHUFHalfMask[] = {0, 1, 2, 3};
	std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
	V = DAG.getNode(
	FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
	MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

	for (int &M : Mask)
	if (M >= 0 && M == FixIdx)
	M = FixFreeIdx;
	else if (M >= 0 && M == FixFreeIdx)
	M = FixIdx;
	};
	if (NumFlippedBToBInputs != 0) {
	int BPinnedIdx =
	BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
	} else {
	assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
	int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
	}
	}
	}

	int PSHUFDMask[] = {0, 1, 2, 3};
	PSHUFDMask[ADWord] = BDWord;
	PSHUFDMask[BDWord] = ADWord;
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// Adjust the mask to match the new locations of A and B.
	for (int &M : Mask)
	if (M >= 0 && M/2 == ADWord)
	M = 2 * BDWord + M % 2;
	else if (M >= 0 && M/2 == BDWord)
	M = 2 * ADWord + M % 2;

	// Recurse back into this routine to re-compute state now that this isn't
	// a 3 and 1 problem.
	return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
	DAG);
	};
	if ((NumLToL == 3 && NumHToL == 1) \|\| (NumLToL == 1 && NumHToL == 3))
	return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
	if ((NumHToH == 3 && NumLToH == 1) \|\| (NumHToH == 1 && NumLToH == 3))
	return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

	// At this point there are at most two inputs to the low and high halves from
	// each half. That means the inputs can always be grouped into dwords and
	// those dwords can then be moved to the correct half with a dword shuffle.
	// We use at most one low and one high word shuffle to collect these paired
	// inputs into dwords, and finally a dword shuffle to place them.
	int PSHUFLMask[4] = {-1, -1, -1, -1};
	int PSHUFHMask[4] = {-1, -1, -1, -1};
	int PSHUFDMask[4] = {-1, -1, -1, -1};

	// First fix the masks for all the inputs that are staying in their
	// original halves. This will then dictate the targets of the cross-half
	// shuffles.
	auto fixInPlaceInputs =
	[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
	MutableArrayRef<int> SourceHalfMask,
	MutableArrayRef<int> HalfMask, int HalfOffset) {
	if (InPlaceInputs.empty())
	return;
	if (InPlaceInputs.size() == 1) {
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
	return;
	}
	if (IncomingInputs.empty()) {
	// Just fix all of the in place inputs.
	for (int Input : InPlaceInputs) {
	SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
	PSHUFDMask[Input / 2] = Input / 2;
	}
	return;
	}

	assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	// Put the second input next to the first so that they are packed into
	// a dword. We find the adjacent index by toggling the low bit.
	int AdjIndex = InPlaceInputs[0] ^ 1;
	SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
	PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
	};
	fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
	fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

	// Now gather the cross-half inputs and place them into a free dword of
	// their target half.
	// FIXME: This operation could almost certainly be simplified dramatically to
	// look more like the 3-1 fixing operation.
	auto moveInputsToRightHalf = [&PSHUFDMask](
	MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
	MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
	MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
	int DestOffset) {
	auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
	return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
	};
	auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
	int Word) {
	int LowWord = Word & ~1;
	int HighWord = Word \| 1;
	return isWordClobbered(SourceHalfMask, LowWord) \|\|
	isWordClobbered(SourceHalfMask, HighWord);
	};

	if (IncomingInputs.empty())
	return;

	if (ExistingInputs.empty()) {
	// Map any dwords with inputs from them into the right half.
	for (int Input : IncomingInputs) {
	// If the source half mask maps over the inputs, turn those into
	// swaps and use the swapped lane.
	if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
	if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
	SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
	Input - SourceOffset;
	// We have to swap the uses in our half mask in one sweep.
	for (int &M : HalfMask)
	if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
	M = Input;
	else if (M == Input)
	M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	} else {
	assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
	Input - SourceOffset &&
	"Previous placement doesn't match!");
	}
	// Note that this correctly re-maps both when we do a swap and when
	// we observe the other side of the swap above. We rely on that to
	// avoid swapping the members of the input list directly.
	Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	}

	// Map the input's dword into the correct half.
	if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
	PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
	else
	assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
	Input / 2 &&
	"Previous placement doesn't match!");
	}

	// And just directly shift any other-half mask elements to be same-half
	// as we will have mirrored the dword containing the element into the
	// same position within that half.
	for (int &M : HalfMask)
	if (M >= SourceOffset && M < SourceOffset + 4) {
	M = M - SourceOffset + DestOffset;
	assert(M >= 0 && "This should never wrap below zero!");
	}
	return;
	}

	// Ensure we have the input in a viable dword of its current half. This
	// is particularly tricky because the original position may be clobbered
	// by inputs being moved and staying in that half.
	if (IncomingInputs.size() == 1) {
	if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
	SourceOffset;
	SourceHalfMask[InputFixed - SourceOffset] =
	IncomingInputs[0] - SourceOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
	InputFixed);
	IncomingInputs[0] = InputFixed;
	}
	} else if (IncomingInputs.size() == 2) {
	if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 \|\|
	isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	// We have two non-adjacent or clobbered inputs we need to extract from
	// the source half. To do this, we need to map them into some adjacent
	// dword slot in the source mask.
	int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
	IncomingInputs[1] - SourceOffset};

	// If there is a free slot in the source half mask adjacent to one of
	// the inputs, place the other input in it. We use (Index XOR 1) to
	// compute an adjacent index.
	if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
	SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	InputsFixed[1] = InputsFixed[0] ^ 1;
	} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
	SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
	InputsFixed[0] = InputsFixed[1] ^ 1;
	} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
	// The two inputs are in the same DWord but it is clobbered and the
	// adjacent DWord isn't used at all. Move both inputs to the free
	// slot.
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
	InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
	InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
	} else {
	// The only way we hit this point is if there is no clobbering
	// (because there are no off-half inputs to this half) and there is no
	// free slot adjacent to one of the inputs. In this case, we have to
	// swap an input with a non-input.
	for (int i = 0; i < 4; ++i)
	assert((SourceHalfMask[i] < 0 \|\| SourceHalfMask[i] == i) &&
	"We can't handle any clobbers here!");
	assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
	"Cannot have adjacent inputs here!");

	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

	// We also have to update the final source mask in this case because
	// it may need to undo the above swap.
	for (int &M : FinalSourceHalfMask)
	if (M == (InputsFixed[0] ^ 1) + SourceOffset)
	M = InputsFixed[1] + SourceOffset;
	else if (M == InputsFixed[1] + SourceOffset)
	M = (InputsFixed[0] ^ 1) + SourceOffset;

	InputsFixed[1] = InputsFixed[0] ^ 1;
	}

	// Point everything at the fixed inputs.
	for (int &M : HalfMask)
	if (M == IncomingInputs[0])
	M = InputsFixed[0] + SourceOffset;
	else if (M == IncomingInputs[1])
	M = InputsFixed[1] + SourceOffset;

	IncomingInputs[0] = InputsFixed[0] + SourceOffset;
	IncomingInputs[1] = InputsFixed[1] + SourceOffset;
	}
	} else {
	llvm_unreachable("Unhandled input size!");
	}

	// Now hoist the DWord down to the right half.
	int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
	assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
	PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
	for (int &M : HalfMask)
	for (int Input : IncomingInputs)
	if (M == Input)
	M = FreeDWord * 2 + Input % 2;
	};
	moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
	/SourceOffset/ 4, /DestOffset/ 0);
	moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
	/SourceOffset/ 0, /DestOffset/ 4);

	// Now enact all the shuffles we've computed to move the inputs into their
	// target half.
	if (!isNoopShuffleMask(PSHUFLMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFHMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFDMask))
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// At this point, each half should contain all its inputs, and we can then
	// just shuffle them into their final position.
	assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
	"Failed to lift all the high half inputs to the low mask!");
	assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
	"Failed to lift all the low half inputs to the high mask!");

	// Do a half shuffle for the low mask.
	if (!isNoopShuffleMask(LoMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

	// Do a half shuffle with the high mask after shifting its values down.
	for (int &M : HiMask)
	if (M >= 0)
	M -= 4;
	if (!isNoopShuffleMask(HiMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

	return V;
	}

	/// Build a shuffle out of one PSHUFB per input and, when both inputs
	/// contribute, OR the two results together.
	///
	/// \p Mask is scaled up to 16 byte-sized control entries. For every byte one
	/// control selects the source byte from the input that provides it while the
	/// other input's control is the "zero" value 0x80, so that OR-ing the two
	/// PSHUFB results yields the blend. Elements flagged in \p Zeroable get the
	/// zero control for both inputs.
	///
	/// \p V1InUse / \p V2InUse are outputs: they are reset and then set when at
	/// least one control byte for that input is not the zero control. An input
	/// that is not in use gets no PSHUFB node, and the OR is skipped unless both
	/// are in use.
	///
	/// \returns the result bitcast back to \p VT.
	static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
	    bool &V2InUse) {
	  // Control value used for "produce a zero byte from this input".
	  const int ZeroMask = 0x80;

	  const int NumElts = Mask.size();
	  const int BytesPerElt = 16 / NumElts;

	  SDValue V1Controls[16];
	  SDValue V2Controls[16];
	  V1InUse = false;
	  V2InUse = false;

	  for (int Byte = 0; Byte != 16; ++Byte) {
	    const int Elt = Byte / BytesPerElt;
	    const int M = Mask[Elt];

	    // Undef mask elements become undef control bytes for both inputs.
	    if (M < 0) {
	      V1Controls[Byte] = V2Controls[Byte] = DAG.getUNDEF(MVT::i8);
	      continue;
	    }

	    // Start with both inputs zeroed and then let at most one of them select
	    // a real byte.
	    int V1Idx = ZeroMask;
	    int V2Idx = ZeroMask;
	    if (!Zeroable[Elt]) {
	      const int ByteInElt = Byte % BytesPerElt;
	      if (M < NumElts)
	        V1Idx = M * BytesPerElt + ByteInElt;
	      else
	        V2Idx = (M - NumElts) * BytesPerElt + ByteInElt;
	    }

	    V1Controls[Byte] = DAG.getConstant(V1Idx, DL, MVT::i8);
	    V2Controls[Byte] = DAG.getConstant(V2Idx, DL, MVT::i8);
	    if (V1Idx != ZeroMask)
	      V1InUse = true;
	    if (V2Idx != ZeroMask)
	      V2InUse = true;
	  }

	  // Only emit a PSHUFB for an input that actually supplies some byte.
	  if (V1InUse)
	    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                     DAG.getBitcast(MVT::v16i8, V1),
	                     DAG.getBuildVector(MVT::v16i8, DL, V1Controls));
	  if (V2InUse)
	    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                     DAG.getBitcast(MVT::v16i8, V2),
	                     DAG.getBuildVector(MVT::v16i8, DL, V2Controls));

	  // With a single contributing input that input is the answer; otherwise
	  // merge the two shuffled inputs.
	  SDValue Blended = V1InUse ? V1 : V2;
	  if (V1InUse && V2InUse)
	    Blended = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);

	  // Cast the result back to the requested type.
	  return DAG.getBitcast(VT, Blended);
	}

	/// \brief Generic lowering of 8-lane i16 shuffles.
	///
	/// This handles both single-input shuffles and combined shuffle/blends with
	/// two inputs. The single input shuffles are immediately delegated to
	/// a dedicated lowering routine.
	///
	/// The blends are lowered in one of three fundamental ways. If there are few
	/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
	/// of the input is significantly cheaper when lowered as an interleaving of
	/// the two inputs, try to interleave them. Otherwise, blend the low and high
	/// halves of the inputs separately (making them have relatively few inputs)
	/// and then concatenate them.
	///
	/// NOTE(review): the body below tries a longer, ordered list of strategies
	/// (zext, shift, SSE4A, element insertion, blend, bit-mask, unpack, pack,
	/// byte-rotate, bit-blend, permute+unpack, PSHUFB blend) before the final
	/// decomposed shuffle/blend fallback; the paragraph above does not describe
	/// all of them.
	///
	/// Both \p V1 and \p V2 must be v8i16 and \p Mask must have 8 entries; mask
	/// values 0-7 select from V1, 8-15 from V2, and negative values are undef.
	/// The first strategy that returns a non-null SDValue wins, so the order of
	/// the attempts is significant.
	static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Count the mask entries that read from V2 (indices 8 and above).
	int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

	// Single-input path: nothing reads from V2.
	if (NumV2Inputs == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Try to use shift instructions.
	// Note that V1 is passed for both operands here.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
	DAG, Subtarget))
	return V;

	// Try to use byte rotation instructions.
	// Again V1 is used as both operands.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
	Mask, Subtarget, DAG))
	return Rotate;

	// Make a copy of the mask so it can be modified.
	SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
	return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
	MutableMask, Subtarget,
	DAG);
	}

	// From here on at least one element comes from V2; a mask that reads only
	// from V2 is not expected to reach this point.
	assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
	"All single-input shuffles should be canonicalized to be V1-input "
	"shuffles.");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// There are special ways we can lower some single-element blends.
	if (NumV2Inputs == 1)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// We have different paths for blend lowering, but they all must use the
	// exact same predicate.
	// The same flag is consulted again below to decide on the PSHUFB path.
	bool IsBlendSupported = Subtarget.hasSSE41();
	if (IsBlendSupported)
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Try the bit-mask lowering helper (it is given the Zeroable set).
	if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try the bit-blend lowering helper.
	if (SDValue BitBlend =
	lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
	return BitBlend;

	// Try to lower by permuting the inputs into an unpack instruction.
	if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
	V2, Mask, DAG))
	return Unpack;

	// If we can't directly blend but can use PSHUFB, that will be better as it
	// can both shuffle and set up the inefficient blend.
	if (!IsBlendSupported && Subtarget.hasSSSE3()) {
	// The in-use flags are required outputs of the helper but are not needed
	// by this caller.
	bool V1InUse, V2InUse;
	return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
	Zeroable, DAG, V1InUse, V2InUse);
	}

	// We can always bit-blend if we have to so the fallback strategy is to
	// decompose into single-input permutes and blends.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
	Mask, DAG);
	}

	/// \brief Decide whether a shuffle is a "keep every 2^N-th element"
	/// compaction and, if so, report N.
	///
	/// A mask matches for a given N when every defined entry \c Mask[i] equals
	/// \c (i << N) modulo the number of addressable input elements (the mask
	/// size for a single input, twice that for two inputs). Example masks:
	///
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
	/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
	/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
	/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
	/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
	///
	/// Undef (negative) entries match any pattern.
	///
	/// Only N = 1, 2 and 3 are considered.
	/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
	/// for larger N.
	///
	/// \returns the smallest matching N, i.e. how many rounds of "drop the even
	/// elements" are required, or zero when no N in range matches.
	static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
	                                          bool IsSingleInput) {
	  // Mask values are compared modulo the total number of input elements.
	  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
	  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
	         "We should only be called with masks with a power-of-2 size!");
	  const uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

	  // Candidate K stands for N = K + 1. All candidates are tracked at once
	  // because undef lanes can leave more than one of them plausible.
	  const unsigned NumCandidates = 3;
	  bool Candidate[NumCandidates] = {true, true, true};

	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    // Undef lanes are optimistically assumed to fit.
	    if (M < 0)
	      continue;

	    bool AnySurvivor = false;
	    for (unsigned K = 0; K != NumCandidates; ++K) {
	      if (!Candidate[K])
	        continue;

	      // A surviving candidate needs Mask[Idx] == (Idx * 2^N) % modulus.
	      const uint64_t N = K + 1;
	      const uint64_t Expected = ((uint64_t)Idx << N) & ModMask;
	      if ((uint64_t)M == Expected)
	        AnySurvivor = true;
	      else
	        Candidate[K] = false;
	    }

	    // Every candidate has been ruled out, so there is nothing left to find.
	    if (!AnySurvivor)
	      return 0;
	  }

	  // Report the smallest stride exponent that is still viable.
	  for (unsigned K = 0; K != NumCandidates; ++K)
	    if (Candidate[K])
	      return K + 1;

	  // No viable power of two.
	  return 0;
	}

	/// \brief Lower a shuffle to a variable permute node.
	///
	/// \p Mask is materialized as a constant vector of integers with the same
	/// element width and element count as \p VT. When \p V2 is undef a
	/// single-source VPERMV node is produced, otherwise a two-source VPERMV3
	/// node.
	static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  // Integer vector type shaped like VT, used to hold the indices.
	  MVT IndexVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
	                                 VT.getVectorNumElements());
	  // NOTE(review): the trailing 'true' presumably marks the constant as a
	  // shuffle mask for getConstVector - confirm against that helper.
	  SDValue Indices = getConstVector(Mask, IndexVT, DAG, DL, true);

	  // Note the differing operand orders of the two node kinds.
	  if (!V2.isUndef())
	    return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);

	  return DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1);
	}

	/// \brief Generic lowering of v16i8 shuffles.
	///
	/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
	/// detect any complexity reducing interleaving. If that doesn't help, it uses
	/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
	/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
	/// back together.
	///
	/// Both \p V1 and \p V2 must be v16i8 and \p Mask must have 16 entries; mask
	/// values 0-15 select from V1, 16-31 from V2, and negative values are undef.
	/// Strategies are attempted in the order written below and the first one
	/// that yields a non-null SDValue is returned, so the order is significant.
	static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use a zext lowering.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// Count the mask entries that read from V2 (indices 16 and above).
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

	// For single-input shuffles, there are some nicer lowering tricks we can use.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Check whether we can widen this to an i16 shuffle by duplicating bytes.
	// Notably, this handles splat and partial-splat shuffles more efficiently.
	// However, it only makes sense if the pre-duplication shuffle simplifies
	// things significantly. Currently, this means we need to be able to
	// express the pre-duplication shuffle as an i16 shuffle.
	//
	// FIXME: We should check for other patterns which can be widened into an
	// i16 shuffle as well.
	//
	// The predicate rejects the mask as soon as some byte pair (2k, 2k+1) has
	// two defined entries that differ.
	auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
	for (int i = 0; i < 16; i += 2)
	if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
	return false;

	return true;
	};
	// Returns a null SDValue when the widening is not applicable. Note that it
	// overwrites the captured V1 on its successful path.
	auto tryToWidenViaDuplication = [&]() -> SDValue {
	if (!canWidenViaDuplication(Mask))
	return SDValue();
	// Gather the sorted, de-duplicated byte indices used from the low half
	// (0-7) and from the high half (8 and above) of the input.
	SmallVector<int, 4> LoInputs;
	copy_if(Mask, std::back_inserter(LoInputs),
	[](int M) { return M >= 0 && M < 8; });
	std::sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
	LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
	std::sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
	HiInputs.end());

	// Keep the half with more distinct inputs in place (ties go to the low
	// half) and move the inputs of the other half into it.
	bool TargetLo = LoInputs.size() >= HiInputs.size();
	ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
	ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

	// PreDupI16Shuffle is an 8-entry i16 mask; LaneMap records where each
	// original byte index ends up after that shuffle.
	int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
	SmallDenseMap<int, int, 8> LaneMap;
	for (int I : InPlaceInputs) {
	PreDupI16Shuffle[I/2] = I/2;
	LaneMap[I] = I;
	}
	// j scans the four i16 slots of the target half: [0, 4) or [4, 8).
	int j = TargetLo ? 0 : 4, je = j + 4;
	for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
	// Check if j is already a shuffle of this input. This happens when
	// there are two adjacent bytes after we move the low one.
	if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
	// If we haven't yet mapped the input, search for a slot into which
	// we can map it.
	while (j < je && PreDupI16Shuffle[j] >= 0)
	++j;

	if (j == je)
	// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
	return SDValue();

	// Map this input with the i16 shuffle.
	PreDupI16Shuffle[j] = MovingInputs[i] / 2;
	}

	// Update the lane map based on the mapping we ended up with.
	LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
	}
	// Apply the pre-duplication shuffle on V1 viewed as v8i16.
	V1 = DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

	// Unpack the bytes to form the i16s that will be shuffled into place.
	V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	MVT::v16i8, V1, V1);

	// Build the final i16 shuffle: result pair i/2 takes the i16 whose index
	// is the mapped byte position, rebased by 8 when the high half was the
	// target.
	int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0) {
	int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
	assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
	if (PostDupI16Shuffle[i / 2] < 0)
	PostDupI16Shuffle[i / 2] = MappedMask;
	else
	assert(PostDupI16Shuffle[i / 2] == MappedMask &&
	"Conflicting entries in the original shuffle!");
	}
	return DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
	};
	if (SDValue V = tryToWidenViaDuplication())
	return V;
	}

	// Try the bit-mask lowering helper (it is given the Zeroable set).
	if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
	// with PSHUFB. It is important to do this before we attempt to generate any
	// blends but after all of the single-input lowerings. If the single input
	// lowerings can find an instruction sequence that is faster than a PSHUFB, we
	// want to preserve that and we can DAG combine any longer sequences into
	// a PSHUFB in the end. But once we start blending from multiple inputs,
	// the complexity of DAG combining bad patterns back into PSHUFB is too high,
	// and there are very few patterns that would actually be faster than the
	// PSHUFB approach because of its ability to zero lanes.
	//
	// FIXME: The only exceptions to the above are blends which are exact
	// interleavings with direct instructions supporting them. We currently don't
	// handle those well here.
	if (Subtarget.hasSSSE3()) {
	bool V1InUse = false;
	bool V2InUse = false;

	// The PSHUFB form is built up front; it is returned below unless one of
	// the alternatives for the two-input case succeeds first.
	SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

	// If both V1 and V2 are in use and we can use a direct blend or an unpack,
	// do so. This avoids using them to handle blends-with-zero which is
	// important as a single pshufb is significantly faster for that.
	if (V1InUse && V2InUse) {
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Blend;

	// We can use an unpack to do the blending rather than an or in some
	// cases. Even though the or may be (very minorly) more efficient, we
	// preference this lowering because there are common cases where part of
	// the complexity of the shuffles goes away when we do the final blend as
	// an unpack.
	// FIXME: It might be worth trying to detect if the unpack-feeding
	// shuffles will both be pshufb, in which case we shouldn't bother with
	// this.
	if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
	DL, MVT::v16i8, V1, V2, Mask, DAG))
	return Unpack;

	// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
	}

	return PSHUFB;
	}

	// Everything below is only reached when SSSE3 is not available.

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// Try the bit-blend lowering helper.
	if (SDValue BitBlend =
	lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
	return BitBlend;

	// Check whether a compaction lowering can be done. This handles shuffles
	// which take every Nth element for some even N. See the helper function for
	// details.
	//
	// We special case these as they can be particularly efficiently handled with
	// the PACKUSB instruction on x86 and they show up in common patterns of
	// rearranging bytes to truncate wide elements.
	bool IsSingleInput = V2.isUndef();
	if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
	// NumEvenDrops is the power of two stride of the elements. Another way of
	// thinking about it is that we need to drop the even elements this many
	// times to get the original input.

	// First we need to zero all the dropped bytes.
	assert(NumEvenDrops <= 3 &&
	"No support for dropping even elements more than 3 times.");
	// We use the mask type to pick which bytes are preserved based on how many
	// elements are dropped.
	// A 0xFF constant in each i16/i32/i64 element, viewed as bytes, is the AND
	// mask applied to the inputs below.
	MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
	SDValue ByteClearMask = DAG.getBitcast(
	MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
	V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
	if (!IsSingleInput)
	V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

	// Now pack things back together.
	V1 = DAG.getBitcast(MVT::v8i16, V1);
	V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
	// Each additional drop is one more PACKUS of the result with itself.
	for (int i = 1; i < NumEvenDrops; ++i) {
	Result = DAG.getBitcast(MVT::v8i16, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
	}

	return Result;
	}

	// Handle multi-input cases by blending single-input shuffles.
	if (NumV2Elements > 0)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
	Mask, DAG);

	// The fallback path for single-input shuffles widens this into two v8i16
	// vectors with unpacks, shuffles those, and then pulls them back together
	// with a pack.
	SDValue V = V1;

	// Split the 16-entry mask into the masks for result bytes 0-7 and 8-15.
	std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0)
	(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

	SDValue VLoHalf, VHiHalf;
	// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
	// them out and avoid using UNPCK{L,H} to extract the elements of V as
	// i16s.
	if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
	none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
	// Use a mask to drop the high bytes.
	VLoHalf = DAG.getBitcast(MVT::v8i16, V);
	VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
	DAG.getConstant(0x00FF, DL, MVT::v8i16));

	// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
	VHiHalf = DAG.getUNDEF(MVT::v8i16);

	// Squash the masks to point directly into VLoHalf.
	// Only even byte indices remain here, so halving gives the i16 index.
	for (int &M : LoBlendMask)
	if (M >= 0)
	M /= 2;
	for (int &M : HiBlendMask)
	if (M >= 0)
	M /= 2;
	} else {
	// Otherwise just unpack the low half of V into VLoHalf and the high half into
	// VHiHalf so that we can blend them as i16s.
	SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

	VLoHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
	VHiHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
	}

	// Shuffle as i16 for each result half, then pack the two halves back into
	// a single v16i8.
	SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
	SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

	return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
	}

	/// \brief Route a 128-bit vector shuffle to the lowering routine for its
	/// element type.
	///
	/// The simple value type of \p VT selects one of the v2i64, v2f64, v4i32,
	/// v4f32, v8i16 or v16i8 lowering functions, which receives all remaining
	/// arguments unchanged. Any other type is treated as unreachable.
	static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  const auto Ty = VT.SimpleTy;

	  // Two 64-bit elements.
	  if (Ty == MVT::v2i64)
	    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v2f64)
	    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // Four 32-bit elements.
	  if (Ty == MVT::v4i32)
	    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v4f32)
	    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // Eight 16-bit or sixteen 8-bit integer elements.
	  if (Ty == MVT::v8i16)
	    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v16i8)
	    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// \brief Lower a wide shuffle by working on the halves of its operands.
	///
	/// Each operand is cut into a low and a high half, each half of the result
	/// is formed by shuffling/blending those four pieces, and the two result
	/// halves are glued together with CONCAT_VECTORS.
	///
	/// \p VT must be at least 256 bits wide and both \p V1 and \p V2 must
	/// already have type \p VT.
	static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	                                          SDValue V2, ArrayRef<int> Mask,
	                                          SelectionDAG &DAG) {
	  assert(VT.getSizeInBits() >= 256 &&
	         "Only for 256-bit or wider vector shuffles!");
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

	  // Geometry of the full vector and of one half of it.
	  int NumElements = VT.getVectorNumElements();
	  int SplitNumElements = NumElements / 2;
	  MVT SplitVT = MVT::getVectorVT(VT.getVectorElementType(), SplitNumElements);

	  // The mask for each half of the result.
	  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
	  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

	  // Produce the (low, high) halves of an operand, both bitcast to SplitVT.
	  // A build-vector (found after looking through bitcasts) is not extracted
	  // from; two narrower build-vectors are created from its operands instead,
	  // which helps shuffling with splats and zeros.
	  auto SplitVector = [&](SDValue V) {
	    V = peekThroughBitcasts(V);

	    // The looked-through value may have a different element type, so the
	    // split is done in terms of its own type.
	    MVT SrcVT = V.getSimpleValueType();
	    int SrcNumElements = SrcVT.getVectorNumElements();
	    int SrcHalfElements = SrcNumElements / 2;
	    MVT SrcHalfVT =
	        MVT::getVectorVT(SrcVT.getVectorElementType(), SrcNumElements / 2);

	    SDValue LoV, HiV;
	    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
	      SmallVector<SDValue, 16> LoOps, HiOps;
	      for (int Op = 0; Op != SrcHalfElements; ++Op) {
	        LoOps.push_back(BV->getOperand(Op));
	        HiOps.push_back(BV->getOperand(Op + SrcHalfElements));
	      }
	      LoV = DAG.getBuildVector(SrcHalfVT, DL, LoOps);
	      HiV = DAG.getBuildVector(SrcHalfVT, DL, HiOps);
	    } else {
	      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcHalfVT, V,
	                        DAG.getIntPtrConstant(0, DL));
	      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcHalfVT, V,
	                        DAG.getIntPtrConstant(SrcHalfElements, DL));
	    }
	    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
	                          DAG.getBitcast(SplitVT, HiV));
	  };

	  SDValue LoV1, HiV1, LoV2, HiV2;
	  std::tie(LoV1, HiV1) = SplitVector(V1);
	  std::tie(LoV2, HiV2) = SplitVector(V2);

	  // Build one half of the result as a blend of up to four half-width inputs.
	  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
	    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
	    // V1BlendMask / V2BlendMask shuffle within (Lo, Hi) of one operand;
	    // BlendMask then picks lane i from the V1 side (i) or the V2 side
	    // (SplitNumElements + i).
	    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
	    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
	    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
	    for (int i = 0; i < SplitNumElements; ++i) {
	      const int M = HalfMask[i];
	      if (M < 0)
	        continue;

	      if (M < NumElements) {
	        // Element of V1: note which half it lives in.
	        if (M < SplitNumElements)
	          UseLoV1 = true;
	        else
	          UseHiV1 = true;
	        V1BlendMask[i] = M;
	        BlendMask[i] = i;
	      } else {
	        // Element of V2: note which half it lives in.
	        if (M < NumElements + SplitNumElements)
	          UseLoV2 = true;
	        else
	          UseHiV2 = true;
	        V2BlendMask[i] = M - NumElements;
	        BlendMask[i] = SplitNumElements + i;
	      }
	    }

	    // Lowering runs after all combining, so the masks are combined by hand
	    // here to keep the number of high-level shuffle nodes minimal.
	    const bool UsesV1 = UseLoV1 || UseHiV1;
	    const bool UsesV2 = UseLoV2 || UseHiV2;

	    // Cases that need no final blend: nothing used, or only one operand.
	    if (!UsesV1 && !UsesV2)
	      return DAG.getUNDEF(SplitVT);
	    if (!UsesV2)
	      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	    if (!UsesV1)
	      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

	    SDValue V1Blend, V2Blend;
	    if (UseLoV1 && UseHiV1) {
	      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	    } else {
	      // Only one half of V1 is read: use it directly and fold its element
	      // selection into the final blend mask.
	      V1Blend = UseLoV1 ? LoV1 : HiV1;
	      const int V1Rebase = UseLoV1 ? 0 : SplitNumElements;
	      for (int i = 0; i < SplitNumElements; ++i)
	        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
	          BlendMask[i] = V1BlendMask[i] - V1Rebase;
	    }
	    if (UseLoV2 && UseHiV2) {
	      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
	    } else {
	      // Only one half of V2 is read: use it directly and fold its element
	      // selection into the final blend mask.
	      V2Blend = UseLoV2 ? LoV2 : HiV2;
	      const int V2Rebase = UseLoV2 ? SplitNumElements : 0;
	      for (int i = 0; i < SplitNumElements; ++i)
	        if (BlendMask[i] >= SplitNumElements)
	          BlendMask[i] = V2BlendMask[i] + V2Rebase;
	    }
	    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
	  };

	  SDValue Lo = HalfBlend(LoMask);
	  SDValue Hi = HalfBlend(HiMask);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	/// \brief Choose between splitting a shuffle in half and decomposing it into
	/// single-input shuffles plus a blend.
	///
	/// Intended as a fallback for two-input shuffles spanning more than one
	/// 128-bit lane. The decomposed shuffle+blend form is used when every
	/// element taken from an input uses the same index of that input (a blend
	/// of two broadcasts). Otherwise the shuffle is split when each input
	/// contributes elements from at most one of its 128-bit lanes. In every
	/// other case the decomposed shuffle+blend form is used.
	///
	/// \p V2 must not be undef: the decomposed single-input shuffles must not
	/// end up back in this routine.
	static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
	                                                SDValue V1, SDValue V2,
	                                                ArrayRef<int> Mask,
	                                                SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
	                          "shuffles as it could then recurse on itself.");
	  int Size = Mask.size();

	  // True when all defined elements read from V1 share one index and all
	  // defined elements read from V2 share one index. Such a shuffle is
	  // a blend of two broadcasts, and broadcasts can often fold with memory
	  // operands.
	  auto IsBlendOfBroadcasts = [&] {
	    // Index seen so far for V1 (slot 0) and V2 (slot 1); -1 means none yet.
	    int BroadcastIdx[2] = {-1, -1};
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      const bool FromV2 = M >= Size;
	      const int Idx = FromV2 ? M - Size : M;
	      int &Seen = BroadcastIdx[FromV2 ? 1 : 0];
	      if (Seen < 0)
	        Seen = Idx;
	      else if (Seen != Idx)
	        return false;
	    }
	    return true;
	  };

	  bool PreferSplit = false;
	  if (!IsBlendOfBroadcasts()) {
	    // Record, per input, which 128-bit lanes supply at least one element.
	    int LaneCount = VT.getSizeInBits() / 128;
	    int LaneSize = Size / LaneCount;
	    SmallBitVector LaneInputs[2];
	    LaneInputs[0].resize(LaneCount, false);
	    LaneInputs[1].resize(LaneCount, false);
	    for (int M : Mask)
	      if (M >= 0)
	        LaneInputs[M / Size][(M % Size) / LaneSize] = true;

	    // With at most one source lane per input the split decomposes to
	    // unusually few instructions.
	    PreferSplit = LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1;
	  }

	  if (PreferSplit)
	    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

	  // Decomposed single-input shuffles followed by a blend. This requires
	  // that the decomposed single-input shuffles don't end up here.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
	/// a permutation and blend of those lanes.
	///
	/// This essentially blends the out-of-lane inputs to each lane into the lane
	/// from a permuted copy of the vector. This lowering strategy results in four
	/// instructions in the worst case for a single-input cross lane shuffle which
	/// is lower than any other fully general cross-lane shuffle strategy I'm aware
	/// of. Special cases for each particular shuffle pattern should be handled
	/// prior to trying this lowering.
	///
	/// Only 256-bit vector types are accepted. When the mask shows that splitting
	/// is cheaper (see the checks below) the shuffle is handed to
	/// splitAndLowerVectorShuffle instead; past that point \p V2 must be undef.
	static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
	                                                       SDValue V1, SDValue V2,
	                                                       ArrayRef<int> Mask,
	                                                       SelectionDAG &DAG,
	                                                       const X86Subtarget &Subtarget) {
	  // FIXME: This should probably be generalized for 512-bit vectors as well.
	  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
	  int Size = Mask.size();
	  int LaneSize = Size / 2;

	  // If there are only inputs from one 128-bit lane, splitting will in fact be
	  // less expensive. The flags track whether the given lane contains an element
	  // that crosses to another lane.
	  if (!Subtarget.hasAVX2()) {
	    bool LaneCrossing[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
	        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
	    if (!LaneCrossing[0] || !LaneCrossing[1])
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  } else {
	    // With AVX2, split only when one of the two source lanes is never read.
	    bool LaneUsed[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0)
	        // Reduce modulo Size first so that an index referring to the second
	        // input (>= Size) maps to lane 0 or 1 instead of indexing past the
	        // end of LaneUsed; this matches the non-AVX2 branch above.
	        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
	    if (!LaneUsed[0] || !LaneUsed[1])
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  assert(V2.isUndef() &&
	         "This last part of this routine only works on single input shuffles");

	  // Build the mask for the final blend of V1 with its lane-swapped copy:
	  // elements already in the right lane are taken from V1 unchanged, while
	  // lane-crossing elements are taken from the second operand (offset by Size)
	  // at the same in-lane position within the destination lane.
	  SmallVector<int, 32> FlippedBlendMask(Size);
	  for (int i = 0; i < Size; ++i)
	    FlippedBlendMask[i] =
	        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
	                                ? Mask[i]
	                                : Mask[i] % LaneSize +
	                                      (i / LaneSize) * LaneSize + Size);

	  // Flip the vector, and blend the results which should now be in-lane. The
	  // flip is expressed as a {2, 3, 0, 1} shuffle on a 4 x 64-bit view of V1.
	  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
	  SDValue Flipped = DAG.getBitcast(PVT, V1);
	  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
	                                 { 2, 3, 0, 1 });
	  Flipped = DAG.getBitcast(VT, Flipped);
	  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
	}

	/// \brief Handle lowering 2-lane 128-bit shuffles.
	///
	/// Tries, in order: a blend, an extract-subvector + concat sequence (for the
	/// {0, 1, 0, 1} and {0, 1, 4, 5} patterns), a SHUF128 node (VLX only), and
	/// finally a VPERM2X128 node with an immediate control byte.
	///
	/// Returns an empty SDValue when this routine declines the shuffle: for unary
	/// shuffles on AVX2 targets, or when canWidenShuffleElements fails on \p Mask.
	///
	/// \p Zeroable is tested two bits at a time (0x3 and 0xc) to decide whether
	/// the low or high half of the result is entirely zeroable.
	static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
	if (Subtarget.hasAVX2() && V2.isUndef())
	return SDValue();

	// NOTE(review): WidenedMask is presumably the mask re-expressed with one
	// entry per 128-bit half (it is indexed with [0] and [1] below) - confirm
	// against canWidenShuffleElements.
	SmallVector<int, 4> WidenedMask;
	if (!canWidenShuffleElements(Mask, WidenedMask))
	return SDValue();

	// TODO: If minimizing size and one of the inputs is a zero vector and
	// the zero vector has only one use, we could use a VPERM2X128 to save the
	// instruction bytes needed to explicitly generate the zero vector.

	// Blends are faster and handle all the non-lane-crossing cases.
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// A half is "zero" when both of its 64-bit elements are zeroable.
	bool IsLowZero = (Zeroable & 0x3) == 0x3;
	bool IsHighZero = (Zeroable & 0xc) == 0xc;

	// If either input operand is a zero vector, use VPERM2X128 because its mask
	// allows us to replace the zero input with an implicit zero. The
	// alternatives in this block are therefore only tried when neither half of
	// the result is zero.
	if (!IsLowZero && !IsHighZero) {
	// Check for patterns which can be matched with a single insert of a 128-bit
	// subvector.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
	if (OnlyUsesV1 \|\| isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

	// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
	// this will likely become vinsertf128 which can't fold a 256-bit memop.
	if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
	VT.getVectorNumElements() / 2);
	// Both patterns take the low half of V1 for the low half of the result;
	// the high half of the result is the low half of V1 or of V2.
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	}
	}

	// Try to use SHUF128 if possible. This is only attempted when the first
	// widened index is below 2 and the second is at least 2; the immediate
	// keeps just the low bit of each widened index.
	if (Subtarget.hasVLX()) {
	if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
	unsigned PermMask = ((WidenedMask[0] % 2) << 0) \|
	((WidenedMask[1] % 2) << 1);
	return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}
	}
	}

	// Otherwise form a 128-bit permutation. After accounting for undefs,
	// convert the 64-bit shuffle mask selection values into 128-bit
	// selection bits by dividing the indexes by 2 and shifting into positions
	// defined by a vperm2*128 instruction's immediate control byte.

	// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore
	// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore
	// [7] - zero high half of destination

	assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");

	// A zeroed half sets only its "zero" bit (0x08 / 0x80); otherwise the
	// widened index is placed in the half's selector field.
	unsigned PermMask = 0;
	PermMask \|= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
	PermMask \|= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

	// Check the immediate mask and replace unused sources with undef.
	// Within each nibble, masking with 0xa keeps selector bit 1 and the zero
	// bit: a result of 0x0 means the half reads V1, 0x2 means it reads V2.
	// V1 is dropped when neither half reads it, and likewise for V2.
	if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
	V1 = DAG.getUNDEF(VT);
	if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
	V2 = DAG.getUNDEF(VT);

	return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Lower a vector shuffle as a whole-lane shuffle followed by an
	/// in-lane shuffle.
	///
	/// The lowering succeeds only when every 128-bit lane of the result reads
	/// from exactly one 128-bit source lane, and the in-lane element pattern is
	/// the same for every lane. In that case a first shuffle (on 64-bit elements)
	/// moves whole lanes into place, and a second, single-input, non-lane-crossing
	/// shuffle arranges the elements inside each lane. Otherwise an empty SDValue
	/// is returned.
	///
	/// FIXME: It might be worthwhile at some point to support this without
	/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
	/// in x86 only floating point has interesting non-repeating shuffles, and even
	/// those are still marginally more expensive.
	static SDValue lowerVectorShuffleByMerging128BitLanes(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

	  const int NumElts = Mask.size();
	  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  const int LaneCount = NumElts / EltsPerLane;
	  assert(LaneCount > 1 && "Only handles 256-bit and wider shuffles.");

	  // For each destination lane, the source lane it reads from (-1 if the
	  // destination lane is entirely undef), plus the in-lane pattern that all
	  // lanes must share.
	  SmallVector<int, 4> SrcLaneOf((unsigned)LaneCount, -1);
	  SmallVector<int, 4> SharedInLaneMask((unsigned)EltsPerLane, -1);
	  for (int i = 0; i != NumElts; ++i) {
	    const int M = Mask[i];
	    if (M < 0)
	      continue;

	    // The first defined element of a lane picks the source lane; every later
	    // one has to agree with it.
	    int &SrcLane = SrcLaneOf[i / EltsPerLane];
	    if (SrcLane < 0)
	      SrcLane = M / EltsPerLane;
	    if (SrcLane != M / EltsPerLane)
	      return SDValue();

	    // Same idea for the element position inside the lane.
	    int &InLaneElt = SharedInLaneMask[i % EltsPerLane];
	    if (InLaneElt < 0)
	      InLaneElt = M % EltsPerLane;
	    if (InLaneElt != M % EltsPerLane)
	      return SDValue();
	  }

	  // Step 1: move whole lanes into place. Each 128-bit lane is two 64-bit
	  // elements of the lane-shuffle type.
	  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
	                                VT.getSizeInBits() / 64);
	  SmallVector<int, 8> LaneMask((unsigned)LaneCount * 2, -1);
	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    const int Src = SrcLaneOf[Lane];
	    if (Src < 0)
	      continue;
	    LaneMask[2 * Lane + 0] = 2 * Src + 0;
	    LaneMask[2 * Lane + 1] = 2 * Src + 1;
	  }

	  V1 = DAG.getBitcast(LaneVT, V1);
	  V2 = DAG.getBitcast(LaneVT, V2);
	  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

	  // Back to the requested element type.
	  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

	  // Step 2: a plain in-lane shuffle of the lane-fixed vector.
	  SmallVector<int, 8> NewMask((unsigned)NumElts, -1);
	  for (int i = 0; i != NumElts; ++i) {
	    if (Mask[i] < 0)
	      continue;
	    const int LaneBase = (i / EltsPerLane) * EltsPerLane;
	    NewMask[i] = LaneBase + Mask[i] % EltsPerLane;
	  }
	  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
	         "Must not introduce lane crosses at this point!");

	  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
	}

	/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
	/// This allows for fast cases such as subvector extraction/insertion
	/// or shuffling smaller vector types which can lower more efficiently.
	///
	/// Returns an empty SDValue when neither half of \p Mask is entirely undef,
	/// when the defined half references more than two of the four input halves,
	/// or when one of the heuristics below decides that extracting halves is not
	/// worthwhile for the given type and subtarget.
	static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert((VT.is256BitVector() \|\| VT.is512BitVector()) &&
	"Expected 256-bit or 512-bit vector");

	unsigned NumElts = VT.getVectorNumElements();
	unsigned HalfNumElts = NumElts / 2;
	MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

	// Nothing to do here unless one half of the result is completely undef.
	bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
	bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
	if (!UndefLower && !UndefUpper)
	return SDValue();

	// Upper half is undef and lower half is whole upper subvector.
	// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
	if (UndefUpper &&
	isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	DAG.getIntPtrConstant(HalfNumElts, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
	DAG.getIntPtrConstant(0, DL));
	}

	// Lower half is undef and upper half is whole lower subvector.
	// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
	if (UndefLower &&
	isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
	// NOTE: despite its name, Hi holds the lower half of V1 here (it is
	// extracted at index 0) and is inserted into the upper half of the result.
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
	DAG.getIntPtrConstant(HalfNumElts, DL));
	}

	// If the shuffle only uses two of the four halves of the input operands,
	// then extract them and perform the 'half' shuffle at half width.
	// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
	int HalfIdx1 = -1, HalfIdx2 = -1;
	SmallVector<int, 8> HalfMask(HalfNumElts);
	// Offset is where the defined half of the mask starts; it is also the
	// insertion position of the half-width result at the end.
	unsigned Offset = UndefLower ? HalfNumElts : 0;
	for (unsigned i = 0; i != HalfNumElts; ++i) {
	int M = Mask[i + Offset];
	if (M < 0) {
	// Negative (sentinel) entries are copied through unchanged.
	HalfMask[i] = M;
	continue;
	}

	// Determine which of the 4 half vectors this element is from.
	// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
	int HalfIdx = M / HalfNumElts;

	// Determine the element index into its half vector source.
	int HalfElt = M % HalfNumElts;

	// We can shuffle with up to 2 half vectors, set the new 'half'
	// shuffle mask accordingly. Elements of the first half vector keep their
	// index; elements of the second are offset by HalfNumElts.
	if (HalfIdx1 < 0 \|\| HalfIdx1 == HalfIdx) {
	HalfMask[i] = HalfElt;
	HalfIdx1 = HalfIdx;
	continue;
	}
	if (HalfIdx2 < 0 \|\| HalfIdx2 == HalfIdx) {
	HalfMask[i] = HalfElt + HalfNumElts;
	HalfIdx2 = HalfIdx;
	continue;
	}

	// Too many half vectors referenced.
	return SDValue();
	}
	assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

	// Only shuffle the halves of the inputs when useful.
	// Even half indices (0, 2) are lower halves, odd ones (1, 3) upper halves.
	int NumLowerHalves =
	(HalfIdx1 == 0 \|\| HalfIdx1 == 2) + (HalfIdx2 == 0 \|\| HalfIdx2 == 2);
	int NumUpperHalves =
	(HalfIdx1 == 1 \|\| HalfIdx1 == 3) + (HalfIdx2 == 1 \|\| HalfIdx2 == 3);

	// uuuuXXXX - don't extract uppers just to insert again.
	if (UndefLower && NumUpperHalves != 0)
	return SDValue();

	// XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
	if (UndefUpper && NumUpperHalves == 2)
	return SDValue();

	// AVX2 - XXXXuuuu - always extract lowers.
	if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
	// AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
	if (VT == MVT::v4f64 \|\| VT == MVT::v4i64)
	return SDValue();
	// AVX2 supports variable 32-bit element cross-lane shuffles.
	if (VT == MVT::v8f32 \|\| VT == MVT::v8i32) {
	// XXXXuuuu - don't extract lowers and uppers.
	if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
	return SDValue();
	}
	}

	// AVX512 - XXXXuuuu - always extract lowers.
	if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
	return SDValue();

	// Materialize one of the four input halves by index (0-1 from V1, 2-3 from
	// V2); a negative index means the slot is unused and yields undef.
	auto GetHalfVector = [&](int HalfIdx) {
	if (HalfIdx < 0)
	return DAG.getUNDEF(HalfVT);
	SDValue V = (HalfIdx < 2 ? V1 : V2);
	HalfIdx = (HalfIdx % 2) * HalfNumElts;
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
	DAG.getIntPtrConstant(HalfIdx, DL));
	};

	// Shuffle at half width, then insert the result into the defined half of an
	// otherwise undef full-width vector.
	SDValue Half1 = GetHalfVector(HalfIdx1);
	SDValue Half2 = GetHalfVector(HalfIdx2);
	SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
	DAG.getIntPtrConstant(Offset, DL));
	}

	/// \brief Check whether every element taken from one shuffle input already
	/// sits in the slot the mask asks for.
	///
	/// \p Input selects the operand (0 or 1). The result is true when no mask
	/// entry that reads from that operand moves its element to a different
	/// position, i.e. the operand needs no permutation before blending.
	static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
	  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
	  const int NumElts = Mask.size();
	  for (int Slot = 0; Slot != NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    // Skip undef entries and entries that read from the other operand.
	    if (M < 0 || M / NumElts != Input)
	      continue;
	    // An element of this operand that lands in a different slot.
	    if (M % NumElts != Slot)
	      return false;
	  }
	  return true;
	}

	/// Handle case where shuffle sources are coming from the same 128-bit lane and
	/// every lane can be represented as the same repeating mask - allowing us to
	/// shuffle the sources with the repeating shuffle and then permute the result
	/// to the destination lanes.
	///
	/// On AVX2 this first tries a cheaper special case: shuffle the lowest
	/// elements into place and then broadcast them across the vector.
	///
	/// Returns an empty SDValue when the mask does not cross 128-bit lanes, when
	/// it is already a repeated lane mask, when a destination sub-lane reads from
	/// more than one source lane, or when no shared repeating sub-lane mask can
	/// be formed.
	static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	int NumElts = VT.getVectorNumElements();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;

	// On AVX2 we may be able to just shuffle the lowest elements and then
	// broadcast the result.
	if (Subtarget.hasAVX2()) {
	// Try broadcast widths of 16, 32 and 64 bits; widths that are not larger
	// than the scalar element size are skipped.
	for (unsigned BroadcastSize : {16, 32, 64}) {
	if (BroadcastSize <= VT.getScalarSizeInBits())
	continue;
	int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

	// Attempt to match a repeating pattern every NumBroadcastElts,
	// accounting for UNDEFs but only references the lowest 128-bit
	// lane of the inputs.
	auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j) {
	int M = Mask[i + j];
	if (M < 0)
	continue;
	int &R = RepeatMask[j];
	// Reject elements that come from anything but the lowest lane of
	// their input.
	if (0 != ((M % NumElts) / NumLaneElts))
	return false;
	// Reject a conflict with the entry already recorded for slot j.
	if (0 <= R && R != M)
	return false;
	R = M;
	}
	return true;
	};

	SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
	if (!FindRepeatingBroadcastMask(RepeatMask))
	continue;

	// Shuffle the (lowest) repeated elements in place for broadcast.
	SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

	// Shuffle the actual broadcast: every group of NumBroadcastElts elements
	// repeats elements 0..NumBroadcastElts-1 of RepeatShuf.
	SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j)
	BroadcastMask[i + j] = j;
	return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
	BroadcastMask);
	}
	}

	// Bail if the shuffle mask doesn't cross 128-bit lanes.
	if (!is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Bail if we already have a repeated lane shuffle mask.
	SmallVector<int, 8> RepeatedShuffleMask;
	if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
	return SDValue();

	// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
	// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
	int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
	int NumSubLanes = NumLanes * SubLaneScale;
	int NumSubLaneElts = NumLaneElts / SubLaneScale;

	// Check that all the sources are coming from the same lane and see if we can
	// form a repeating shuffle mask (local to each sub-lane). At the same time,
	// determine the source sub-lane for each destination sub-lane.
	int TopSrcSubLane = -1;
	SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
	// One candidate repeating mask per sub-lane position within a lane; only the
	// first SubLaneScale entries are ever consulted.
	SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

	for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
	// Extract the sub-lane mask, check that it all comes from the same lane
	// and normalize the mask entries to come from the first lane.
	int SrcLane = -1;
	SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
	if (M < 0)
	continue;
	int Lane = (M % NumElts) / NumLaneElts;
	if ((0 <= SrcLane) && (SrcLane != Lane))
	return SDValue();
	SrcLane = Lane;
	// Keep the in-lane position and which input the element comes from
	// (second-input elements stay offset by NumElts), dropping the lane.
	int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
	SubLaneMask[Elt] = LocalM;
	}

	// Whole sub-lane is UNDEF.
	if (SrcLane < 0)
	continue;

	// Attempt to match against the candidate repeated sub-lane masks.
	for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
	// Two masks match when they agree wherever both are defined.
	auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
	for (int i = 0; i != NumSubLaneElts; ++i) {
	if (M1[i] < 0 \|\| M2[i] < 0)
	continue;
	if (M1[i] != M2[i])
	return false;
	}
	return true;
	};

	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
	if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
	continue;

	// Merge the sub-lane mask into the matching repeated sub-lane mask.
	for (int i = 0; i != NumSubLaneElts; ++i) {
	int M = SubLaneMask[i];
	if (M < 0)
	continue;
	assert((RepeatedSubLaneMask[i] < 0 \|\| RepeatedSubLaneMask[i] == M) &&
	"Unexpected mask element");
	RepeatedSubLaneMask[i] = M;
	}

	// Track the top most source sub-lane - by setting the remaining to UNDEF
	// we can greatly simplify shuffle matching.
	int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
	TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
	Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
	break;
	}

	// Bail if we failed to find a matching repeated sub-lane mask.
	if (Dst2SrcSubLanes[DstSubLane] < 0)
	return SDValue();
	}
	assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
	"Unexpected source lane");

	// Create a repeating shuffle mask for the entire vector. Sub-lanes above
	// TopSrcSubLane are left undef.
	SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
	for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
	int Lane = SubLane / SubLaneScale;
	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = RepeatedSubLaneMask[Elt];
	if (M < 0)
	continue;
	int Idx = (SubLane * NumSubLaneElts) + Elt;
	// Re-apply the lane offset that was stripped when normalizing.
	RepeatedMask[Idx] = M + (Lane * NumLaneElts);
	}
	}
	SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

	// Shuffle each source sub-lane to its destination.
	SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumSubLaneElts) {
	int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
	if (SrcSubLane < 0)
	continue;
	for (int j = 0; j != NumSubLaneElts; ++j)
	SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
	}

	return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
	SubLaneMask);
	}

	/// Try to match \p Mask against the element selection pattern of (V)SHUFPD.
	///
	/// Result element i must come from the 64-bit pair starting at (i & ~1):
	/// even result elements from the first operand and odd ones from the second.
	/// The operand-swapped form is accepted too, in which case \p V1 and \p V2
	/// are exchanged. On success \p ShuffleImm has bit i set when element i
	/// selects the odd element of its pair.
	///
	/// Returns false if the mask fits neither form, or contains a negative entry
	/// other than SM_SentinelUndef.
	static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
	                                         unsigned &ShuffleImm,
	                                         ArrayRef<int> Mask) {
	  int NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarSizeInBits() == 64 &&
	         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
	         "Unexpected data type for VSHUFPD");

	  // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
	  // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
	  ShuffleImm = 0;
	  bool FitsAsIs = true;
	  bool FitsSwapped = true;
	  for (int i = 0; i != NumElts; ++i) {
	    const int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    if (M < 0)
	      return false;

	    // Lowest index allowed for slot i, in the direct and swapped forms.
	    const int OddSlot = i & 1;
	    const int Val = (i & 6) + NumElts * OddSlot;
	    const int CommutVal = (i & 0xe) + NumElts * (OddSlot ^ 1);
	    if (!(Val <= M && M <= Val + 1))
	      FitsAsIs = false;
	    if (!(CommutVal <= M && M <= CommutVal + 1))
	      FitsSwapped = false;
	    ShuffleImm |= (M % 2) << i;
	  }

	  if (!FitsAsIs && !FitsSwapped)
	    return false;

	  // The direct form wins when both fit; only swap if it is the sole match.
	  if (!FitsAsIs)
	    std::swap(V1, V2);
	  return true;
	}

	/// Lower a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when the
	/// mask matches the SHUFPD pattern (possibly with the operands swapped).
	/// Returns an empty SDValue otherwise.
	static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
	                                            ArrayRef<int> Mask, SDValue V1,
	                                            SDValue V2, SelectionDAG &DAG) {
	  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
	         "Unexpected data type for VSHUFPD");

	  // The matcher may swap V1 and V2 (they are local copies here) and fills in
	  // the immediate.
	  unsigned Immediate = 0;
	  if (matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	                       DAG.getConstant(Immediate, DL, MVT::i8));

	  return SDValue();
	}

	/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The strategies below are tried in order and the first one that produces a
	/// node wins. Single-input shuffles (\p V2 undef) are fully handled by the
	/// first half of the routine; two-input shuffles fall through to the second
	/// half, which always returns via one of the generic fallbacks.
	static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// First see whether the shuffle only moves whole 128-bit halves.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation. Bit i of the immediate is set when result
	// element i takes the odd element of its own 128-bit lane (index 1 for
	// the low lane, index 3 for the high lane).
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
	DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	}

	// With AVX2 we have direct support for this permutation.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Try to create an in-lane repeating shuffle mask and then shuffle
	// the results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
	DAG, Subtarget);
	}

	// From here on the shuffle has two inputs.

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op =
	lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return Op;

	// Try to create an in-lane repeating shuffle mask and then shuffle
	// the results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either inputs are already in place,
	// we will be able to shuffle even across lanes the other input in a single
	// instruction so skip this pattern.
	if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) \|\|
	isShuffleMaskInputInPlace(1, Mask))))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v4 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v4i64 shuffling.
	///
	/// The strategies below are tried in order and the first one that produces a
	/// node wins. Single-input shuffles (\p V2 undef) always return from the
	/// V2.isUndef() block; two-input shuffles end with the decomposed
	/// shuffle + blend fallback.
	static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

	// First see whether the shuffle only moves whole 128-bit halves.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (V2.isUndef()) {
	// When the shuffle is mirrored between the 128-bit lanes of the unit, we
	// can use lower latency instructions that will operate on both lanes.
	SmallVector<int, 2> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
	// Express the 2-element 64-bit lane mask as a 4-element 32-bit mask so
	// that it can be used as a PSHUFD immediate on the v8i32 view of V1.
	SmallVector<int, 4> PSHUFDMask;
	scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
	return DAG.getBitcast(
	MVT::v4i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
	DAG.getBitcast(MVT::v8i32, V1),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}

	// AVX2 provides a direct instruction for permuting a single input across
	// lanes.
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// From here on the shuffle has two inputs.

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or VEXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use PALIGNR.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either inputs are already in place,
	// we will be able to shuffle even across lanes the other input in a single
	// instruction so skip this pattern. (AVX2 is asserted on entry, so only the
	// in-place checks are needed here.)
	if (!isShuffleMaskInputInPlace(0, Mask) &&
	!isShuffleMaskInputInPlace(1, Mask))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
	Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The strategies below are tried in order and the first one that produces a
	/// node wins. Masks that repeat in both 128-bit lanes, and single-input
	/// shuffles, are fully handled by their own blocks; everything else ends in
	/// one of the generic fallbacks.
	static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane, we have many more
	// options to efficiently lower the shuffle.
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
	assert(RepeatedMask.size() == 4 &&
	"Repeated masks must be half the mask width!");

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

	// A single-input repeated mask maps directly onto the immediate form of
	// VPERMILPS.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
	// have already handled any direct blends.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
	}

	// Try to create an in-lane repeating shuffle mask and then shuffle
	// the results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have a single input shuffle with different shuffle patterns in the
	// two 128-bit lanes use the variable mask to VPERMILPS.
	if (V2.isUndef()) {
	// The mask is materialized as a v8i32 constant vector operand.
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
	return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

	// Lane-crossing single-input shuffle: with AVX2 use the variable permute
	// (note the mask is the first operand of VPERMV).
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
	DAG, Subtarget);
	}

	// From here on the shuffle has two inputs.

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code using vpunpcklwd and
	// vpunpckhwd instrs than vblend.
	if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
	if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v8 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
	}

	/// \brief Lower a shuffle of two v8i32 vectors.
	///
	/// Callers must only reach this routine when AVX2 is available (asserted
	/// below). The strategies are tried one after another and the first one
	/// that produces a node wins.
	static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v8i32;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

	  // A zero/any-extend is strictly faster than any alternative and frequently
	  // lets a memory operand be folded into the shuffle, so it goes first.
	  if (SDValue Ext = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Ext;

	  // Without AVX512, a two-input mask made of in-lane 16-bit element unpacks
	  // is better split: vpunpcklwd/vpunpckhwd give more efficient code than
	  // vblend.
	  if (isUnpackWdShuffleMask(Mask, VT) && !V2.isUndef() &&
	      !Subtarget.hasAVX512()) {
	    if (SDValue Split =
	            lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG))
	      return Split;
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  // A single-element broadcast, if the mask allows it.
	  if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG))
	    return Splat;

	  // When both 128-bit lanes use the same pattern, instructions that mirror
	  // the shuffle across the two lanes become available.
	  SmallVector<int, 4> LaneMask;
	  const bool IsLaneRepeated =
	      is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask);
	  if (IsLaneRepeated) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");
	    if (V2.isUndef())
	      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

	    // Dedicated unpack instructions, for masks matching their pattern.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // VLX makes VALIGN and EXPAND usable.
	  if (Subtarget.hasVLX()) {
	    if (SDValue Aligned = lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask,
	                                                     Subtarget, DAG))
	      return Aligned;

	    if (SDValue Expanded = lowerVectorShuffleToEXPAND(
	            DL, VT, Zeroable, Mask, V1, V2, DAG, Subtarget))
	      return Expanded;
	  }

	  // Byte rotation instructions.
	  if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  // An in-lane repeating shuffle followed by a permute of the results into
	  // their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Permuted;

	  // A single-input shuffle whose pattern is not lane-repeated maps directly
	  // onto a cross-lane VPERMD.
	  if (V2.isUndef()) {
	    SDValue Control = getConstVector(Mask, VT, DAG, DL, true);
	    return DAG.getNode(X86ISD::VPERMV, DL, VT, Control, V1);
	  }

	  // Assume one SHUFPS beats a multi-instruction alternative even if the CPU
	  // has a domain-crossing penalty; a later pass can fix that up if needed.
	  if (IsLaneRepeated && isSingleSHUFPSMask(LaneMask)) {
	    const MVT FloatVT = MVT::v8f32;
	    SDValue FloatV1 = DAG.getBitcast(FloatVT, V1);
	    SDValue FloatV2 = DAG.getBitcast(FloatVT, V2);
	    SDValue FloatShuffle = lowerVectorShuffleWithSHUFPS(
	        DL, FloatVT, LaneMask, FloatV1, FloatV2, DAG);
	    return DAG.getBitcast(VT, FloatShuffle);
	  }

	  // Merging 128-bit lanes may enable a lane-based shuffle.
	  if (SDValue Merged = lowerVectorShuffleByMerging128BitLanes(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Merged;

	  // Last resort: generic decomposed shuffle + blend.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a shuffle of two v16i16 vectors.
	///
	/// Callers must only reach this routine when AVX2 is available (asserted
	/// below). The strategies are tried one after another and the first one
	/// that produces a node wins.
	static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  const MVT VT = MVT::v16i16;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

	  // A zero/any-extend is strictly faster than any alternative and frequently
	  // lets a memory operand be folded into the shuffle, so it goes first.
	  if (SDValue Ext = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Ext;

	  // A single-element broadcast, if the mask allows it.
	  if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG))
	    return Splat;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  // Dedicated unpack instructions, for masks matching their pattern.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Dedicated pack instructions, for masks matching their pattern.
	  if (SDValue Packed =
	          lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	    return Packed;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  // An in-lane repeating shuffle followed by a permute of the results into
	  // their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Permuted;

	  if (V2.isUndef()) {
	    // i16 element types have no generalized cross-lane shuffle operation.
	    if (is128BitLaneCrossingShuffleMask(VT, Mask))
	      return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG,
	                                                     Subtarget);

	    SmallVector<int, 8> LaneMask;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask)) {
	      // With a single input, the repeated mask should be a strictly valid
	      // v8i16 mask, so the v8i16 lowering can handle the v16 case as well.
	      return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, LaneMask,
	                                                       Subtarget, DAG);
	    }
	  }

	  if (SDValue ByteShuffle = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffle;

	  // With AVX512BW + VLX, VPERMW is available.
	  if (Subtarget.hasBWI() && Subtarget.hasVLX())
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

	  // Merging 128-bit lanes may enable a lane-based shuffle.
	  if (SDValue Merged = lowerVectorShuffleByMerging128BitLanes(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Merged;

	  // Last resort: generic lowering.
	  return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a shuffle of two v32i8 vectors.
	///
	/// Callers must only reach this routine when AVX2 is available (asserted
	/// below). The strategies are tried one after another and the first one
	/// that produces a node wins.
	static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v32i8;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

	  // A zero/any-extend is strictly faster than any alternative and frequently
	  // lets a memory operand be folded into the shuffle, so it goes first.
	  if (SDValue Ext = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Ext;

	  // A single-element broadcast, if the mask allows it.
	  if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG))
	    return Splat;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  // Dedicated unpack instructions, for masks matching their pattern.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Dedicated pack instructions, for masks matching their pattern.
	  if (SDValue Packed =
	          lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	    return Packed;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  // An in-lane repeating shuffle followed by a permute of the results into
	  // their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Permuted;

	  // i8 element types have no generalized cross-lane shuffle operation.
	  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(VT, Mask)) {
	    return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG,
	                                                   Subtarget);
	  }

	  if (SDValue ByteShuffle = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffle;

	  // With AVX512VBMI + VLX, VPERMB is available.
	  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

	  // Merging 128-bit lanes may enable a lane-based shuffle.
	  if (SDValue Merged = lowerVectorShuffleByMerging128BitLanes(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Merged;

	  // Last resort: generic lowering.
	  return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Entry point for lowering 256-bit x86 vector shuffles.
	///
	/// After a few type-independent attempts (single-element insertion, undef
	/// half), integer shuffles on subtargets without AVX2 are either decomposed
	/// into 128-bit pieces or redone in the floating point domain. Everything
	/// else is dispatched to the per-type lowering routine for \p VT.
	static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // If exactly one mask entry reads from V2 and it lands in element zero,
	  // try to insert that element into V1 cheaply.
	  int EltCount = VT.getVectorNumElements();
	  int V2RefCount =
	      count_if(Mask, [EltCount](int Idx) { return Idx >= EltCount; });

	  if (V2RefCount == 1 && Mask[0] >= EltCount) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Special cases in which the lower or upper half is UNDEF.
	  if (SDValue HalfUndef =
	          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return HalfUndef;

	  // AVX1 versus AVX2 is a hard cut-over, so testing for it here spares the
	  // per-type routines most of their subtarget queries. AVX1 has essentially
	  // no ability to manipulate 256-bit integer vectors; since floating point
	  // types would be used eventually anyway, cast to them immediately and stay
	  // in that domain.
	  if (VT.isInteger() && !Subtarget.hasAVX2()) {
	    int ScalarBits = VT.getScalarSizeInBits();
	    if (ScalarBits < 32) {
	      // There is no floating point type of this width. Unless bit operations
	      // can do the masking/blending, decompose into 128-bit vectors.
	      if (SDValue Masked =
	              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	        return Masked;
	      if (SDValue BitBlended =
	              lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	        return BitBlended;
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	    }

	    MVT FloatVT = MVT::getVectorVT(MVT::getFloatingPointVT(ScalarBits),
	                                   VT.getVectorNumElements());
	    V1 = DAG.getBitcast(FloatVT, V1);
	    V2 = DAG.getBitcast(FloatVT, V2);
	    SDValue FloatShuffle = DAG.getVectorShuffle(FloatVT, DL, V1, V2, Mask);
	    return DAG.getBitcast(VT, FloatShuffle);
	  }

	  switch (VT.SimpleTy) {
	  case MVT::v4f64:
	    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4i64:
	    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8f32:
	    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i32:
	    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i16:
	    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i8:
	    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 256-bit x86 vector type!");
	  }
	}

	/// \brief Try to lower a 512-bit vector shuffle of 64-bit elements as a
	/// shuffle of whole 128-bit lanes.
	///
	/// Three forms are recognized, in this order:
	///  - the low 256 bits of V1 concatenated with the low 256 bits of V1 or V2,
	///  - an insertion of the lowest 128 bits of V2 into an otherwise in-place V1,
	///  - a SHUF128 node (vshuf64x2/vshuf32x4) with an immediate control byte.
	///
	/// \returns The lowered node, or an empty SDValue when \p Mask cannot be
	/// widened to 128-bit granularity or when one 256-bit half of the result
	/// would need lanes from both inputs.
	static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
	                                        ArrayRef<int> Mask, SDValue V1,
	                                        SDValue V2, SelectionDAG &DAG) {
	  assert(VT.getScalarSizeInBits() == 64 &&
	         "Unexpected element type size for 128bit shuffle.");

	  // To handle 256 bit vector requires VLX and most probably
	  // function lowerV2X128VectorShuffle() is better solution.
	  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

	  SmallVector<int, 4> WidenedMask;
	  if (!canWidenShuffleElements(Mask, WidenedMask))
	    return SDValue();

	  // Check for patterns which can be matched with a single insert of a 256-bit
	  // subvector.
	  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
	                                        {0, 1, 2, 3, 0, 1, 2, 3});
	  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
	                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
	    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	                              DAG.getIntPtrConstant(0, DL));
	    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	                              OnlyUsesV1 ? V1 : V2,
	                              DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	  }

	  assert(WidenedMask.size() == 4);

	  // See if this is an insertion of the lower 128-bits of V2 into V1.
	  bool IsInsert = true;
	  int V2Index = -1;
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    // Make sure all V1 subvectors are in place.
	    if (WidenedMask[i] < 4) {
	      if (WidenedMask[i] != i) {
	        IsInsert = false;
	        break;
	      }
	    } else {
	      // Make sure we only have a single V2 index and it's the lowest 128-bits.
	      if (V2Index >= 0 || WidenedMask[i] != 4) {
	        IsInsert = false;
	        break;
	      }
	      V2Index = i;
	    }
	  }
	  if (IsInsert && V2Index >= 0) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
	                                 DAG.getIntPtrConstant(0, DL));
	    // V2Index counts 128-bit lanes; each lane holds two 64-bit elements.
	    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
	  }

	  // Try to lower to vshuf64x2/vshuf32x4.
	  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
	  unsigned PermMask = 0;
	  // Ensure that each half of the result takes its lanes from a single Op:
	  // result lanes 0-1 are drawn from Ops[0] and lanes 2-3 from Ops[1].
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
	    unsigned OpIndex = i / 2;
	    if (Ops[OpIndex].isUndef())
	      Ops[OpIndex] = Op;
	    else if (Ops[OpIndex] != Op)
	      return SDValue();

	    // Convert the 128-bit shuffle mask selection values into 128-bit selection
	    // bits defined by a vshuf64x2 instruction's immediate control byte.
	    PermMask |= (WidenedMask[i] % 4) << (i * 2);
	  }

	  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
	                     DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
	///
	/// Single-input shuffles are tried first as MOVDDUP, an in-lane VPERMILPI or
	/// a 256-bit-repeated VPERMI. Otherwise the 128-bit lane shuffle, UNPCK,
	/// SHUFPD, EXPAND and blend helpers are attempted in that order, with a
	/// PERMV-based lowering as the final fallback.
	///
	/// \param Mask Shuffle mask; must have exactly 8 entries.
	/// \param V1   First input, of type v8f64.
	/// \param V2   Second input, of type v8f64; may be undef.
	static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  if (V2.isUndef()) {
	    // Use low duplicate instructions for masks that match their pattern.
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

	    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
	      // Non-half-crossing single input shuffles can be lowered with an
	      // interleaved permutation. Bit i of the immediate is set when result
	      // element i takes the odd-indexed element of its own 128-bit pair.
	      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
	                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
	                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
	                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
	                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	    }

	    SmallVector<int, 4> RepeatedMask;
	    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
	      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	  }

	  if (SDValue Shuf128 =
	          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Shuf128;

	  if (SDValue Unpck =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Unpck;

	  // Check if the blend happens to exactly fit that of SHUFPD.
	  if (SDValue Op =
	          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Op;

	  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
	                                             V2, DAG, Subtarget))
	    return V;

	  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
	                                                Zeroable, Subtarget, DAG))
	    return Blend;

	  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of two v16f32 vectors.
	///
	/// Masks that repeat in every 128-bit lane are handled entirely inside the
	/// first branch; other masks try a variable VPERMILPS, then EXPAND, and
	/// finally fall back to a PERMV-based lowering.
	static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  const MVT VT = MVT::v16f32;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  // A mask that repeats in each 128-bit lane opens up many more efficient
	  // lowering options.
	  SmallVector<int, 4> LaneMask;
	  if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask)) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");

	    // Even/odd duplicate instructions, for masks matching their pattern.
	    if (isShuffleEquivalent(V1, V2, LaneMask, {0, 0, 2, 2}))
	      return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
	    if (isShuffleEquivalent(V1, V2, LaneMask, {1, 1, 3, 3}))
	      return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);

	    if (V2.isUndef()) {
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));
	    }

	    // Dedicated unpack instructions, for masks matching their pattern.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpacked;

	    if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                    Zeroable, Subtarget, DAG))
	      return Blended;

	    // Nothing better matched: emit a SHUFPS sequence.
	    return lowerVectorShuffleWithSHUFPS(DL, VT, LaneMask, V1, V2, DAG);
	  }

	  // A single-input shuffle whose per-lane patterns differ but which never
	  // crosses a 128-bit lane can use the variable-mask VPERMILPS.
	  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(VT, Mask)) {
	    SDValue Control = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
	    return DAG.getNode(X86ISD::VPERMILPV, DL, VT, V1, Control);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget))
	    return Expanded;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of two v8i64 vectors.
	///
	/// Single-input shuffles that repeat per 128-bit or per 256-bit lane are
	/// handled first; after that the strategies are tried one after another
	/// and the first one that produces a node wins, with a PERMV-based
	/// lowering as the final fallback.
	static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v8i64;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  if (V2.isUndef()) {
	    // A shuffle mirrored between the 128-bit lanes can use lower latency
	    // instructions that operate on all four 128-bit lanes at once.
	    SmallVector<int, 2> Lane128Mask;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, Lane128Mask)) {
	      // Rescale the two-entry 64-bit mask to four 32-bit entries so that it
	      // can be encoded as a PSHUFD immediate on the v16i32 view of V1.
	      SmallVector<int, 4> DWordMask;
	      scaleShuffleMask<int>(2, Lane128Mask, DWordMask);
	      SDValue DWordShuffle =
	          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
	                      DAG.getBitcast(MVT::v16i32, V1),
	                      getV4X86ShuffleImm8ForMask(DWordMask, DL, DAG));
	      return DAG.getBitcast(VT, DWordShuffle);
	    }

	    SmallVector<int, 4> Lane256Mask;
	    if (is256BitLaneRepeatedShuffleMask(VT, Mask, Lane256Mask)) {
	      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(Lane256Mask, DL, DAG));
	    }
	  }

	  if (SDValue LaneShuffle = lowerV4X128VectorShuffle(DL, VT, Mask, V1, V2, DAG))
	    return LaneShuffle;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // VALIGN.
	  if (SDValue Aligned = lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask,
	                                                   Subtarget, DAG))
	    return Aligned;

	  // PALIGNR.
	  if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget))
	    return Expanded;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of two v16i32 vectors.
	///
	/// The strategies are tried one after another and the first one that
	/// produces a node wins, with a PERMV-based lowering as the final fallback.
	static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  const MVT VT = MVT::v16i32;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  // A zero/any-extend is strictly faster than any alternative and frequently
	  // lets a memory operand be folded into the shuffle, so it goes first.
	  if (SDValue Ext = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Ext;

	  // When all four 128-bit lanes use the same pattern, instructions that
	  // mirror the shuffle across the lanes become available.
	  SmallVector<int, 4> LaneMask;
	  const bool IsLaneRepeated =
	      is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask);
	  if (IsLaneRepeated) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");
	    if (V2.isUndef())
	      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

	    // Dedicated unpack instructions, for masks matching their pattern.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // VALIGN.
	  if (SDValue Aligned = lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask,
	                                                   Subtarget, DAG))
	    return Aligned;

	  // Byte rotation instructions, attempted only when BWI is present.
	  if (Subtarget.hasBWI()) {
	    if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	            DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return ByteRotated;
	  }

	  // Assume one SHUFPS beats a permv shuffle; if the domain switch hurts some
	  // CPU, a later pass can fix that up.
	  if (IsLaneRepeated && isSingleSHUFPSMask(LaneMask)) {
	    const MVT FloatVT = MVT::v16f32;
	    SDValue FloatV1 = DAG.getBitcast(FloatVT, V1);
	    SDValue FloatV2 = DAG.getBitcast(FloatVT, V2);
	    SDValue FloatShuffle = lowerVectorShuffleWithSHUFPS(
	        DL, FloatVT, LaneMask, FloatV1, FloatV2, DAG);
	    return DAG.getBitcast(VT, FloatShuffle);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget))
	    return Expanded;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of two v32i16 vectors.
	///
	/// Callers must only reach this routine when AVX-512 BWI is available
	/// (asserted below). The strategies are tried one after another and the
	/// first one that produces a node wins, with a PERMV-based lowering as the
	/// final fallback.
	static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  const MVT VT = MVT::v32i16;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

	  // A zero/any-extend is strictly faster than any alternative and frequently
	  // lets a memory operand be folded into the shuffle, so it goes first.
	  if (SDValue Ext = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Ext;

	  // Dedicated unpack instructions, for masks matching their pattern.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  if (V2.isUndef()) {
	    SmallVector<int, 8> LaneMask;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask)) {
	      // With a single input, the repeated mask should be a strictly valid
	      // v8i16 mask, so the v8i16 lowering can handle the v32 case as well.
	      return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, LaneMask,
	                                                       Subtarget, DAG);
	    }
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  if (SDValue ByteShuffle = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffle;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of two v64i8 vectors.
	///
	/// Callers must only reach this routine when AVX-512 BWI is available
	/// (asserted below). The strategies are tried one after another and the
	/// first one that produces a node wins; if none applies the shuffle is
	/// split and each part is lowered separately.
	static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v64i8;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

	  // A zero/any-extend is strictly faster than any alternative and frequently
	  // lets a memory operand be folded into the shuffle, so it goes first.
	  if (SDValue Ext = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Ext;

	  // Dedicated unpack instructions, for masks matching their pattern.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  if (SDValue ByteShuffle = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffle;

	  // VBMI can use VPERMV/VPERMV3 byte shuffles.
	  if (Subtarget.hasVBMI())
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

	  // An in-lane repeating shuffle followed by a permute of the results into
	  // their target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Permuted;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  // FIXME: Implement direct support for this type!
	  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Entry point for lowering any 512-bit x86 vector shuffle.
	///
	/// A handful of element-type-independent strategies are attempted first.
	/// If none applies, the shuffle is dispatched to the routine dedicated to
	/// its vector type.
	static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/ basic ISA!");

	  // Count the mask entries that select from V2 (indices >= NumElts).
	  const int NumElts = Mask.size();
	  int V2RefCount = 0;
	  for (int M : Mask)
	    if (M >= NumElts)
	      ++V2RefCount;

	  // If V2 supplies exactly one element and it lands in element zero, try to
	  // insert it into V1 when that can be done cheaply.
	  if (V2RefCount == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Special cases where the lower or upper half of the result is UNDEF.
	  if (SDValue HalfUndef =
	          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return HalfUndef;

	  // A shuffle that merely broadcasts a single element.
	  if (SDValue Splat =
	          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Splat;

	  // Hand off to the per-type routine. Each of those routines may assume that
	  // the ISA extensions required for its element type are available.
	  switch (VT.SimpleTy) {
	  case MVT::v8f64:
	    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16f32:
	    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i64:
	    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i32:
	    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i16:
	    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v64i8:
	    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 512-bit x86 vector type!");
	  }
	}

	// Lower vXi1 vector shuffles.
	// AVX-512 has no dedicated instruction that shuffles mask registers.
	// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
	// vector, shuffle that, and then convert the result back to a mask.
	//
	// VT is the vXi1 type being shuffled; V1/V2 are the shuffle operands and
	// Mask the shuffle mask. Returns a node of type VT.
	static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                      MVT VT, SDValue V1, SDValue V2,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/o basic ISA!");

	  // Pick the SIMD vector type with the same element count in which the
	  // shuffle is actually performed.
	  MVT ExtVT;
	  switch (VT.SimpleTy) {
	  default:
	    llvm_unreachable("Expected a vector of i1 elements");
	  case MVT::v2i1:
	    ExtVT = MVT::v2i64;
	    break;
	  case MVT::v4i1:
	    ExtVT = MVT::v4i32;
	    break;
	  case MVT::v8i1:
	    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
	    // shuffle.
	    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
	    break;
	  case MVT::v16i1:
	    ExtVT = MVT::v16i32;
	    break;
	  case MVT::v32i1:
	    ExtVT = MVT::v32i16;
	    break;
	  case MVT::v64i1:
	    ExtVT = MVT::v64i8;
	    break;
	  }

	  // Widen V1. All-zeros / all-ones build vectors are materialized directly
	  // in the wide type instead of being sign-extended.
	  if (ISD::isBuildVectorAllZeros(V1.getNode()))
	    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
	  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
	    V1 = getOnesVector(ExtVT, DAG, DL);
	  else
	    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

	  // Widen V2 the same way; an undef V2 stays undef in the wide type.
	  if (V2.isUndef())
	    V2 = DAG.getUNDEF(ExtVT);
	  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
	    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
	  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
	    V2 = getOnesVector(ExtVT, DAG, DL);
	  else
	    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

	  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);

	  // Because i1 was sign-extended we can use X86ISD::CVT2MASK when the
	  // subtarget feature matching the element count is present: BWI for 32 or
	  // more elements, DQI for fewer.
	  int NumElems = VT.getVectorNumElements();
	  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
	      (Subtarget.hasDQI() && (NumElems < 32)))
	    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);

	  // Otherwise fall back to a plain truncate.
	  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
	}

	/// Decide whether the two inputs of a shuffle with the given \p Mask should
	/// be swapped to reach the preferred canonical form. Returns true when the
	/// shuffle should be commuted.
	static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
	  const int NumElements = Mask.size();

	  // Tally how many defined mask entries select from each input.
	  int NumV1Elements = 0, NumV2Elements = 0;
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    if (M < NumElements)
	      ++NumV1Elements;
	    else
	      ++NumV2Elements;
	  }

	  // Commute so that more elements come from V1 than from V2. Matching code
	  // can then key on the V1 element count without handling the mirrored
	  // cases.
	  if (NumV2Elements > NumV1Elements)
	    return true;

	  assert(NumV1Elements > 0 && "No V1 indices");

	  if (NumV2Elements == 0)
	    return false;

	  // Only an exact tie between the inputs goes through the tie-breaks below.
	  if (NumV1Elements != NumV2Elements)
	    return false;

	  // Tie-break 1: prefer fewer uses of V2 in the low half of the vector.
	  int LowV1Elements = 0, LowV2Elements = 0;
	  for (int Pos = 0, Half = NumElements / 2; Pos != Half; ++Pos) {
	    int M = Mask[Pos];
	    if (M >= NumElements)
	      ++LowV2Elements;
	    else if (M >= 0)
	      ++LowV1Elements;
	  }
	  if (LowV2Elements > LowV1Elements)
	    return true;
	  if (LowV2Elements != LowV1Elements)
	    return false;

	  // Gather the data for the remaining two tie-breaks in one pass: the sum of
	  // result positions fed by each input, and how many of those positions are
	  // odd.
	  int SumV1Indices = 0, SumV2Indices = 0;
	  int NumV1OddIndices = 0, NumV2OddIndices = 0;
	  for (int Pos = 0; Pos != NumElements; ++Pos) {
	    int M = Mask[Pos];
	    if (M >= NumElements) {
	      SumV2Indices += Pos;
	      NumV2OddIndices += Pos % 2;
	    } else if (M >= 0) {
	      SumV1Indices += Pos;
	      NumV1OddIndices += Pos % 2;
	    }
	  }

	  // Tie-break 2: the position sum for V1 should not exceed the one for V2.
	  if (SumV2Indices < SumV1Indices)
	    return true;
	  if (SumV2Indices != SumV1Indices)
	    return false;

	  // Tie-break 3: V1 should not occupy more odd positions than V2.
	  return NumV2OddIndices < NumV1OddIndices;
	}

	/// \brief Top-level lowering for x86 vector shuffles.
	///
	/// This handles decomposition, canonicalization, and lowering of all x86
	/// vector shuffles. Most of the specific lowering strategies are encapsulated
	/// above in helper routines. The canonicalization attempts to widen shuffles
	/// to involve fewer lanes of wider elements, consolidate symmetric patterns
	/// s.t. only one of the two inputs needs to be tested, etc.
	///
	/// \p Op must be a shuffle-vector node (it is cast to ShuffleVectorSDNode).
	/// Several of the canonicalization steps return a new, simpler shuffle node
	/// rather than a fully lowered one.
	static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	  ArrayRef<int> Mask = SVOp->getMask();
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  MVT VT = Op.getSimpleValueType();
	  int NumElements = VT.getVectorNumElements();
	  SDLoc DL(Op);
	  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

	  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
	         "Can't lower MMX shuffles");

	  bool V1IsUndef = V1.isUndef();
	  bool V2IsUndef = V2.isUndef();
	  if (V1IsUndef && V2IsUndef)
	    return DAG.getUNDEF(VT);

	  // When we create a shuffle node we put the UNDEF node to second operand,
	  // but in some cases the first operand may be transformed to UNDEF.
	  // In this case we should just commute the node.
	  if (V1IsUndef)
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // Check for non-undef masks pointing at an undef vector and make the masks
	  // undef as well. This makes it easier to match the shuffle based solely on
	  // the mask.
	  auto SelectsFromV2 = [NumElements](int M) { return M >= NumElements; };
	  if (V2IsUndef && llvm::any_of(Mask, SelectsFromV2)) {
	    SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
	    for (int &NewM : NewMask)
	      if (SelectsFromV2(NewM))
	        NewM = -1;
	    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	  }

	  // Check for illegal shuffle mask element index values.
	  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
	  assert(llvm::all_of(Mask,
	                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
	         "Out of bounds shuffle index");

	  // We actually see shuffles that are entirely re-arrangements of a set of
	  // zero inputs. This mostly happens while decomposing complex shuffles into
	  // simple ones. Directly lower these as a buildvector of zeros.
	  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
	  if (Zeroable.isAllOnesValue())
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  // Try to collapse shuffles into using a vector type with fewer elements but
	  // wider element types. We cap this to not form integers or floating point
	  // elements wider than 64 bits, but it might be interesting to form i128
	  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
	  SmallVector<int, 16> WidenedMask;
	  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
	      canWidenShuffleElements(Mask, WidenedMask)) {
	    MVT NewEltVT = VT.isFloatingPoint()
	                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
	                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
	    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
	    // Make sure that the new vector type is legal. For example, v2f64 isn't
	    // legal on SSE1.
	    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
	      V1 = DAG.getBitcast(NewVT, V1);
	      V2 = DAG.getBitcast(NewVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
	    }
	  }

	  // Commute the shuffle if it will improve canonicalization.
	  if (canonicalizeShuffleMaskWithCommute(Mask))
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // For each vector width, delegate to a specialized lowering routine.
	  if (VT.is128BitVector())
	    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (VT.is256BitVector())
	    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (VT.is512BitVector())
	    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (Is1BitVector)
	    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// \brief Turn a VSELECT whose condition is a build vector of constants
	/// into an equivalent blend-style vector shuffle.
	///
	/// Returns an empty SDValue when the condition is not such a build vector.
	static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  SDValue Cond = Op.getOperand(0);
	  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return SDValue();
	  auto *CondBV = cast<BuildVectorSDNode>(Cond);

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue LHS = Op.getOperand(1);
	  SDValue RHS = Op.getOperand(2);

	  // Only non-legal VSELECTs reach this lowering. Express them as generic
	  // shuffles so the shuffle lowering path for blends can be reused.
	  const int NumElts = VT.getVectorNumElements();
	  SmallVector<int, 32> BlendMask;
	  for (int Lane = 0; Lane != NumElts; ++Lane) {
	    SDValue CondElt = CondBV->getOperand(Lane);
	    // A non-constant condition element yields an undef (-1) mask entry. A
	    // zero condition picks the lane from RHS (index Lane + NumElts); any
	    // other constant picks it from LHS.
	    int Sel = -1;
	    if (isa<ConstantSDNode>(CondElt))
	      Sel = isNullConstant(CondElt) ? Lane + NumElts : Lane;
	    BlendMask.push_back(Sel);
	  }
	  return DAG.getVectorShuffle(VT, dl, LHS, RHS, BlendMask);
	}

	SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Cond = Op.getOperand(0);
	  SDValue TrueVal = Op.getOperand(1);
	  SDValue FalseVal = Op.getOperand(2);

	  // When the condition and both data operands are constants, the whole
	  // vselect can be turned into a single vector load by
	  // SelectionDAGLegalize::ExpandBUILD_VECTOR(), so request expansion.
	  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(TrueVal.getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(FalseVal.getNode()))
	    return SDValue();

	  // Try a blend-style vector shuffle. This covers every constant-condition
	  // case.
	  if (SDValue Blend = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
	    return Blend;

	  // If this VSELECT has a vector of i1 as its mask, it will be matched
	  // directly by the AVX-512 mask-register patterns.
	  if (Cond.getValueType().getScalarSizeInBits() == 1)
	    return Op;

	  // Variable blends are only legal from SSE4.1 onward.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // For a 512-bit VSELECT a non-i1 condition has to be converted into an i1
	  // condition so that the mask-based 512-bit blend instructions can be used.
	  if (VT.getSizeInBits() == 512) {
	    // The vNi1 condition case was already handled above, where it is
	    // trivially lowered.
	    assert(Cond.getValueType().getScalarSizeInBits() ==
	               VT.getScalarSizeInBits() &&
	           "Should have a size-matched integer condition!");
	    // Build a mask by testing the condition against itself (tests for zero).
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
	    // Emit a fresh VSELECT driven by that mask.
	    return DAG.getSelect(dl, VT, Mask, TrueVal, FalseVal);
	  }

	  // Only some types are legal on some subtargets. If a legal
	  // VSELECT-matching blend can be emitted, return Op; if the node needs to
	  // be expanded, return a null value.

	  // FIXME: We should custom lower this by fixing the condition and using i8
	  // blends.
	  if (VT == MVT::v8i16 || VT == MVT::v16i16)
	    return SDValue();

	  // The byte blends for AVX vectors were introduced only in AVX2.
	  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	    return SDValue();

	  // Most of the vector types have blends past SSE4.1.
	  return Op;
	}

	// SSE4.1-specific lowering of EXTRACT_VECTOR_ELT from a 128-bit vector.
	// Returns the lowered node, Op itself when the node can be matched as is, or
	// an empty SDValue when no SSE4.1 form applies.
	static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
	    return SDValue();

	  // 8-bit results: PEXTRB yields an i32, which is then truncated.
	  if (VT.getSizeInBits() == 8) {
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (VT == MVT::f32) {
	    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
	    // the result back to FR32 register. It's only worth matching if the
	    // result has a single use which is a store or a bitcast to i32. And in
	    // the case of a store, it's not worth it if the index is a constant 0,
	    // because a MOVSSmr can be used instead, which is smaller and faster.
	    if (!Op.hasOneUse())
	      return SDValue();
	    // hasOneUse() held above, so the first use is the only user.
	    SDNode *User = *Op.getNode()->use_begin();
	    if ((User->getOpcode() != ISD::STORE ||
	         isNullConstant(Op.getOperand(1))) &&
	        (User->getOpcode() != ISD::BITCAST ||
	         User->getValueType(0) != MVT::i32))
	      return SDValue();
	    // Extract as an i32 from the v4i32 view, then bitcast back to f32.
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
	                                  Op.getOperand(1));
	    return DAG.getBitcast(MVT::f32, Extract);
	  }

	  if (VT == MVT::i32 || VT == MVT::i64) {
	    // ExtractPS/pextrq works with constant index.
	    if (isa<ConstantSDNode>(Op.getOperand(1)))
	      return Op;
	  }

	  return SDValue();
	}

	/// Extract one bit from mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// \p Op is the EXTRACT_VECTOR_ELT node: operand 0 is the vXi1 vector and
	/// operand 1 the index. Returns the lowered value, or \p Op itself when the
	/// extract is from element 0 with an i32 result.
	static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  SDValue Vec = Op.getOperand(0);
	  SDLoc dl(Vec);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);
	  MVT EltVT = Op.getSimpleValueType();

	  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
	         "Unexpected vector type in ExtractBitFromMaskVector");

	  // A variable index can't be handled in mask registers, so sign-extend the
	  // mask to a regular vector, extract from that, and truncate.
	  if (!isa<ConstantSDNode>(Idx)) {
	    unsigned NumElts = VecVT.getVectorNumElements();
	    // Up to 8 elements are widened to fill a 128-bit vector; larger masks
	    // are widened to i8 elements.
	    // NOTE(review): an earlier comment here said extending v8i1/v16i1 to
	    // 512 bits performs better on KNL, which does not match the types
	    // chosen below - confirm which is intended.
	    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
	  }

	  // Canonicalize result type to MVT::i32.
	  if (EltVT != MVT::i32) {
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  Vec, Idx);
	    return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	  // Extracts from element 0 are always allowed.
	  if (IdxVal == 0)
	    return Op;

	  // If the kshift instructions of the correct width aren't natively supported
	  // then we need to promote the vector to the native size to get the correct
	  // zeroing behavior.
	  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
	      (VecVT.getVectorNumElements() < 8)) {
	    VecVT = MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
	                      DAG.getUNDEF(VecVT),
	                      Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Use kshiftr instruction to move to the lower element.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                    DAG.getConstant(IdxVal, dl, MVT::i8));
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// Custom lowering for EXTRACT_VECTOR_ELT. Returns the lowered node, Op
	// itself when the node can be matched as is, or an empty SDValue to request
	// the default expansion.
	SDValue
	X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);

	  // Mask (vXi1) vectors have their own lowering.
	  if (VecVT.getVectorElementType() == MVT::i1)
	    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

	  if (!isa<ConstantSDNode>(Idx)) {
	    // It's more profitable to go through memory (1 cycle throughput)
	    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
	    // The IACA tool was used to get the performance estimation
	    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
	    //
	    // example : extractelement <16 x i8> %a, i32 %i
	    //
	    // Block Throughput: 3.00 Cycles
	    // Throughput Bottleneck: Port5
	    //
	    // | Num Of |   Ports pressure in cycles   |    |
	    // |  Uops  |  0 - DV  |  5  |  6  |  7  |    |
	    // ---------------------------------------------
	    // |   1    |          | 1.0 |     |     | CP | vmovd xmm1, edi
	    // |   1    |          | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
	    // |   2    | 1.0      | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
	    // Total Num Of Uops: 4
	    //
	    //
	    // Block Throughput: 1.00 Cycles
	    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
	    //
	    // |    |   Ports pressure in cycles   |  |
	    // |Uops| 1 | 2 - D  | 3 - D  | 4 | 5 |  |
	    // ---------------------------------------------------------
	    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
	    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
	    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
	    // Total Num Of Uops: 4

	    return SDValue();
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	  // If the source is a 256-bit or 512-bit vector, first extract the 128-bit
	  // chunk that holds the element and then extract the element from that
	  // chunk.
	  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
	    // Get the 128-bit vector.
	    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
	    MVT EltVT = VecVT.getVectorElementType();

	    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
	    // this can be done with a mask.
	    IdxVal &= ElemsPerChunk - 1;
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                       DAG.getConstant(IdxVal, dl, MVT::i32));
	  }

	  assert(VecVT.is128BitVector() && "Unexpected vector length");

	  MVT VT = Op.getSimpleValueType();

	  if (VT.getSizeInBits() == 16) {
	    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
	    // we're going to zero extend the register or fold the store (SSE41 only).
	    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
	        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
	      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
	                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

	    // Transform it so it matches pextrw, which produces a 32-bit result.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (Subtarget.hasSSE41())
	    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
	      return Res;

	  // TODO: We only extract a single element from v16i8, we can probably afford
	  // to be more aggressive here before using the default approach of spilling to
	  // stack.
	  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
	    // Extract either the lowest i32 or any i16, and extract the sub-byte.
	    int DWordIdx = IdxVal / 4;
	    if (DWordIdx == 0) {
	      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                DAG.getBitcast(MVT::v4i32, Vec),
	                                DAG.getIntPtrConstant(DWordIdx, dl));
	      // Shift the wanted byte down to bits [7:0] before truncating.
	      int ShiftVal = (IdxVal % 4) * 8;
	      if (ShiftVal != 0)
	        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
	                          DAG.getConstant(ShiftVal, dl, MVT::i32));
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	    }

	    int WordIdx = IdxVal / 2;
	    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
	                              DAG.getBitcast(MVT::v8i16, Vec),
	                              DAG.getIntPtrConstant(WordIdx, dl));
	    // Odd byte indices live in the upper half of the extracted word.
	    int ShiftVal = (IdxVal % 2) * 8;
	    if (ShiftVal != 0)
	      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
	                        DAG.getConstant(ShiftVal, dl, MVT::i16));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	  }

	  if (VT.getSizeInBits() == 32) {
	    if (IdxVal == 0)
	      return Op;

	    // SHUFPS the element to the lowest double word, then movss.
	    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (VT.getSizeInBits() == 64) {
	    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
	    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
	    //        to match extract_elt for f64.
	    if (IdxVal == 0)
	      return Op;

	    // UNPCKHPD the element to the lowest double word, then movsd.
	    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
	    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
	    int Mask[2] = { 1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  return SDValue();
	}

	/// Insert one bit to mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// \p Op is the INSERT_VECTOR_ELT node: operand 0 is the vXi1 vector,
	/// operand 1 the element to insert and operand 2 the index. Returns the
	/// lowered vector value.
	static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Elt = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);
	  MVT VecVT = Vec.getSimpleValueType();

	  if (!isa<ConstantSDNode>(Idx)) {
	    // Non constant index. Extend source and destination,
	    // insert element and then truncate the result.
	    unsigned NumElts = VecVT.getVectorNumElements();
	    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
	        DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
	        DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // If the kshift instructions of the correct width aren't natively supported
	  // then we need to promote the vector to the native size to get the correct
	  // zeroing behavior.
	  if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
	    // Need to promote to v16i1, do the insert, then extract back.
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	                      DAG.getUNDEF(MVT::v16i1), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);

	  // Inserting into an undef vector: just shift the new bit into position.
	  if (Vec.isUndef()) {
	    if (IdxVal)
	      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	                             DAG.getConstant(IdxVal, dl, MVT::i8));
	    return EltInVec;
	  }

	  // Insertion of one bit into first position
	  if (IdxVal == 0) {
	    // Clean top bits of vector.
	    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
	    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
	                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
	    // Clean the first bit in source vector.
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));

	    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	  }
	  // Insertion of one bit into last position
	  if (IdxVal == NumElems - 1) {
	    // Move the bit to the last position inside the vector.
	    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	                           DAG.getConstant(IdxVal, dl, MVT::i8));
	    // Clean the last bit in the source vector.
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));

	    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	  }

	  // Move the current value of the bit to be replaced to bit 0.
	  SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                               DAG.getConstant(IdxVal, dl, MVT::i8));
	  // Xor with the new bit.
	  Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
	  // Shift to MSB, filling bottom bits with 0.
	  Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
	                       DAG.getConstant(NumElems - 1, dl, MVT::i8));
	  // Shift to the final position, filling upper bits with 0.
	  Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
	                       DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
	  // Xor with original vector to cancel out the original bit value that's still
	  // present.
	  return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
	}

	// Custom lowering for INSERT_VECTOR_ELT. Returns the lowered node, Op itself
	// when the node can be matched as is, or an empty SDValue to request the
	// default expansion (e.g. for a non-constant index).
	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();

	  // Mask (vXi1) vectors have their own lowering.
	  if (EltVT == MVT::i1)
	    return InsertBitToMaskVector(Op, DAG, Subtarget);

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2 = Op.getOperand(2);
	  if (!isa<ConstantSDNode>(N2))
	    return SDValue();
	  auto *N2C = cast<ConstantSDNode>(N2);
	  unsigned IdxVal = N2C->getZExtValue();

	  bool IsZeroElt = X86::isZeroNode(N1);
	  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

	  // If we are inserting an element, see if we can do this more efficiently
	  // with a blend shuffle with a rematerializable vector than a costly integer
	  // insertion.
	  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
	      16 <= EltVT.getSizeInBits()) {
	    // Identity mask, except that lane IdxVal is taken from the constant
	    // vector (second shuffle input).
	    SmallVector<int, 8> BlendMask;
	    for (unsigned i = 0; i != NumElts; ++i)
	      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
	    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
	                                  : getOnesVector(VT, DAG, dl);
	    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
	  }

	  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
	  // into that, and then insert the subvector back into the result.
	  if (VT.is256BitVector() || VT.is512BitVector()) {
	    // With a 256-bit vector, we can insert into the zero element efficiently
	    // using a blend if we have AVX or AVX2 and the right data type.
	    if (VT.is256BitVector() && IdxVal == 0) {
	      // TODO: It is worthwhile to cast integer to floating point and back
	      // and incur a domain crossing penalty if that's what we'll end up
	      // doing anyway after extracting to a 128-bit vector.
	      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
	          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
	        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	        N2 = DAG.getIntPtrConstant(1, dl);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
	      }
	    }

	    // Get the desired 128-bit vector chunk.
	    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

	    // Insert the element into the desired chunk.
	    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(NumEltsIn128));
	    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
	    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
	                    DAG.getConstant(IdxIn128, dl, MVT::i32));

	    // Insert the changed part back into the bigger vector
	    return insert128BitVector(N0, V, IdxVal, DAG, dl);
	  }
	  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

	  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
	  // argument. SSE41 required for pinsrb.
	  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    unsigned Opc;
	    if (VT == MVT::v8i16) {
	      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
	      Opc = X86ISD::PINSRW;
	    } else {
	      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
	      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
	      Opc = X86ISD::PINSRB;
	    }

	    if (N1.getValueType() != MVT::i32)
	      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
	    if (N2.getValueType() != MVT::i32)
	      N2 = DAG.getIntPtrConstant(IdxVal, dl);
	    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
	  }

	  if (Subtarget.hasSSE41()) {
	    if (EltVT == MVT::f32) {
	      // Bits [7:6] of the constant are the source select. This will always be
	      //   zero here. The DAG Combiner may combine an extract_elt index into
	      //   these bits. For example (insert (extract, 3), 2) could be matched by
	      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
	      // Bits [5:4] of the constant are the destination select. This is the
	      //   value of the incoming immediate.
	      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
	      //   combine either bitwise AND or insert of float 0.0 to set these bits.

	      bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
	        // If this is an insertion of 32-bits into the low 32-bits of
	        // a vector, we prefer to generate a blend with immediate rather
	        // than an insertps. Blends are simpler operations in hardware and so
	        // will always have equal or better performance than insertps.
	        // But if optimizing for size and there's a load folding opportunity,
	        // generate insertps because blendps does not have a 32-bit memory
	        // operand form.
	        N2 = DAG.getIntPtrConstant(1, dl);
	        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
	      }
	      // Place the destination lane in bits [5:4] of the immediate.
	      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
	      // Create this as a scalar to vector..
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
	    }

	    // PINSR* works with constant index.
	    if (EltVT == MVT::i32 || EltVT == MVT::i64)
	      return Op;
	  }

	  return SDValue();
	}

	// Custom lowering for SCALAR_TO_VECTOR. Results that are not 128 bits wide
	// are built by placing a 128-bit SCALAR_TO_VECTOR at index 0 of an undef
	// vector of the full type.
	static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT OpVT = Op.getSimpleValueType();
	  SDValue Scalar = Op.getOperand(0);

	  // Replacing a xor+movd with xorps is always cheaper, and it simplifies
	  // further combines.
	  if (X86::isZeroNode(Scalar))
	    return getZeroVector(OpVT, Subtarget, DAG, dl);

	  // For a result that is not 128 bits wide, first form a 128-bit vector with
	  // the same element type, then insert that into the wider vector.
	  if (!OpVT.is128BitVector()) {
	    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
	    unsigned NumElts128 = OpVT.getVectorNumElements() / SizeFactor;
	    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), NumElts128);

	    SDValue Low128 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Scalar);

	    // Place the 128-bit vector at index 0 of an undef vector.
	    return insert128BitVector(DAG.getUNDEF(OpVT), Low128, 0, DAG, dl);
	  }
	  assert(OpVT.is128BitVector() && "Expected an SSE type!");

	  // A v4i32 SCALAR_TO_VECTOR is passed through untouched because that is the
	  // form used in tblgen.
	  if (OpVT == MVT::v4i32)
	    return Op;

	  // Every other 128-bit type goes through a v4i32 SCALAR_TO_VECTOR of the
	  // any-extended scalar and is bitcast to the requested type.
	  SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
	  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Wide);
	  return DAG.getBitcast(OpVT, AsV4I32);
	}

	// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
	// simple superregister reference or explicit instructions to insert
	// the upper bits of a vector.
	// NOTE(review): the body only accepts vXi1 results (see the assert) and
	// forwards everything to insert1BitVector, so the description above looks
	// broader than what this function does -- confirm against the callers.
	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// Only i1-element result vectors are expected here.
	assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

	// All of the work is delegated to the vXi1 insertion helper.
	return insert1BitVector(Op, DAG, Subtarget);
	}

	/// Custom-lower EXTRACT_SUBVECTOR for vXi1 mask vectors.
	///
	/// Returns an empty SDValue when the index operand is not a constant, returns
	/// Op unchanged for index 0, and otherwise shifts the requested elements down
	/// to the LSB with KSHIFTR and extracts the subvector at index 0.
	static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Only vXi1 extract_subvectors need custom lowering");

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);

	  // Only constant indices are handled here.
	  auto *IdxC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!IdxC)
	    return SDValue();

	  unsigned IdxVal = IdxC->getZExtValue();
	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  MVT VecVT = Vec.getSimpleValueType();
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // Extend to natively supported kshift: v8i1 when DQI is available,
	  // v16i1 otherwise.
	  MVT WideVecVT = VecVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
	    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
	                      DAG.getUNDEF(WideVecVT), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Shift to the LSB.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
	                    DAG.getConstant(IdxVal, dl, MVT::i8));

	  // The wanted elements now start at element 0.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Returns the appropriate wrapper opcode for a global reference.
	///
	/// \param GV the referenced global; may be null.
	/// \returns X86ISD::WrapperRIP for RIP-relative PIC in the small or kernel
	/// code model (unless GV is an absolute symbol reference), X86ISD::Wrapper
	/// otherwise.
	unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
	  // References to absolute symbols are never PC-relative.
	  if (GV && GV->isAbsoluteSymbolRef())
	    return X86ISD::Wrapper;

	  CodeModel::Model M = getTargetMachine().getCodeModel();
	  if (Subtarget.isPICStyleRIPRel() &&
	      (M == CodeModel::Small || M == CodeModel::Kernel))
	    return X86ISD::WrapperRIP;

	  return X86ISD::Wrapper;
	}

	// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol nodes are each
	// lowered to their Target* counterpart wrapped in an X86ISD wrapper node.
	// The wrapping is required because Select(N) on a raw Target* node returns N
	// unchanged, so the raw TargetGlobalAddress nodes, etc. can only be used to
	// form an addressing mode. The wrapped nodes are selected into MOV32ri.

	/// Lower a ConstantPool node to a wrapped TargetConstantPool, adding the
	/// global base register when the local-reference classification yields a
	/// non-zero operand flag.
	SDValue
	X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
	  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);

	  // A non-zero flag means PIC (but not RIP-relative PIC): the address must
	  // be offset from the global base register.
	  unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetConstantPool(
	      PoolNode->getConstVal(), PtrVT, PoolNode->getAlignment(),
	      PoolNode->getOffset(), RefFlag);
	  SDLoc DL(PoolNode);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);
	  if (!RefFlag)
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
	}

	/// Lower a JumpTable node to a wrapped TargetJumpTable, adding the global
	/// base register when the local-reference classification yields a non-zero
	/// operand flag.
	SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);

	  // A non-zero flag means PIC (but not RIP-relative PIC): the address must
	  // be offset from the global base register.
	  unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT, RefFlag);
	  SDLoc DL(TableNode);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);
	  if (!RefFlag)
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
	}

	/// Lower an ExternalSymbol node to a wrapped TargetExternalSymbol. In 32-bit
	/// position-independent code the global base register is added, and when the
	/// operand flag denotes a stub reference the final address is loaded from it.
	SDValue
	X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
	  const char *SymName = cast<ExternalSymbolSDNode>(Op)->getSymbol();

	  // Classify the reference; no GlobalValue is available for an external
	  // symbol, so nullptr is passed together with the enclosing module.
	  const Module *M = DAG.getMachineFunction().getFunction().getParent();
	  unsigned char RefFlag = Subtarget.classifyGlobalReference(nullptr, *M);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetExternalSymbol(SymName, PtrVT, RefFlag);

	  SDLoc DL(Op);
	  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);

	  // With PIC, the address is actually $g + Offset.
	  bool NeedsPICBase = isPositionIndependent() && !Subtarget.is64Bit();
	  if (NeedsPICBase) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	    Addr = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
	  }

	  // Symbols reached through a stub need one more load to obtain the real
	  // address.
	  if (isGlobalStubReference(RefFlag)) {
	    MachinePointerInfo GOTInfo =
	        MachinePointerInfo::getGOT(DAG.getMachineFunction());
	    Addr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr, GOTInfo);
	  }

	  return Addr;
	}

	/// Lower a BlockAddress node to a wrapped TargetBlockAddress, adding the
	/// global base register when the reference is relative to the PIC base.
	SDValue
	X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
	  BlockAddressSDNode *BANode = cast<BlockAddressSDNode>(Op);

	  // Create the TargetBlockAddress node.
	  unsigned char RefFlags = Subtarget.classifyBlockAddressReference();
	  const BlockAddress *BA = BANode->getBlockAddress();
	  int64_t Offset = BANode->getOffset();
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetBlockAddress(BA, PtrVT, Offset, RefFlags);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Target);

	  if (!isGlobalRelativeToPICBase(RefFlags))
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	  return DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Wrapped);
	}

	/// Build the address of \p GV plus \p Offset.
	///
	/// The offset is folded into the TargetGlobalAddress when the reference needs
	/// no operand flag and the offset suits the code model; otherwise it is added
	/// explicitly at the end. PIC-base-relative references get the global base
	/// register added, and stub references are loaded through.
	SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
	                                              const SDLoc &dl, int64_t Offset,
	                                              SelectionDAG &DAG) const {
	  unsigned char RefFlags = Subtarget.classifyGlobalReference(GV);
	  CodeModel::Model CM = DAG.getTarget().getCodeModel();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  // Create the TargetGlobalAddress node, folding in the constant offset if
	  // that is legal.
	  bool FoldOffset = RefFlags == X86II::MO_NO_FLAG &&
	                    X86::isOffsetSuitableForCodeModel(Offset, CM);
	  SDValue Addr;
	  if (FoldOffset) {
	    // A direct static reference to a global.
	    Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
	    Offset = 0;
	  } else {
	    Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, RefFlags);
	  }

	  Addr = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Addr);

	  // With PIC, the address is actually $g + Offset.
	  if (isGlobalRelativeToPICBase(RefFlags)) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
	  }

	  // Globals reached through a stub need one more load to obtain the real
	  // address.
	  if (isGlobalStubReference(RefFlags)) {
	    MachinePointerInfo GOTInfo =
	        MachinePointerInfo::getGOT(DAG.getMachineFunction());
	    Addr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr, GOTInfo);
	  }

	  // Nothing left to add when the offset was folded or was zero.
	  if (Offset == 0)
	    return Addr;

	  // Otherwise create an explicit addition for the unfolded offset.
	  SDValue OffsetC = DAG.getConstant(Offset, dl, PtrVT);
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Addr, OffsetC);
	}

	/// Lower a GlobalAddress node by unpacking its global and offset and
	/// forwarding to the GlobalValue-based overload.
	SDValue
	X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
	  auto *GANode = cast<GlobalAddressSDNode>(Op);
	  return LowerGlobalAddress(GANode->getGlobal(), SDLoc(Op),
	                            GANode->getOffset(), DAG);
	}

	/// Emit an X86ISD::TLSADDR (or TLSBASEADDR when \p LocalDynamic is set) node
	/// for \p GA and return a copy of \p ReturnReg taken after it.
	///
	/// \param InFlag optional incoming glue; appended to the operands when
	/// non-null.
	/// \param OperandFlags target flags placed on the TargetGlobalAddress.
	static SDValue
	GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
	           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
	           unsigned char OperandFlags, bool LocalDynamic = false) {
	  SDLoc dl(GA);
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue TGA = DAG.getTargetGlobalAddress(
	      GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);

	  X86ISD::NodeType CallType =
	      LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR;

	  // Operands are {Chain, TGA} plus the incoming glue when one is supplied.
	  SmallVector<SDValue, 3> CallOps;
	  CallOps.push_back(Chain);
	  CallOps.push_back(TGA);
	  if (InFlag)
	    CallOps.push_back(*InFlag);
	  Chain = DAG.getNode(CallType, dl, NodeTys, CallOps);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  MFI.setAdjustsStack(true);
	  MFI.setHasCalls(true);

	  // Read the result register, glued to the node just created.
	  SDValue OutGlue = Chain.getValue(1);
	  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, OutGlue);
	}

	/// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
	/// The global base register is copied into EBX and the result of the
	/// TLSADDR node is read from EAX.
	static SDValue
	LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDLoc dl(GA); // ? function entry point might be better

	  // Place the global base register in EBX; no incoming glue.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, PICBase,
	                                   SDValue());
	  SDValue Glue = Chain.getValue(1);

	  return GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX, X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
	/// No glue is passed in; the result of the TLSADDR node is read from RAX.
	static SDValue
	LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue Entry = DAG.getEntryNode();
	  SDValue *NoGlue = nullptr;
	  return GetTLSADDR(DAG, Entry, GA, NoGlue, PtrVT, X86::RAX, X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
	///
	/// The result is the base returned by GetTLSADDR (called with LocalDynamic
	/// set) plus the variable's x@dtpoff offset. Each call also bumps the
	/// function's local-dynamic TLS access counter.
	static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
	                                           SelectionDAG &DAG,
	                                           const EVT PtrVT,
	                                           bool is64Bit) {
	  SDLoc dl(GA);

	  // Get the start address of the TLS block for this module.
	  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
	                                    .getInfo<X86MachineFunctionInfo>();
	  MFI->incNumLocalDynamicTLSAccesses();

	  SDValue Base;
	  if (is64Bit) {
	    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
	                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
	  } else {
	    // 32-bit: the global base register is copied into EBX first.
	    SDValue InFlag;
	    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
	        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
	    InFlag = Chain.getValue(1);
	    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
	                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
	  }

	  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
	  // of Base.

	  // Build x@dtpoff.
	  unsigned char OperandFlags = X86II::MO_DTPOFF;
	  unsigned WrapperKind = X86ISD::Wrapper;
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  // Add x@dtpoff with the base.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
	}

	/// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
	///
	/// The result is the thread pointer (loaded from offset 0 of address space
	/// 257 on 64-bit, 256 on 32-bit) plus the variable's offset. For initial
	/// exec the offset itself is loaded from the GOT. Any other model is
	/// unreachable.
	static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                   const EVT PtrVT, TLSModel::Model model,
	                                   bool is64Bit, bool isPIC) {
	  SDLoc dl(GA);

	  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
	  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
	                                                         is64Bit ? 257 : 256));

	  SDValue ThreadPointer =
	      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
	                  MachinePointerInfo(Ptr));

	  unsigned char OperandFlags = 0;
	  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
	  // initialexec.
	  unsigned WrapperKind = X86ISD::Wrapper;
	  if (model == TLSModel::LocalExec) {
	    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
	  } else if (model == TLSModel::InitialExec) {
	    if (is64Bit) {
	      OperandFlags = X86II::MO_GOTTPOFF;
	      WrapperKind = X86ISD::WrapperRIP;
	    } else {
	      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
	    }
	  } else {
	    llvm_unreachable("Unexpected model");
	  }

	  // emit "addl x@ntpoff,%eax" (local exec)
	  // or "addl x@indntpoff,%eax" (initial exec)
	  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  if (model == TLSModel::InitialExec) {
	    // 32-bit PIC addresses the GOT slot relative to the global base register.
	    if (isPIC && !is64Bit) {
	      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);
	    }

	    // The thread-pointer offset lives in the GOT; load it.
	    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  // The address of the thread local variable is the add of the thread
	  // pointer with the offset of the variable.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
	}

	/// Lower ISD::GlobalTLSAddress.
	///
	/// Emulated TLS is handled first. Otherwise the lowering depends on the
	/// target: ELF dispatches on the TLS model, Darwin emits a TLSCALL sequence,
	/// and Windows (MSVC, Itanium, GNU) uses the implicit TLS scheme. Any other
	/// target is unreachable.
	SDValue
	X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

	  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	  if (DAG.getTarget().Options.EmulatedTLS)
	    return LowerToTLSEmulatedModel(GA, DAG);

	  const GlobalValue *GV = GA->getGlobal();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  bool PositionIndependent = isPositionIndependent();

	  if (Subtarget.isTargetELF()) {
	    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
	    switch (model) {
	      case TLSModel::GeneralDynamic:
	        if (Subtarget.is64Bit())
	          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
	        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
	      case TLSModel::LocalDynamic:
	        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
	                                           Subtarget.is64Bit());
	      case TLSModel::InitialExec:
	      case TLSModel::LocalExec:
	        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
	                                   PositionIndependent);
	    }
	    llvm_unreachable("Unknown TLS model.");
	  }

	  if (Subtarget.isTargetDarwin()) {
	    // Darwin only has one model of TLS. Lower to that.
	    unsigned char OpFlag = 0;
	    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
	                           X86ISD::WrapperRIP : X86ISD::Wrapper;

	    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	    // global base reg.
	    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
	    if (PIC32)
	      OpFlag = X86II::MO_TLVP_PIC_BASE;
	    else
	      OpFlag = X86II::MO_TLVP;
	    SDLoc DL(Op);
	    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	                                                GA->getValueType(0),
	                                                GA->getOffset(), OpFlag);
	    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

	    // With PIC32, the address is actually $g + Offset.
	    if (PIC32)
	      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);

	    // Lowering the machine isd will make sure everything is in the right
	    // location.
	    SDValue Chain = DAG.getEntryNode();
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
	    SDValue Args[] = { Chain, Offset };
	    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
	    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
	                               DAG.getIntPtrConstant(0, DL, true),
	                               Chain.getValue(1), DL);

	    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
	    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    MFI.setAdjustsStack(true);

	    // And our return value (tls address) is in the standard call return value
	    // location.
	    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
	  }

	  if (Subtarget.isTargetKnownWindowsMSVC() ||
	      Subtarget.isTargetWindowsItanium() ||
	      Subtarget.isTargetWindowsGNU()) {
	    // Just use the implicit TLS architecture
	    // Need to generate something similar to:
	    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
	    //                                  ; from TEB
	    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
	    //   mov     rcx, qword [rdx+rcx*8]
	    //   mov     eax, .tls$:tlsvar
	    //   [rax+rcx] contains the address
	    // Windows 64bit: gs:0x58
	    // Windows 32bit: fs:__tls_array

	    SDLoc dl(GA);
	    SDValue Chain = DAG.getEntryNode();

	    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
	    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
	    // use its literal value of 0x2C.
	    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
	                                            ? Type::getInt8PtrTy(*DAG.getContext(),
	                                                                 256)
	                                            : Type::getInt32PtrTy(*DAG.getContext(),
	                                                                  257));

	    SDValue TlsArray = Subtarget.is64Bit()
	                           ? DAG.getIntPtrConstant(0x58, dl)
	                           : (Subtarget.isTargetWindowsGNU()
	                                  ? DAG.getIntPtrConstant(0x2C, dl)
	                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

	    SDValue ThreadPointer =
	        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

	    SDValue res;
	    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
	      // Local exec: the loaded pointer is used directly, without indexing
	      // by _tls_index.
	      res = ThreadPointer;
	    } else {
	      // Load the _tls_index variable
	      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
	      if (Subtarget.is64Bit())
	        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
	                             MachinePointerInfo(), MVT::i32);
	      else
	        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

	      // Scale the index by the pointer size (as a shift amount).
	      auto &DL = DAG.getDataLayout();
	      SDValue Scale =
	          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
	      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

	      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
	    }

	    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

	    // Get the offset of start of .tls section
	    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                             GA->getValueType(0),
	                                             GA->getOffset(), X86II::MO_SECREL);
	    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

	    // The address of the thread local variable is the add of the thread
	    // pointer with the offset of the variable.
	    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
	  }

	  llvm_unreachable("TLS not implemented for this target.");
	}

	/// Lower SHL_PARTS / SRA_PARTS / SRL_PARTS: a shift of a value held in two
	/// parts (operands 0 = low, 1 = high) by the amount in operand 2. Returns
	/// the merged {Lo, Hi} pair.
	static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  unsigned PartBits = VT.getSizeInBits();
	  const bool IsSHL = Op.getOpcode() == ISD::SHL_PARTS;
	  const bool IsSRA = Op.getOpcode() == ISD::SRA_PARTS;
	  SDValue LoPart = Op.getOperand(0);
	  SDValue HiPart = Op.getOperand(1);
	  SDValue Amt = Op.getOperand(2);

	  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
	  // generic ISD shifts do not, so the amount fed to them is masked. The AND
	  // is optimized away during isel.
	  SDValue MaskedAmt = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
	                                  DAG.getConstant(PartBits - 1, dl, MVT::i8));

	  // Value for the part that is shifted out entirely on a large shift: the
	  // sign of the high part for SRA, zero otherwise.
	  SDValue Fill;
	  if (IsSRA)
	    Fill = DAG.getNode(ISD::SRA, dl, VT, HiPart,
	                       DAG.getConstant(PartBits - 1, dl, MVT::i8));
	  else
	    Fill = DAG.getConstant(0, dl, VT);

	  // Funnel: the double-precision shift across both parts.
	  // Single: the ordinary shift of the part that feeds the other one.
	  SDValue Funnel, Single;
	  if (IsSHL) {
	    Funnel = DAG.getNode(X86ISD::SHLD, dl, VT, HiPart, LoPart, Amt);
	    Single = DAG.getNode(ISD::SHL, dl, VT, LoPart, MaskedAmt);
	  } else {
	    unsigned RightShiftOpc = IsSRA ? ISD::SRA : ISD::SRL;
	    Funnel = DAG.getNode(X86ISD::SHRD, dl, VT, LoPart, HiPart, Amt);
	    Single = DAG.getNode(RightShiftOpc, dl, VT, HiPart, MaskedAmt);
	  }

	  // When the shift amount is at least the width of a part, the shld/shrd
	  // results cannot be relied on. Test the PartBits bit of the amount and
	  // select the appropriate values for large shifts.
	  SDValue LargeBit = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
	                                 DAG.getConstant(PartBits, dl, MVT::i8));
	  SDValue Flags = DAG.getNode(X86ISD::CMP, dl, MVT::i32, LargeBit,
	                              DAG.getConstant(0, dl, MVT::i8));
	  SDValue CondCode = DAG.getConstant(X86::COND_NE, dl, MVT::i8);

	  SDValue FunnelOps[4] = { Funnel, Single, CondCode, Flags };
	  SDValue SingleOps[4] = { Single, Fill, CondCode, Flags };
	  SDValue FunnelSel = DAG.getNode(X86ISD::CMOV, dl, VT, FunnelOps);
	  SDValue SingleSel = DAG.getNode(X86ISD::CMOV, dl, VT, SingleOps);

	  // For a left shift the funnel result is the high part; for right shifts
	  // it is the low part.
	  SDValue Lo = IsSHL ? SingleSel : FunnelSel;
	  SDValue Hi = IsSHL ? FunnelSel : SingleSel;

	  SDValue Parts[2] = { Lo, Hi };
	  return DAG.getMergeValues(Parts, dl);
	}

	/// Custom-lower SINT_TO_FP.
	///
	/// Vectors: v2i32->v2f64 and v2i1 sources are widened and converted with
	/// CVTSI2P; other vector cases return an empty SDValue. Scalars: conversions
	/// that are really legal return Op itself; everything else is stored to a
	/// fresh stack slot and converted through BuildFILD.
	SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (SrcVT.isVector()) {
	    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
	      // Pad the source with undef up to v4i32.
	      SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	                                   DAG.getUNDEF(SrcVT));
	      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Padded);
	    }
	    if (SrcVT == MVT::v2i1) {
	      // For v2i1, we need to widen to v4i1 first.
	      assert(VT == MVT::v2f64 && "Unexpected type");
	      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
	                        DAG.getUNDEF(MVT::v2i1));
	      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src);
	      return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), SExt);
	    }
	    return SDValue();
	  }

	  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
	         "Unknown SINT_TO_FP to lower!");

	  const bool DstInSSE = isScalarFPTypeInSSEReg(Op.getValueType());
	  const bool Is64 = Subtarget.is64Bit();

	  // These are really Legal; return the operand so the caller accepts it as
	  // Legal.
	  if (DstInSSE && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Is64)))
	    return Op;

	  SDValue ToStore = Op.getOperand(0);
	  if (SrcVT == MVT::i64 && DstInSSE && !Is64) {
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ToStore = DAG.getBitcast(MVT::f64, ToStore);
	  }

	  // Spill the integer to a stack slot of its own size, then load it back
	  // as floating point.
	  unsigned SlotBytes = SrcVT.getSizeInBits() / 8;
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  int FrameIdx = MF.getFrameInfo().CreateStackObject(SlotBytes, SlotBytes, false);
	  SDValue Slot = DAG.getFrameIndex(FrameIdx, PtrVT);
	  SDValue StoreChain = DAG.getStore(
	      DAG.getEntryNode(), dl, ToStore, Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
	  return BuildFILD(Op, SrcVT, StoreChain, Slot, DAG);
	}

	/// Build an FILD of the integer of type \p SrcVT found at \p StackSlot.
	///
	/// \p StackSlot is either a frame index or a load node whose address operand
	/// and memory operand are reused. When the result type lives in an SSE
	/// register, FILD_FLAG is used and the value is passed through an FST to a
	/// new stack slot followed by a load of the result type.
	SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
	                                     SDValue StackSlot,
	                                     SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MachineFunction &MF = DAG.getMachineFunction();
	  const bool ViaSSE = isScalarFPTypeInSSEReg(Op.getValueType());

	  // Result types of the FILD node.
	  SDVTList LoadTys = ViaSSE
	                         ? DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue)
	                         : DAG.getVTList(Op.getValueType(), MVT::Other);

	  unsigned SrcBytes = SrcVT.getSizeInBits() / 8;

	  // Describe the memory being read.
	  MachineMemOperand *LoadMMO;
	  if (auto *FI = dyn_cast<FrameIndexSDNode>(StackSlot)) {
	    LoadMMO = MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, FI->getIndex()),
	        MachineMemOperand::MOLoad, SrcBytes, SrcBytes);
	  } else {
	    // StackSlot is a load: take its memory operand and use its address.
	    LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
	    StackSlot = StackSlot.getOperand(1);
	  }

	  SDValue LoadOps[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
	  unsigned LoadOpc = ViaSSE ? X86ISD::FILD_FLAG : X86ISD::FILD;
	  SDValue Result =
	      DAG.getMemIntrinsicNode(LoadOpc, DL, LoadTys, LoadOps, SrcVT, LoadMMO);

	  if (!ViaSSE)
	    return Result;

	  Chain = Result.getValue(1);
	  SDValue Glue = Result.getValue(2);

	  // FIXME: Currently the FST is flagged to the FILD_FLAG. This
	  // shouldn't be necessary except that RFP cannot be live across
	  // multiple blocks. When stackifier is fixed, they can be uncoupled.
	  unsigned ResBytes = Op.getValueSizeInBits() / 8;
	  int ResFI = MF.getFrameInfo().CreateStackObject(ResBytes, ResBytes, false);
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  SDValue ResSlot = DAG.getFrameIndex(ResFI, PtrVT);
	  SDVTList StoreTys = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = {
	    Chain, Result, ResSlot, DAG.getValueType(Op.getValueType()), Glue
	  };
	  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(MF, ResFI),
	      MachineMemOperand::MOStore, ResBytes, ResBytes);

	  // Store in the result type, then reload from the same slot.
	  Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, StoreTys, StoreOps,
	                                  Op.getValueType(), StoreMMO);
	  return DAG.getLoad(Op.getValueType(), DL, Chain, ResSlot,
	                     MachinePointerInfo::getFixedStack(MF, ResFI));
	}

	/// 64-bit unsigned integer to double expansion.
	///
	/// Interleaves the two 32-bit halves of the input with the exponent words
	/// 0x43300000 / 0x45300000, subtracts the matching bias constants and sums
	/// the two lanes (FHADD with SSE3, shuffle + FADD otherwise).
	static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // This algorithm is not obvious. Here it is what we're trying to output:
	  /*
	     movq       %rax,  %xmm0
	     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
	     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
	     #ifdef __SSE3__
	       haddpd   %xmm0, %xmm0
	     #else
	       pshufd   $0x4e, %xmm0, %xmm1
	       addpd    %xmm1, %xmm0
	     #endif
	  */

	  SDLoc dl(Op);
	  LLVMContext *Context = DAG.getContext();

	  // Build some magic constants.
	  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
	  Constant *C0 = ConstantDataVector::get(*Context, CV0);
	  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

	  SmallVector<Constant*,2> CV1;
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4330000000000000ULL))));
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4530000000000000ULL))));
	  Constant *C1 = ConstantVector::get(CV1);
	  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

	  // Load the 64-bit value into an XMM register.
	  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                            Op.getOperand(0));
	  SDValue CLod0 =
	      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue Unpck1 =
	      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

	  // The second constant load is chained after the first one.
	  SDValue CLod1 =
	      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
	  SDValue Result;

	  if (Subtarget.hasSSE3()) {
	    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
	    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
	  } else {
	    // Swap the two 64-bit halves and add them to the original.
	    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
	    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
	    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
	                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
	  }

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// 32-bit unsigned integer to float expansion.
	///
	/// The zero-extended integer bits are OR'ed into the bit pattern of the f64
	/// constant 0x4330000000000000, that same constant is subtracted again, and
	/// the f64 difference is extended or rounded to the result type.
	static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // FP constant to bias correct the final result.
	  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
	                                   MVT::f64);

	  // Move the 32-bit value into element 0 of a v4i32.
	  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
	                              Op.getOperand(0));

	  // Zero out the upper parts of the register.
	  SDValue ZExtVec = getShuffleVectorZeroOrUndef(InVec, 0, true, Subtarget, DAG);

	  // Reinterpret the low 64 bits as an f64 scalar.
	  SDValue LowF64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                               DAG.getBitcast(MVT::v2f64, ZExtVec),
	                               DAG.getIntPtrConstant(0, dl));

	  // OR the value with the bias; the OR is done on v2i64.
	  SDValue ValueBits = DAG.getBitcast(
	      MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, LowF64));
	  SDValue BiasBits = DAG.getBitcast(
	      MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias));
	  SDValue Merged = DAG.getNode(ISD::OR, dl, MVT::v2i64, ValueBits, BiasBits);
	  SDValue Biased = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                               DAG.getBitcast(MVT::v2f64, Merged),
	                               DAG.getIntPtrConstant(0, dl));

	  // Subtract the bias.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Unbiased = DAG.getNode(ISD::FSUB, dl, MVT::f64, Biased, Bias);

	  // Handle final rounding.
	  return DAG.getFPExtendOrRound(Unbiased, dl, Op.getSimpleValueType());
	}

	/// Lower a v2i32 -> v2f64 UINT_TO_FP. Returns an empty SDValue for any other
	/// result type.
	///
	/// With AVX-512 this is a single CVTUI2P. Otherwise each element is split
	/// into 16-bit halves that are converted with the signed CVTSI2P and
	/// recombined as hi * 2^16 + lo.
	static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget, SDLoc &DL) {
	  if (Op.getSimpleValueType() != MVT::v2f64)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

	  // Legalize to v4i32 type.
	  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                             DAG.getUNDEF(MVT::v2i32));

	  if (Subtarget.hasAVX512())
	    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Wide);

	  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
	  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
	  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
	  SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

	  // Two to the power of half-word-size.
	  SDValue TwoPow16 = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

	  // Upper half shifted down; lower half with its upper bits cleared.
	  SDValue HiInt = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Wide, ShiftBy16);
	  SDValue LoInt = DAG.getNode(ISD::AND, DL, MVT::v4i32, Wide, Low16Mask);

	  SDValue HiFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HiInt);
	  SDValue HiScaled = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, HiFP, TwoPow16);
	  SDValue LoFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LoInt);

	  // Add the two halves.
	  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, HiScaled, LoFP);
	}

	/// Lower v4i32 -> v4f32 and v8i32 -> v8f32 UINT_TO_FP.
	///
	/// Returns an empty SDValue when unsafe-fp-math is enabled or when the result
	/// type is not the matching float vector type.
	static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // The algorithm is the following:
	  // #ifdef __SSE4_1__
	  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	  //                                 (uint4) 0x53000000, 0xaa);
	  // #else
	  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	  // #endif
	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  //     return (float4) lo + fhi;

	  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
	  // reassociate the two FADDs, and if we do that, the algorithm fails
	  // spectacularly (PR24512).
	  // FIXME: If we ever have some kind of Machine FMF, this should be marked
	  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
	  // there's also the MachineCombiner reassociations happening on Machine IR.
	  if (DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue V = Op->getOperand(0);
	  MVT VecIntVT = V.getSimpleValueType();
	  bool Is128 = VecIntVT == MVT::v4i32;
	  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
	  // If we convert to something else than the supported type, e.g., to v4f64,
	  // abort early.
	  if (VecFloatVT != Op->getSimpleValueType(0))
	    return SDValue();

	  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
	         "Unsupported custom type");

	  // In the #ifdef/#else code, we have in common:
	  // - The vector of constants:
	  // -- 0x4b000000
	  // -- 0x53000000
	  // - A shift:
	  // -- v >> 16

	  // Create the splat vector for 0x4b000000.
	  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
	  // Create the splat vector for 0x53000000.
	  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

	  // Create the right shift.
	  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
	  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

	  SDValue Low, High;
	  if (Subtarget.hasSSE41()) {
	    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
	    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
	    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
	    // Low will be bitcasted right away, so do not bother bitcasting back to its
	    // original type.
	    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
	                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	    //                                 (uint4) 0x53000000, 0xaa);
	    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
	    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
	    // High will be bitcasted right away, so do not bother bitcasting back to
	    // its original type.
	    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
	                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	  } else {
	    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
	    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
	    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

	    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
	  }

	  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
	  SDValue VecCstFAdd = DAG.getConstantFP(
	      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue FHigh =
	      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
	  //     return (float4) lo + fhi;
	  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
	  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
	}

	/// Custom-lower a vector ISD::UINT_TO_FP by dispatching on the source type.
	///
	/// A v2i1 source is converted inline; v2i32 and v4i32/v8i32 sources are
	/// forwarded to their dedicated helpers. Any other source type is a caller
	/// error and hits llvm_unreachable.
	static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();

	  if (SrcVT == MVT::v2i1) {
	    assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
	    // Pad the two mask elements out to v4i1 with undef, zero-extend the
	    // result to v4i32 and convert that to v2f64.
	    SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
	                                  DAG.getUNDEF(MVT::v2i1));
	    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, Widened);
	    return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, Extended);
	  }

	  switch (SrcVT.SimpleTy) {
	  case MVT::v4i32:
	  case MVT::v8i32:
	    // This path is not expected to be reached on AVX512 subtargets.
	    assert(!Subtarget.hasAVX512());
	    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
	  case MVT::v2i32:
	    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
	  default:
	    llvm_unreachable("Custom UINT_TO_FP is not supported!");
	  }
	}

	/// Custom lowering for ISD::UINT_TO_FP.
	///
	/// Vector results are forwarded to lowerUINT_TO_FP_vec. For scalar results the
	/// following cases are checked in order:
	///  - With AVX512, an i32 source (or an i64 source in 64-bit mode) whose
	///    result lives in an SSE register is returned unchanged.
	///  - i64 -> f64 with X86ScalarSSEf64 goes to LowerUINT_TO_FP_i64.
	///  - An i32 source with X86ScalarSSEf64 goes to LowerUINT_TO_FP_i32.
	///  - i64 -> f32 on a 64-bit subtarget returns an empty SDValue.
	///  - Otherwise the integer is written to a 64-bit stack temporary and
	///    converted with an x87 FILD; for i64 sources a correction term is added
	///    in f80 and the sum is rounded to the destination type.
	///
	/// \param Op  The UINT_TO_FP node; operand 0 is the integer source.
	/// \param DAG The SelectionDAG in which replacement nodes are created.
	/// \returns The lowered value, \p Op itself, or an empty SDValue as described
	///          above.
	SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
	SelectionDAG &DAG) const {
	SDValue N0 = Op.getOperand(0);
	SDLoc dl(Op);
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	if (Op.getSimpleValueType().isVector())
	return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

	MVT SrcVT = N0.getSimpleValueType();
	MVT DstVT = Op.getSimpleValueType();

	if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
	(SrcVT == MVT::i32 \|\| (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
	// Conversions from unsigned i32 to f32/f64 are legal,
	// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
	return Op;
	}

	if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
	return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
	if (SrcVT == MVT::i32 && X86ScalarSSEf64)
	return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
	if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
	return SDValue();

	// Make a 64-bit buffer, and use it to build an FILD.
	SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
	if (SrcVT == MVT::i32) {
	// Store the i32 source at offset 0 and an i32 zero at offset 4, then let
	// BuildFILD read the slot as an i64.
	SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
	SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
	StackSlot, MachinePointerInfo());
	SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
	OffsetSlot, MachinePointerInfo());
	SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
	return Fild;
	}

	assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
	SDValue ValueToStore = Op.getOperand(0);
	if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
	// Bitcasting to f64 here allows us to do a single 64-bit store from
	// an SSE register, avoiding the store forwarding penalty that would come
	// with two 32-bit stores.
	ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
	MachinePointerInfo());
	// For i64 source, we need to add the appropriate power of 2 if the input
	// was negative. This is the same as the optimization in
	// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
	// we must be careful to do the computation in x87 extended precision, not
	// in SSE. (The generic code can't know it's OK to do this, or how to.)
	int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
	MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	MachineMemOperand::MOLoad, 8, 8);

	// FILD the 64-bit slot as a signed integer, producing an f80 value.
	SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
	SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
	MVT::i64, MMO);

	// 0x5F800000 is the IEEE-754 single-precision encoding of 2^64.
	APInt FF(32, 0x5F800000ULL);

	// Check whether the sign bit is set.
	SDValue SignSet = DAG.getSetCC(
	dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
	Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

	// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
	SDValue FudgePtr = DAG.getConstantPool(
	ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

	// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
	SDValue Zero = DAG.getIntPtrConstant(0, dl);
	SDValue Four = DAG.getIntPtrConstant(4, dl);
	SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
	FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

	// Load the value out, extending it from f32 to f80.
	// FIXME: Avoid the extend by constructing the right constant pool?
	SDValue Fudge = DAG.getExtLoad(
	ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	/* Alignment = */ 4);
	// Extend everything to 80 bits to force it to be done on x87.
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
	// Round the f80 sum to the requested destination type.
	return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
	DAG.getIntPtrConstant(0, dl));
	}

	// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
	// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
	// just return an <SDValue(), SDValue()> pair.
	// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
	// to i16, i32 or i64, and we lower it to a legal sequence.
	// If lowered to the final integer result we return a <result, SDValue()> pair.
	// Otherwise we lower it to a sequence ending with a FIST, return a
	// <FIST, StackSlot> pair, and the caller is responsible for loading
	// the final integer result from StackSlot.
	//
	// Parameters:
	//   Op        - the FP_TO_SINT / FP_TO_UINT node; operand 0 is the FP source.
	//   DAG       - the SelectionDAG in which new nodes are created.
	//   IsSigned  - true for FP_TO_SINT, false for FP_TO_UINT.
	//   IsReplace - only consulted on the unsigned-i64 fixup path when the
	//               subtarget is not 64-bit: if true the two i32 halves are
	//               joined into an i64 with BUILD_PAIR, otherwise they are
	//               returned through getMergeValues.
	std::pair<SDValue,SDValue>
	X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	bool IsSigned, bool IsReplace) const {
	SDLoc DL(Op);

	EVT DstTy = Op.getValueType();
	EVT TheVT = Op.getOperand(0).getValueType();
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
	// f16 must be promoted before using the lowering in this routine.
	// fp128 does not use this lowering.
	return std::make_pair(SDValue(), SDValue());
	}

	// If using FIST to compute an unsigned i64, we'll need some fixup
	// to handle values above the maximum signed i64. A FIST is always
	// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
	bool UnsignedFixup = !IsSigned &&
	DstTy == MVT::i64 &&
	(!Subtarget.is64Bit() \|\|
	!isScalarFPTypeInSSEReg(TheVT));

	if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
	// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
	// The low 32 bits of the fist result will have the correct uint32 result.
	assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
	DstTy = MVT::i64;
	}

	assert(DstTy.getSimpleVT() <= MVT::i64 &&
	DstTy.getSimpleVT() >= MVT::i16 &&
	"Unknown FP_TO_INT to lower!");

	// These are really Legal.
	if (DstTy == MVT::i32 &&
	isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	return std::make_pair(SDValue(), SDValue());
	if (Subtarget.is64Bit() &&
	DstTy == MVT::i64 &&
	isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	return std::make_pair(SDValue(), SDValue());

	// We lower FP->int64 into FISTP64 followed by a load from a temporary
	// stack slot.
	// The slot is sized and aligned to the (possibly widened) destination type.
	MachineFunction &MF = DAG.getMachineFunction();
	unsigned MemSize = DstTy.getSizeInBits()/8;
	int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

	// Pick the store-to-memory conversion opcode matching the destination width.
	unsigned Opc;
	switch (DstTy.getSimpleVT().SimpleTy) {
	default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
	case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
	case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
	case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
	}

	SDValue Chain = DAG.getEntryNode();
	SDValue Value = Op.getOperand(0);
	SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

	if (UnsignedFixup) {
	//
	// Conversion to unsigned i64 is implemented with a select,
	// depending on whether the source value fits in the range
	// of a signed i64. Let Thresh be the FP equivalent of
	// 0x8000000000000000ULL.
	//
	// Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
	// FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
	// Fist-to-mem64 FistSrc
	// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
	// to XOR'ing the high 32 bits with Adjust.
	//
	// Being a power of 2, Thresh is exactly representable in all FP formats.
	// For X87 we'd like to use the smallest FP type for this constant, but
	// for DAG type consistency we have to match the FP operand type.

	// 0x5f000000 is the IEEE-754 single-precision encoding of 2^63.
	APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
	LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
	bool LosesInfo = false;
	if (TheVT == MVT::f64)
	// The rounding mode is irrelevant as the conversion should be exact.
	Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
	&LosesInfo);
	else if (TheVT == MVT::f80)
	Status = Thresh.convert(APFloat::x87DoubleExtended(),
	APFloat::rmNearestTiesToEven, &LosesInfo);

	assert(Status == APFloat::opOK && !LosesInfo &&
	"FP conversion should have been exact");

	SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

	SDValue Cmp = DAG.getSetCC(DL,
	getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(), TheVT),
	Value, ThreshVal, ISD::SETLT);
	Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
	DAG.getConstant(0, DL, MVT::i32),
	DAG.getConstant(0x80000000, DL, MVT::i32));
	SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
	// NOTE(review): this getSetCC is called with the same operands as the one
	// that initialised Cmp above; it looks redundant.
	Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(), TheVT),
	Value, ThreshVal, ISD::SETLT);
	Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
	}

	// FIXME This causes a redundant load/store if the SSE-class value is already
	// in memory, such as if it is on the callstack.
	if (isScalarFPTypeInSSEReg(TheVT)) {
	// Store the SSE value to the stack slot and reload it with an X86ISD::FLD,
	// then allocate a second slot to receive the FIST result.
	assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
	Chain = DAG.getStore(Chain, DL, Value, StackSlot,
	MachinePointerInfo::getFixedStack(MF, SSFI));
	SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
	SDValue Ops[] = {
	Chain, StackSlot, DAG.getValueType(TheVT)
	};

	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	MachineMemOperand::MOLoad, MemSize, MemSize);
	Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
	Chain = Value.getValue(1);
	SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	}

	// Memory operand describing the FIST store into the current stack slot.
	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	MachineMemOperand::MOStore, MemSize, MemSize);

	if (UnsignedFixup) {

	// Insert the FIST, load its result as two i32's,
	// and XOR the high i32 with Adjust.

	SDValue FistOps[] = { Chain, Value, StackSlot };
	SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	FistOps, DstTy, MMO);

	SDValue Low32 =
	DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
	SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

	SDValue High32 =
	DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
	High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

	if (Subtarget.is64Bit()) {
	// Join High32 and Low32 into a 64-bit result.
	// (High32 << 32) | Low32
	Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
	High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
	High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
	DAG.getConstant(32, DL, MVT::i8));
	SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
	return std::make_pair(Result, SDValue());
	}

	SDValue ResultOps[] = { Low32, High32 };

	// Not a 64-bit subtarget: hand back the two halves, either joined into one
	// i64 node or as a pair of merged values, depending on IsReplace.
	SDValue pair = IsReplace
	? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
	: DAG.getMergeValues(ResultOps, DL);
	return std::make_pair(pair, SDValue());
	} else {
	// Build the FP_TO_INT*_IN_MEM
	SDValue Ops[] = { Chain, Value, StackSlot };
	SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	Ops, DstTy, MMO);
	return std::make_pair(FIST, StackSlot);
	}
	}

	/// Lower a vector integer extension for a fixed set of (result, source) type
	/// pairs.
	///
	/// Returns an empty SDValue for any other type pair. With Int256 support a
	/// single X86ISD::VZEXT is emitted. Otherwise the source is unpacked
	/// low/high against a zero vector (for ISD::ZERO_EXTEND) or an undef vector
	/// (for any other opcode), each half is bitcast to half of the result type,
	/// and the halves are concatenated.
	static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  const MVT VT = Op->getSimpleValueType(0);
	  SDValue Src = Op->getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();

	  // True when this node extends From -> To.
	  auto Extends = [&](MVT To, MVT From) { return VT == To && SrcVT == From; };

	  if (!Extends(MVT::v4i64, MVT::v4i32) &&
	      !Extends(MVT::v8i32, MVT::v8i16) &&
	      !Extends(MVT::v16i16, MVT::v16i8) &&
	      !Extends(MVT::v8i64, MVT::v8i32) &&
	      !Extends(MVT::v8i64, MVT::v8i16) &&
	      !Extends(MVT::v16i32, MVT::v16i16) &&
	      !Extends(MVT::v16i32, MVT::v16i8) &&
	      !Extends(MVT::v32i16, MVT::v32i8))
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VZEXT, dl, VT, Src);

	  // AVX-only strategy, e.g.:
	  //
	  //   v8i16 -> v8i32
	  //     vpunpcklwd handles the 4 lower elements  (v8i16 -> v4i32)
	  //     vpunpckhwd handles the 4 upper elements  (v8i16 -> v4i32)
	  //     then concatenate the two halves.
	  //
	  //   v4i32 -> v4i64
	  //     vpunpckldq handles the lower elements    (v4i32 -> v2i64)
	  //     vpunpckhdq handles the upper elements    (v4i32 -> v2i64)
	  //     then concatenate the two halves.
	  SDValue Zeros = getZeroVector(SrcVT, Subtarget, DAG, dl);
	  SDValue Undefs = DAG.getUNDEF(SrcVT);
	  const bool IsZExt = Op.getOpcode() == ISD::ZERO_EXTEND;
	  SDValue Filler = IsZExt ? Zeros : Undefs;

	  SDValue Lower = getUnpackl(DAG, dl, SrcVT, Src, Filler);
	  SDValue Upper = getUnpackh(DAG, dl, SrcVT, Src, Filler);

	  // Each unpacked half is reinterpreted as half of the result vector.
	  const MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                      VT.getVectorNumElements() / 2);
	  Lower = DAG.getBitcast(HalfVT, Lower);
	  Upper = DAG.getBitcast(HalfVT, Upper);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
	}

	/// Zero-extend a vector of i1 elements by selecting, per element, between a
	/// splat of 1 and a zero vector.
	///
	/// Without BWI, results with elements of 16 bits or fewer are computed with
	/// i32 elements and truncated afterwards. Without VLX, non-512-bit work is
	/// widened to 512 bits and the requested subvector is extracted at the end.
	static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  const MVT VT = Op->getSimpleValueType(0);
	  SDValue Mask = Op->getOperand(0);
	  MVT MaskVT = Mask.getSimpleValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	  unsigned NumElts = VT.getVectorNumElements();

	  // Decide the element type the select is performed in.
	  const bool PromoteElts =
	      !Subtarget.hasBWI() && VT.getVectorElementType().getSizeInBits() <= 16;
	  const MVT ExtVT = PromoteElts ? MVT::getVectorVT(MVT::i32, NumElts) : VT;

	  // Decide the vector width the select is performed in.
	  MVT WideVT = ExtVT;
	  const bool MustWiden = !ExtVT.is512BitVector() && !Subtarget.hasVLX();
	  if (MustWiden) {
	    NumElts *= 512 / ExtVT.getSizeInBits();
	    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	    // Place the original mask at index 0 of an undef mask of the wider type.
	    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT,
	                       DAG.getUNDEF(MaskVT), Mask,
	                       DAG.getIntPtrConstant(0, DL));
	    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
	  }

	  SDValue Ones = DAG.getConstant(1, DL, WideVT);
	  SDValue Zeros = getZeroVector(WideVT, Subtarget, DAG, DL);
	  SDValue Result = DAG.getSelect(DL, WideVT, Mask, Ones, Zeros);

	  // Undo the element promotion, if any.
	  if (VT != ExtVT) {
	    WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
	    Result = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Result);
	  }

	  // Undo the widening, if any.
	  if (WideVT != VT)
	    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
	                         DAG.getIntPtrConstant(0, DL));

	  return Result;
	}

	/// Custom lowering entry point for vector ISD::ZERO_EXTEND.
	///
	/// Sources with i1 elements go to LowerZERO_EXTEND_Mask. On subtargets with
	/// Fp256, LowerAVXExtend is tried next. If neither produces a value an empty
	/// SDValue is returned.
	static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  const MVT SrcVT = Op.getOperand(0).getSimpleValueType();

	  if (SrcVT.getVectorElementType() == MVT::i1)
	    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

	  if (Subtarget.hasFp256()) {
	    SDValue Extended = LowerAVXExtend(Op, DAG, Subtarget);
	    if (Extended.getNode())
	      return Extended;
	  }

	  // A 128-bit to 256-bit extension with matching element counts must not
	  // reach this point.
	  assert(!(Op.getSimpleValueType().is256BitVector() &&
	           SrcVT.is128BitVector() &&
	           Op.getSimpleValueType().getVectorNumElements() ==
	               SrcVT.getVectorNumElements()));
	  return SDValue();
	}

	/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
	/// It makes use of the fact that vectors with enough leading sign/zero bits
	/// prevent the PACKSS/PACKUS from saturating the results.
	/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
	/// within each 128-bit lane.
	///
	/// \param Opcode    Either X86ISD::PACKSS or X86ISD::PACKUS (asserted).
	/// \param DstVT     The vector type to truncate to; must have the same number
	///                  of elements as \p In (asserted).
	/// \param In        The vector value to truncate.
	/// \param DL        Debug location attached to the created nodes.
	/// \param DAG       The SelectionDAG in which new nodes are created.
	/// \param Subtarget Used for the SSE2 / SSE4.1 / AVX2 / AVX512 feature checks.
	/// \returns \p In itself when it already has type \p DstVT; an empty SDValue
	///          when the subtarget lacks SSE2 or has AVX512, when the destination
	///          size is not a multiple of 128 bits, or when the source size is not
	///          a multiple of 256 bits; otherwise the packed result.
	static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
	const SDLoc &DL, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert((Opcode == X86ISD::PACKSS \|\| Opcode == X86ISD::PACKUS) &&
	"Unexpected PACK opcode");

	// Requires SSE2 but AVX512 has fast truncate.
	if (!Subtarget.hasSSE2() \|\| Subtarget.hasAVX512())
	return SDValue();

	EVT SrcVT = In.getValueType();

	// No truncation required, we might get here due to recursive calls.
	if (SrcVT == DstVT)
	return In;

	// We only support vector truncation to 128bits or greater from a
	// 256bits or greater source.
	unsigned DstSizeInBits = DstVT.getSizeInBits();
	unsigned SrcSizeInBits = SrcVT.getSizeInBits();
	if ((DstSizeInBits % 128) != 0 \|\| (SrcSizeInBits % 256) != 0)
	return SDValue();

	LLVMContext &Ctx = *DAG.getContext();
	unsigned NumElems = SrcVT.getVectorNumElements();
	assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
	assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

	// Scalar type with half the source element width, i.e. the element type
	// after one PACK stage.
	EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

	// Extract lower/upper subvectors.
	unsigned NumSubElts = NumElems / 2;
	SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
	SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

	// Pack to the largest type possible:
	// vXi64/vXi32 -> PACKSDW and vXi16 -> PACKSWB.
	// The i32 -> i16 form is only chosen for PACKSS, or for PACKUS with SSE4.1.
	EVT InVT = MVT::i16, OutVT = MVT::i8;
	if (DstVT.getScalarSizeInBits() > 8 &&
	(Opcode == X86ISD::PACKSS \|\| Subtarget.hasSSE41())) {
	InVT = MVT::i32;
	OutVT = MVT::i16;
	}

	// Turn the scalar choices into vector types spanning half the source width.
	unsigned SubSizeInBits = SrcSizeInBits / 2;
	InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
	OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

	// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
	if (SrcVT.is256BitVector()) {
	Lo = DAG.getBitcast(InVT, Lo);
	Hi = DAG.getBitcast(InVT, Hi);
	SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
	return DAG.getBitcast(DstVT, Res);
	}

	// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
	// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
	if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
	Lo = DAG.getBitcast(InVT, Lo);
	Hi = DAG.getBitcast(InVT, Hi);
	SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

	// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
	// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
	Res = DAG.getBitcast(MVT::v4i64, Res);
	Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

	if (DstVT.is256BitVector())
	return DAG.getBitcast(DstVT, Res);

	// If 512bit -> 128bit truncate another stage.
	EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	Res = DAG.getBitcast(PackedVT, Res);
	return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	// Recursively pack lower/upper subvectors, concat result and pack again.
	assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
	EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
	Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
	Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

	PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
	return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	/// Lower a vector truncate whose result element type is i1.
	///
	/// The least significant bit of every input element is shifted into the sign
	/// position (the shift is skipped when ComputeNumSignBits reports that every
	/// bit of an element already equals its sign bit). The mask is then produced
	/// with X86ISD::CVT2MASK when the input elements are at most 16 bits wide and
	/// BWI is available, and with X86ISD::TESTM otherwise. Without BWI, narrow
	/// elements are first sign-extended: to i32 with VLX, otherwise to the
	/// element width that makes the vector 512 bits wide.
	///
	/// \param Op        The truncate node; operand 0 is the wide input vector.
	/// \param DAG       The SelectionDAG in which new nodes are created.
	/// \param Subtarget Used for the BWI and VLX feature checks.
	static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {

	SDLoc DL(Op);
	MVT VT = Op.getSimpleValueType();
	SDValue In = Op.getOperand(0);
	MVT InVT = In.getSimpleValueType();

	assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

	// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
	unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
	if (InVT.getScalarSizeInBits() <= 16) {
	if (Subtarget.hasBWI()) {
	// legal, will go to VPMOVB2M, VPMOVW2M
	if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	// We need to shift to get the lsb into sign position.
	// Shift packed bytes not supported natively, bitcast to word
	MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
	In = DAG.getNode(ISD::SHL, DL, ExtVT,
	DAG.getBitcast(ExtVT, In),
	DAG.getConstant(ShiftInx, DL, ExtVT));
	In = DAG.getBitcast(InVT, In);
	}
	return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In);
	}
	// Use TESTD/Q, extended vector to packed dword/qword.
	assert((InVT.is256BitVector() \|\| InVT.is128BitVector()) &&
	"Unexpected vector type.");
	unsigned NumElts = InVT.getVectorNumElements();
	MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
	MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
	In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
	InVT = ExtVT;
	// Recompute the shift amount for the wider element type.
	ShiftInx = InVT.getScalarSizeInBits() - 1;
	}

	if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	// We need to shift to get the lsb into sign position.
	In = DAG.getNode(ISD::SHL, DL, InVT, In,
	DAG.getConstant(ShiftInx, DL, InVT));
	}
	return DAG.getNode(X86ISD::TESTM, DL, VT, In, In);
	}

	/// Custom lowering for vector ISD::TRUNCATE.
	///
	/// The strategies are tried in this order:
	///  - Results with i1 elements are handled by LowerTruncateVecI1.
	///  - With AVX512, emit X86ISD::VTRUNC (a v16i16 input is first extended to
	///    v16i32 when BWI is not available).
	///  - PACKSS, then PACKUS, via truncateVectorWithPACK when the input has
	///    enough known sign bits / leading zero bits.
	///  - Dedicated shuffle sequences for v4i64 -> v4i32 and v8i32 -> v8i16.
	///  - A generic shuffle plus EXTRACT_SUBVECTOR for the remaining 256-bit to
	///    128-bit truncations.
	///
	/// \param Op  The TRUNCATE node; operand 0 is the vector being truncated.
	/// \param DAG The SelectionDAG in which new nodes are created.
	/// \returns The lowered value, or an empty SDValue when none of the above
	///          applies and the node is not a 256-bit to 128-bit truncation.
	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	SDLoc DL(Op);
	MVT VT = Op.getSimpleValueType();
	SDValue In = Op.getOperand(0);
	MVT InVT = In.getSimpleValueType();
	unsigned InNumEltBits = InVT.getScalarSizeInBits();

	assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	"Invalid TRUNCATE operation");

	if (VT.getVectorElementType() == MVT::i1)
	return LowerTruncateVecI1(Op, DAG, Subtarget);

	// vpmovqb/w/d, vpmovdb/w, vpmovwb
	if (Subtarget.hasAVX512()) {
	// word to byte only under BWI
	if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
	return DAG.getNode(X86ISD::VTRUNC, DL, VT,
	getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
	return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
	}

	// Truncate with PACKSS if we are truncating a vector with sign-bits that
	// extend all the way to the packed/truncated value.
	unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
	if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
	if (SDValue V =
	truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
	return V;

	// Truncate with PACKUS if we are truncating a vector with leading zero bits
	// that extend all the way to the packed/truncated value.
	// Pre-SSE41 we can only use PACKUSWB.
	KnownBits Known;
	DAG.computeKnownBits(In, Known);
	NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
	if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
	if (SDValue V =
	truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
	return V;

	if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
	// On AVX2, v4i64 -> v4i32 becomes VPERMD.
	if (Subtarget.hasInt256()) {
	// Gather the even i32 elements (one per i64) into the low half, then
	// extract that half.
	static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
	In = DAG.getBitcast(MVT::v8i32, In);
	In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
	DAG.getIntPtrConstant(0, DL));
	}

	// Without Int256: split into two v2i64 halves, view each as v4i32 and pick
	// the even elements of the pair with a two-input shuffle.
	SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	DAG.getIntPtrConstant(0, DL));
	SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	DAG.getIntPtrConstant(2, DL));
	OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
	static const int ShufMask[] = {0, 2, 4, 6};
	return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
	}

	if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
	// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
	if (Subtarget.hasInt256()) {
	In = DAG.getBitcast(MVT::v32i8, In);

	// The PSHUFB mask:
	static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
	-1, -1, -1, -1, -1, -1, -1, -1,
	16, 17, 20, 21, 24, 25, 28, 29,
	-1, -1, -1, -1, -1, -1, -1, -1 };
	In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
	In = DAG.getBitcast(MVT::v4i64, In);

	// Bring i64 elements 0 and 2 together in the low 128 bits and extract them.
	static const int ShufMask2[] = {0, 2, -1, -1};
	In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
	In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getBitcast(VT, In);
	}

	SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	DAG.getIntPtrConstant(0, DL));

	SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	DAG.getIntPtrConstant(4, DL));

	OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
	OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

	// The PSHUFB mask:
	static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
	-1, -1, -1, -1, -1, -1, -1, -1};

	OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
	OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

	OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

	// The MOVLHPS Mask:
	static const int ShufMask2[] = {0, 1, 4, 5};
	SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
	return DAG.getBitcast(MVT::v8i16, res);
	}

	// Handle truncation of V256 to V128 using shuffles.
	if (!VT.is128BitVector() \|\| !InVT.is256BitVector())
	return SDValue();

	assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

	// View the input as twice as many elements of the result element type.
	unsigned NumElems = VT.getVectorNumElements();
	MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

	SmallVector<int, 16> MaskVec(NumElems * 2, -1);
	// Prepare truncation shuffle mask
	for (unsigned i = 0; i != NumElems; ++i)
	MaskVec[i] = i * 2;
	In = DAG.getBitcast(NVT, In);
	SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
	DAG.getIntPtrConstant(0, DL));
	}

	/// Custom lowering for ISD::FP_TO_SINT and ISD::FP_TO_UINT.
	///
	/// Vector results:
	///  - v2f64 -> v2i1 is converted through a wider integer vector (v4i32, or
	///    v8i32 via a 512-bit FP_TO_UINT when the conversion is unsigned and VLX
	///    is unavailable), truncated to i1 elements, and the low v2i1 extracted.
	///  - v2f32 -> v2i64 widens the source to v4f32 with undef and emits
	///    X86ISD::CVTTP2SI / CVTTP2UI.
	///  - Any other vector type yields an empty SDValue.
	///
	/// Scalar results are delegated to FP_TO_INTHelper:
	///  - If the helper returns no node, the operation is legal and \p Op is
	///    returned unchanged.
	///  - If it returns a stack slot, the result is loaded from that slot,
	///    chained on the returned FIST node.
	///  - Otherwise the returned node already is the result.
	///
	/// \param Op  The FP_TO_SINT / FP_TO_UINT node; operand 0 is the FP source.
	/// \param DAG The SelectionDAG in which new nodes are created.
	SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
	  MVT VT = Op.getSimpleValueType();

	  if (VT.isVector()) {
	    SDValue Src = Op.getOperand(0);
	    SDLoc dl(Op);

	    if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
	      MVT ResVT = MVT::v4i32;
	      MVT TruncVT = MVT::v4i1;
	      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	      if (!IsSigned && !Subtarget.hasVLX()) {
	        // Widen to 512-bits.
	        ResVT = MVT::v8i32;
	        TruncVT = MVT::v8i1;
	        Opc = ISD::FP_TO_UINT;
	        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	                          DAG.getUNDEF(MVT::v8f64),
	                          Src, DAG.getIntPtrConstant(0, dl));
	      }
	      SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
	      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
	      // Only the two low mask elements correspond to the original source.
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
	                         DAG.getIntPtrConstant(0, dl));
	    }

	    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
	      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
	                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                     DAG.getUNDEF(MVT::v2f32)));
	    }

	    return SDValue();
	  }

	  assert(!VT.isVector());

	  std::pair<SDValue, SDValue> Vals =
	      FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/false);
	  SDValue FIST = Vals.first, StackSlot = Vals.second;
	  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
	  if (!FIST.getNode())
	    return Op;

	  if (StackSlot.getNode())
	    // Load the result.
	    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

	  // The node is the result.
	  return FIST;
	}

	/// Custom lowering for ISD::FP_EXTEND of a v2f32 source.
	///
	/// The source is concatenated with an undef v2f32 to form a v4f32, which is
	/// then extended to the result type with X86ISD::VFPEXT.
	static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();
	  const MVT DstVT = Op.getSimpleValueType();

	  assert(SrcVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

	  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
	                                DAG.getUNDEF(SrcVT));
	  return DAG.getNode(X86ISD::VFPEXT, DL, DstVT, Widened);
	}

	/// The only differences between FABS and FNEG are the mask and the logic op.
	/// FNEG also has a folding opportunity for FNEG(FABS(x)).
	///
	/// FABS is lowered to an X86ISD::FAND with a mask that clears the sign bit,
	/// FNEG to an X86ISD::FXOR with the sign mask, and FNEG(FABS(x)) to an
	/// X86ISD::FOR of x with the sign mask. Vector and f128 operands use the
	/// logic op directly; other scalars are placed in a 128-bit vector, operated
	/// on, and element 0 is extracted.
	///
	/// \param Op  The ISD::FABS or ISD::FNEG node (asserted).
	/// \param DAG The SelectionDAG in which new nodes are created.
	/// \returns The lowered value, or \p Op unchanged when it is a FABS that has
	///          an FNEG user.
	static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
	assert((Op.getOpcode() == ISD::FABS \|\| Op.getOpcode() == ISD::FNEG) &&
	"Wrong opcode for lowering FABS or FNEG.");

	bool IsFABS = (Op.getOpcode() == ISD::FABS);

	// If this is a FABS and it has an FNEG user, bail out to fold the combination
	// into an FNABS. We'll lower the FABS after that if it is still in use.
	if (IsFABS)
	for (SDNode *User : Op->uses())
	if (User->getOpcode() == ISD::FNEG)
	return Op;

	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();

	bool IsF128 = (VT == MVT::f128);

	// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
	// decide if we should generate a 16-byte constant mask when we only need 4 or
	// 8 bytes for the scalar case.

	// LogicVT is the type the logic op is performed in; EltVT is the type whose
	// width and FP semantics determine the mask constant.
	MVT LogicVT;
	MVT EltVT;

	if (VT.isVector()) {
	LogicVT = VT;
	EltVT = VT.getVectorElementType();
	} else if (IsF128) {
	// SSE instructions are used for optimized f128 logical operations.
	LogicVT = MVT::f128;
	EltVT = VT;
	} else {
	// There are no scalar bitwise logical SSE/AVX instructions, so we
	// generate a 16-byte vector constant and logic op even for the scalar case.
	// Using a 16-byte mask allows folding the load of the mask with
	// the logic op, so it can save (~4 bytes) on code size.
	LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
	EltVT = VT;
	}

	unsigned EltBits = EltVT.getSizeInBits();
	// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
	APInt MaskElt =
	IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
	const fltSemantics &Sem =
	EltVT == MVT::f64 ? APFloat::IEEEdouble() :
	(IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
	SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

	// An FNEG whose operand is a FABS is folded: OR the sign mask into the
	// FABS operand directly.
	SDValue Op0 = Op.getOperand(0);
	bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
	unsigned LogicOp =
	IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
	SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

	if (VT.isVector() \|\| IsF128)
	return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

	// For the scalar case extend to a 128-bit vector, perform the logic op,
	// and extract the scalar result back out.
	Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
	SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
	DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for ISD::FCOPYSIGN using the X86 FP logic nodes.
	///
	/// The result is (Mag & ~SignMask) | (Sign & SignMask), built from
	/// X86ISD::FAND and X86ISD::FOR. The sign operand is first extended or
	/// rounded to the result type if its width differs. Scalar f32/f64 values are
	/// processed as v4f32/v2f64 and element 0 is extracted at the end; vector and
	/// f128 values are processed in their own type.
	///
	/// \param Op  The FCOPYSIGN node; operand 0 is the magnitude, operand 1 the
	///            sign source.
	/// \param DAG The SelectionDAG in which new nodes are created.
	static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
	SDValue Mag = Op.getOperand(0);
	SDValue Sign = Op.getOperand(1);
	SDLoc dl(Op);

	// If the sign operand is smaller, extend it first.
	MVT VT = Op.getSimpleValueType();
	if (Sign.getSimpleValueType().bitsLT(VT))
	Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

	// And if it is bigger, shrink it first.
	if (Sign.getSimpleValueType().bitsGT(VT))
	Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

	// At this point the operands and the result should have the same
	// type, and that won't be f80 since that is not custom lowered.
	bool IsF128 = (VT == MVT::f128);
	assert((VT == MVT::f64 \|\| VT == MVT::f32 \|\| VT == MVT::f128 \|\|
	VT == MVT::v2f64 \|\| VT == MVT::v4f64 \|\| VT == MVT::v4f32 \|\|
	VT == MVT::v8f32 \|\| VT == MVT::v8f64 \|\| VT == MVT::v16f32) &&
	"Unexpected type in LowerFCOPYSIGN");

	// FP semantics used to build the mask constants.
	MVT EltVT = VT.getScalarType();
	const fltSemantics &Sem =
	EltVT == MVT::f64 ? APFloat::IEEEdouble()
	: (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

	// Perform all scalar logic operations as 16-byte vectors because there are no
	// scalar FP logic instructions in SSE.
	// TODO: This isn't necessary. If we used scalar types, we might avoid some
	// unnecessary splats, but we might miss load folding opportunities. Should
	// this decision be based on OptimizeForSize?
	bool IsFakeVector = !VT.isVector() && !IsF128;
	MVT LogicVT = VT;
	if (IsFakeVector)
	LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

	// The mask constants are automatically splatted for vector types.
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	SDValue SignMask = DAG.getConstantFP(
	APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
	SDValue MagMask = DAG.getConstantFP(
	APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

	// First, clear all bits but the sign bit from the second operand (sign).
	if (IsFakeVector)
	Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
	SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

	// Next, clear the sign bit from the first operand (magnitude).
	// TODO: If we had general constant folding for FP logic ops, this check
	// wouldn't be necessary.
	SDValue MagBits;
	if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
	// Constant magnitude: clear the sign on the APFloat and emit the constant
	// directly in the logic type.
	APFloat APF = Op0CN->getValueAPF();
	APF.clearSign();
	MagBits = DAG.getConstantFP(APF, dl, LogicVT);
	} else {
	// If the magnitude operand wasn't a constant, we need to AND out the sign.
	if (IsFakeVector)
	Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
	MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
	}

	// OR the magnitude value with the sign bit.
	SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
	return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
	DAG.getIntPtrConstant(0, dl));
	}

	/// Lower ISD::FGETSIGN of an f32/f64 operand to
	/// (AND (X86ISD::MOVMSK ...), 1) in the result type of \p Op.
	static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  const MVT ResVT = Op.getSimpleValueType();
	  const MVT SrcVT = Src.getSimpleValueType();
	  assert((SrcVT == MVT::f32 || SrcVT == MVT::f64) &&
	         "Unexpected type for FGETSIGN");

	  // Place the scalar in a 128-bit vector so MOVMSK can read its sign bit.
	  MVT VecVT = MVT::v2f64;
	  if (SrcVT == MVT::f32)
	    VecVT = MVT::v4f32;

	  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Src);
	  SDValue SignMask = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Vec);
	  SDValue Resized = DAG.getZExtOrTrunc(SignMask, dl, ResVT);

	  // Only bit 0 (the sign of element 0) is of interest.
	  return DAG.getNode(ISD::AND, dl, ResVT, Resized,
	                     DAG.getConstant(1, dl, ResVT));
	}

	/// Check whether an OR'd tree is PTEST-able.
	///
	/// \p Op must be an ISD::OR node. The tree of ORs rooted at it is walked
	/// breadth-first; every leaf has to be an EXTRACT_VECTOR_ELT with a constant
	/// index, all source vectors must share one 128- or 256-bit type, and every
	/// element of every source vector has to be extracted. When that holds the
	/// source vectors are bitcast to v2i64/v4i64, OR'd together, and an
	/// X86ISD::PTEST of the combined vector against itself is returned.
	///
	/// \returns the PTEST node, or an empty SDValue when SSE4.1 is unavailable,
	/// \p Op has more than one use, or the pattern does not match.
	static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  if (!Op->hasOneUse())
	    return SDValue();

	  SDNode *N = Op.getNode();
	  SDLoc DL(N);

	  SmallVector<SDValue, 8> Opnds;
	  // Per source vector: bit i is set once element i has been extracted. The
	  // mask is 64 bits wide so that a 32-element vector (v32i8) never requires
	  // a shift by the full width of the mask type, which would be undefined.
	  DenseMap<SDValue, uint64_t> VecInMap;
	  SmallVector<SDValue, 8> VecIns;
	  EVT VT = MVT::Other;

	  // Recognize a special case where a vector is casted into wide integer to
	  // test all 0s.
	  Opnds.push_back(N->getOperand(0));
	  Opnds.push_back(N->getOperand(1));

	  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
	    // Work on a copy of the operand: the push_back calls below may
	    // reallocate Opnds, which would invalidate an iterator into it.
	    SDValue Cur = Opnds[Slot];

	    // BFS traverse all OR'd operands.
	    if (Cur.getOpcode() == ISD::OR) {
	      Opnds.push_back(Cur.getOperand(0));
	      Opnds.push_back(Cur.getOperand(1));
	      // Re-evaluate the number of nodes to be traversed.
	      e += 2; // 2 more nodes (LHS and RHS) are pushed.
	      continue;
	    }

	    // Quit if a non-EXTRACT_VECTOR_ELT
	    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // Quit if without a constant index.
	    ConstantSDNode *IdxC = dyn_cast<ConstantSDNode>(Cur.getOperand(1));
	    if (!IdxC)
	      return SDValue();

	    SDValue ExtractedFromVec = Cur.getOperand(0);
	    DenseMap<SDValue, uint64_t>::iterator M = VecInMap.find(ExtractedFromVec);
	    if (M == VecInMap.end()) {
	      VT = ExtractedFromVec.getValueType();
	      // Quit if not 128/256-bit vector.
	      if (!VT.is128BitVector() && !VT.is256BitVector())
	        return SDValue();
	      // Quit if not the same type.
	      if (VecInMap.begin() != VecInMap.end() &&
	          VT != VecInMap.begin()->first.getValueType())
	        return SDValue();
	      // Quit if the element count cannot be tracked in the 64-bit mask.
	      if (VT.getVectorNumElements() >= 64)
	        return SDValue();
	      M = VecInMap.insert(std::make_pair(ExtractedFromVec, uint64_t(0))).first;
	      VecIns.push_back(ExtractedFromVec);
	    }

	    // Quit on an out-of-range index; it can never contribute to a full
	    // mask and must not be used as a shift amount.
	    uint64_t Idx = IdxC->getZExtValue();
	    if (Idx >= ExtractedFromVec.getValueType().getVectorNumElements())
	      return SDValue();
	    M->second |= uint64_t(1) << Idx;
	  }

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Not extracted from 128-/256-bit vector.");

	  const uint64_t FullMask = (uint64_t(1) << VT.getVectorNumElements()) - 1;

	  for (DenseMap<SDValue, uint64_t>::const_iterator
	           I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
	    // Quit if not all elements are used.
	    if (I->second != FullMask)
	      return SDValue();
	  }

	  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

	  // Cast all vectors into TestVT for PTEST.
	  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
	    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

	  // If more than one full vector is evaluated, OR them first before PTEST.
	  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
	    // Each iteration will OR 2 nodes and append the result until there is only
	    // 1 node left, i.e. the final OR'd value of all vectors.
	    SDValue LHS = VecIns[Slot];
	    SDValue RHS = VecIns[Slot + 1];
	    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
	  }

	  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
	}

	/// \brief return true if \c Op has a use that doesn't just read flags.
	///
	/// A user counts as flag-only when it is a BRCOND, a SETCC, or a SELECT that
	/// uses the value as operand 0. A single-use TRUNCATE user is looked through
	/// and its own user is classified instead.
	static bool hasNonFlagsUse(SDValue Op) {
	  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
	       ++UI) {
	    SDNode *User = *UI;
	    unsigned UOpNo = UI.getOperandNo();
	    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
	      // Look past truncate.
	      UOpNo = User->use_begin().getOperandNo();
	      User = *User->use_begin();
	    }

	    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
	        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
	      return true;
	  }
	  return false;
	}

	/// Emit a KTEST node for an AVX-512 bit vector.
	///
	/// Applies only when \p Op is a BITCAST whose source is a vector of i1.
	/// Masks of 8 or 16 bits require DQI; masks of 32 or 64 bits require BWI.
	/// \returns the KTEST node, or an empty SDValue when not applicable.
	static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
	                         const X86Subtarget &Subtarget) {
	  if (Op.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getValueType().getSimpleVT();
	  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  // Decide from the mask width whether the subtarget can test it.
	  bool Supported = false;
	  switch (SrcVT.getSizeInBits()) {
	  case 8:
	  case 16:
	    Supported = Subtarget.hasDQI();
	    break;
	  case 32:
	  case 64:
	    Supported = Subtarget.hasBWI();
	    break;
	  default:
	    break;
	  }
	  if (!Supported)
	    return SDValue();

	  return DAG.getNode(X86ISD::KTEST, SDLoc(Op), SrcVT, Src, Src);
	}

	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent.
	///
	/// \param Op    the value to be tested against zero.
	/// \param X86CC the X86 condition code that will consume the flags; it
	///              decides whether CF and/or OF have to be meaningful.
	/// \param dl    debug location used for all created nodes.
	/// \param DAG   the selection DAG the nodes are created in.
	/// \returns an i32 flags value: either a compare of \p Op with 0 (or a KTEST
	/// for bit vectors), or result number 1 of a flag-producing X86ISD
	/// arithmetic node that replaces \p Op.
	SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG) const {
	// An i1 operand is zero-extended to i8 and compared against 0.
	if (Op.getValueType() == MVT::i1) {
	SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
	DAG.getConstant(0, dl, MVT::i8));
	}
	// CF and OF aren't always set the way we want. Determine which
	// of these we need.
	bool NeedCF = false;
	bool NeedOF = false;
	switch (X86CC) {
	default: break;
	case X86::COND_A: case X86::COND_AE:
	case X86::COND_B: case X86::COND_BE:
	NeedCF = true;
	break;
	case X86::COND_G: case X86::COND_GE:
	case X86::COND_L: case X86::COND_LE:
	case X86::COND_O: case X86::COND_NO: {
	// Check if we really need to set the
	// Overflow flag. If NoSignedWrap is present
	// that is not actually needed.
	switch (Op->getOpcode()) {
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	case ISD::SHL:
	if (Op.getNode()->getFlags().hasNoSignedWrap())
	break;
	LLVM_FALLTHROUGH;
	default:
	NeedOF = true;
	break;
	}
	break;
	}
	}
	// See if we can use the EFLAGS value from the operand instead of
	// doing a separate TEST. TEST always sets OF and CF to 0, so unless
	// we prove that the arithmetic won't overflow, we can't use OF or CF.
	if (Op.getResNo() != 0 \|\| NeedOF \|\| NeedCF) {
	// Emit KTEST for bit vectors
	if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	return Node;
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	// Opcode stays 0 when no flag-producing replacement node is chosen;
	// NumOperands is the number of leading operands of Op copied into it.
	unsigned Opcode = 0;
	unsigned NumOperands = 0;

	// Truncate operations may prevent the merge of the SETCC instruction
	// and the arithmetic instruction before it. Attempt to truncate the operands
	// of the arithmetic instruction and use a reduced bit-width instruction.
	bool NeedTruncation = false;
	SDValue ArithOp = Op;
	if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
	SDValue Arith = Op->getOperand(0);
	// Both the trunc and the arithmetic op need to have one user each.
	if (Arith->hasOneUse())
	switch (Arith.getOpcode()) {
	default: break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	NeedTruncation = true;
	ArithOp = Arith;
	}
	}
	}

	// Sometimes flags can be set either with an AND or with an SRL/SHL
	// instruction. SRL/SHL variant should be preferred for masks longer than this
	// number of bits.
	const int ShiftToAndMaxMaskWidth = 32;
	// True when only equality with zero (COND_E / COND_NE) is being tested.
	const bool ZeroCheck = (X86CC == X86::COND_E \|\| X86CC == X86::COND_NE);

	// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
	// which may be the result of a CAST. We use the variable 'Op', which is the
	// non-casted variable when we check for possible users.
	switch (ArithOp.getOpcode()) {
	case ISD::ADD:
	// We only want to rewrite this as a target-specific node with attached
	// flags if there is a reasonable chance of either using that to do custom
	// instructions selection that can fold some of the memory operands, or if
	// only the flags are used. If there are other uses, leave the node alone
	// and emit a test instruction.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
	// An add of one will be selected as an INC.
	if (C->isOne() &&
	(!Subtarget.slowIncDec() \|\|
	DAG.getMachineFunction().getFunction().optForSize())) {
	Opcode = X86ISD::INC;
	NumOperands = 1;
	break;
	}

	// An add of negative one (subtract of one) will be selected as a DEC.
	if (C->isAllOnesValue() &&
	(!Subtarget.slowIncDec() \|\|
	DAG.getMachineFunction().getFunction().optForSize())) {
	Opcode = X86ISD::DEC;
	NumOperands = 1;
	break;
	}
	}

	// Otherwise use a regular EFLAGS-setting add.
	Opcode = X86ISD::ADD;
	NumOperands = 2;
	break;
	case ISD::SHL:
	case ISD::SRL:
	// If we have a constant logical shift that's only used in a comparison
	// against zero turn it into an equivalent AND. This allows turning it into
	// a TEST instruction later.
	if (ZeroCheck && Op->hasOneUse() &&
	isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
	EVT VT = Op.getValueType();
	unsigned BitWidth = VT.getSizeInBits();
	unsigned ShAmt = Op->getConstantOperandVal(1);
	if (ShAmt >= BitWidth) // Avoid undefined shifts.
	break;
	// The mask keeps exactly the source bits that survive the shift: the high
	// bits for SRL, the low bits for SHL.
	APInt Mask = ArithOp.getOpcode() == ISD::SRL
	? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
	: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
	if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break;
	Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
	DAG.getConstant(Mask, dl, VT));
	}
	break;

	case ISD::AND:
	// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
	// because a TEST instruction will be better. However, AND should be
	// preferred if the instruction can be combined into ANDN.
	if (!hasNonFlagsUse(Op)) {
	SDValue Op0 = ArithOp->getOperand(0);
	SDValue Op1 = ArithOp->getOperand(1);
	EVT VT = ArithOp.getValueType();
	bool isAndn = isBitwiseNot(Op0) \|\| isBitwiseNot(Op1);
	bool isLegalAndnType = VT == MVT::i32 \|\| VT == MVT::i64;
	bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

	// If we cannot select an ANDN instruction, check if we can replace
	// AND+IMM64 with a shift before giving up. This is possible for masks
	// like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
	if (!isProperAndn) {
	if (!ZeroCheck)
	break;

	assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
	auto *CN = dyn_cast<ConstantSDNode>(Op1);
	if (!CN)
	break;

	const APInt &Mask = CN->getAPIntValue();
	if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break; // Prefer TEST instruction.

	unsigned BitWidth = Mask.getBitWidth();
	unsigned LeadingOnes = Mask.countLeadingOnes();
	unsigned TrailingZeros = Mask.countTrailingZeros();

	// Mask of the form 1...10...0: test the high bits with a right shift.
	if (LeadingOnes + TrailingZeros == BitWidth) {
	assert(TrailingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
	break;
	}

	unsigned LeadingZeros = Mask.countLeadingZeros();
	unsigned TrailingOnes = Mask.countTrailingOnes();

	// Mask of the form 0...01...1: test the low bits with a left shift.
	if (LeadingZeros + TrailingOnes == BitWidth) {
	assert(LeadingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
	break;
	}

	break;
	}
	}
	LLVM_FALLTHROUGH;
	case ISD::SUB:
	case ISD::OR:
	case ISD::XOR:
	// Similar to ISD::ADD above, check if the uses will preclude useful
	// lowering of the target-specific node.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	// Otherwise use a regular EFLAGS-setting instruction.
	switch (ArithOp.getOpcode()) {
	default: llvm_unreachable("unexpected operator!");
	case ISD::SUB: Opcode = X86ISD::SUB; break;
	case ISD::XOR: Opcode = X86ISD::XOR; break;
	case ISD::AND: Opcode = X86ISD::AND; break;
	case ISD::OR: {
	// For a zero check on an untruncated OR, first try the vector all-zero
	// (PTEST) lowering.
	if (!NeedTruncation && ZeroCheck) {
	if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
	return EFLAGS;
	}
	Opcode = X86ISD::OR;
	break;
	}
	}

	NumOperands = 2;
	break;
	// The operand already is a flag-producing X86 node: use its result number 1.
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::INC:
	case X86ISD::DEC:
	case X86ISD::OR:
	case X86ISD::XOR:
	case X86ISD::AND:
	return SDValue(Op.getNode(), 1);
	default:
	default_case:
	break;
	}

	// If we found that truncation is beneficial, perform the truncation and
	// update 'Op'.
	if (NeedTruncation) {
	EVT VT = Op.getValueType();
	SDValue WideVal = Op->getOperand(0);
	EVT WideVT = WideVal.getValueType();
	unsigned ConvertedOp = 0;
	// Use a target machine opcode to prevent further DAGCombine
	// optimizations that may separate the arithmetic operations
	// from the setcc node.
	switch (WideVal.getOpcode()) {
	default: break;
	case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
	case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
	case ISD::AND: ConvertedOp = X86ISD::AND; break;
	case ISD::OR: ConvertedOp = X86ISD::OR; break;
	case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
	}

	if (ConvertedOp) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
	// Truncate both wide operands and redo the operation in the narrow type.
	SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
	SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
	Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
	}
	}
	}

	// No flag-producing replacement was selected: fall back to a compare.
	if (Opcode == 0) {
	// Emit KTEST for bit vectors
	if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	return Node;

	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	// Build the replacement node with two results (the value and the i32
	// flags), redirect all uses of Op to it, and return the flags result.
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

	SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
	DAG.ReplaceAllUsesWith(Op, New);
	return SDValue(New.getNode(), 1);
	}

	/// Emit nodes that will be selected as "cmp Op0,Op1", or something
	/// equivalent. The returned value is the i32 flags result.
	SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) const {
	  // A comparison against zero is handled as a test.
	  if (isNullConstant(Op1))
	    return EmitTest(Op0, X86CC, dl, DAG);

	  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
	         "Unexpected comparison operation for MVT::i1 operands");

	  const EVT CmpVT = Op0.getValueType();
	  const bool IsScalarInt = CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
	                           CmpVT == MVT::i32 || CmpVT == MVT::i64;
	  if (!IsScalarInt)
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);

	  // 16-bit immediates are to be avoided, so a 16-bit compare that involves
	  // a constant is promoted to i32 - unless optimizing for minimum size or
	  // targeting Atom.
	  if (CmpVT == MVT::i16 &&
	      (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1)) &&
	      !DAG.getMachineFunction().getFunction().optForMinSize() &&
	      !Subtarget.isAtom()) {
	    unsigned ExtendOp = ISD::SIGN_EXTEND;
	    if (isX86CCUnsigned(X86CC))
	      ExtendOp = ISD::ZERO_EXTEND;
	    Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
	    Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
	  }

	  // A SUB is emitted instead of a CMP so that SUB and CMP can be CSE'd; the
	  // flags are result number 1 of that node.
	  SDVTList ResultVTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
	  SDValue Sub = DAG.getNode(X86ISD::SUB, dl, ResultVTs, Op0, Op1);
	  return SDValue(Sub.getNode(), 1);
	}

	/// Convert a comparison if required by the subtarget.
	///
	/// Only floating-point X86ISD::CMP nodes on subtargets without CMOV are
	/// rewritten; every other input is returned unchanged.
	SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
	                                                 SelectionDAG &DAG) const {
	  // If the subtarget does not support the FUCOMI instruction, floating-point
	  // comparisons have to be converted. Anything else passes through.
	  if (Subtarget.hasCMov())
	    return Cmp;
	  if (Cmp.getOpcode() != X86ISD::CMP)
	    return Cmp;
	  const bool BothOperandsFP =
	      Cmp.getOperand(0).getValueType().isFloatingPoint() &&
	      Cmp.getOperand(1).getValueType().isFloatingPoint();
	  if (!BothOperandsFP)
	    return Cmp;

	  // Instruction selection will pick FUCOM rather than FUCOMI, and FUCOM
	  // leaves its result in FPSW instead of EFLAGS. Build the node sequence
	  // that moves the result over:
	  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
	  SDLoc dl(Cmp);
	  SDValue CmpAsI16 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
	  SDValue StatusWord =
	      DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, CmpAsI16);
	  SDValue ShiftAmt = DAG.getConstant(8, dl, MVT::i8);
	  SDValue HighByte =
	      DAG.getNode(ISD::SRL, dl, MVT::i16, StatusWord, ShiftAmt);
	  SDValue HighByteI8 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, HighByte);

	  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
	  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
	  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, HighByteI8);
	}

	/// Check if replacement of SQRT with RSQRT should be disabled.
	bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
	  const EVT OpVT = Op.getValueType();

	  // SQRT and RSQRT must never both be used for the same input, so an
	  // existing FRSQRT node of this operand rules SQRT out.
	  const bool HasRsqrtOfOp =
	      DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(OpVT), Op) != nullptr;
	  if (HasRsqrtOfOp)
	    return false;

	  // Otherwise the answer is the subtarget's vector or scalar FSQRT speed.
	  return OpVT.isVector() ? Subtarget.hasFastVectorFSQRT()
	                         : Subtarget.hasFastScalarFSQRT();
	}

	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision).
	///
	/// \returns an X86ISD::FRSQRT node for supported types, or an empty SDValue.
	/// On success \p RefinementSteps defaults to 1 when unspecified and
	/// \p UseOneConstNR is set to false.
	SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
	                                           SelectionDAG &DAG, int Enabled,
	                                           int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  const EVT VT = Op.getValueType();

	  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
	  // TODO: Add support for AVX512 (v16f32).
	  // f64 is likely not profitable: before FMA a double-precision rsqrt
	  // estimate with refinement needs at least 16 instructions (convert to
	  // single, rsqrtss, convert back to double, refine - 3 steps = at least 13
	  // insts). If an 'rsqrtsd' variant was added to the ISA along with FMA,
	  // this could be a throughput win.
	  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
	  // after legalize types.
	  bool Supported = false;
	  if (VT == MVT::f32)
	    Supported = Subtarget.hasSSE1();
	  else if (VT == MVT::v4f32)
	    Supported = Reciprocal ? Subtarget.hasSSE1() : Subtarget.hasSSE2();
	  else if (VT == MVT::v8f32)
	    Supported = Subtarget.hasAVX();

	  if (!Supported)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;
	  UseOneConstNR = false;

	  return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
	}

	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision).
	///
	/// \returns an X86ISD::FRCP node for supported types, or an empty SDValue.
	/// On success \p RefinementSteps defaults to 1 when unspecified.
	SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  const EVT VT = Op.getValueType();
	  const bool IsScalarF32 = VT == MVT::f32;

	  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
	  // TODO: Add support for AVX512 (v16f32).
	  // f64 is likely not profitable: before FMA a double-precision reciprocal
	  // estimate with refinement needs 15 instructions (convert to single,
	  // rcpss, convert back to double, refine - 3 steps = 12 insts). If an
	  // 'rcpsd' variant was added to the ISA along with FMA, this could be a
	  // throughput win.
	  const bool Supported =
	      ((IsScalarF32 || VT == MVT::v4f32) && Subtarget.hasSSE1()) ||
	      (VT == MVT::v8f32 && Subtarget.hasAVX());
	  if (!Supported)
	    return SDValue();

	  // Estimates with one refinement step are on by default for vector
	  // division only. Scalar division estimates stay off unless explicitly
	  // requested because they break too much real-world code. These defaults
	  // are intended to match GCC behavior.
	  if (IsScalarF32 && Enabled == ReciprocalEstimate::Unspecified)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;

	  return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
	}

	/// Number of divisions sharing one divisor from which on they are converted
	/// to multiplications by a reciprocal. This may need to be adjusted for a
	/// given CPU if a division's cost is not at least twice the cost of a
	/// multiplication: one division is still needed to compute the reciprocal,
	/// and two multiplies by it then replace the original divisions.
	unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinDivisionsSharingDivisor = 2;
	  return MinDivisionsSharingDivisor;
	}

	/// Helper for creating a X86ISD::SETCC node: an i8 result computed from
	/// the condition code \p Cond (passed as an i8 constant) and \p EFLAGS.
	static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
	                        SelectionDAG &DAG) {
	  SDValue CondOperand = DAG.getConstant(Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondOperand, EFLAGS);
	}

	/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
	/// according to equal/not-equal condition code \p CC.
	///
	/// \param Src   value whose bit is tested; i8/i16 are promoted to i32.
	/// \param BitNo index of the bit to test; it is extended or truncated to the
	///              (possibly adjusted) type of \p Src.
	/// \param CC    ISD::SETEQ selects X86::COND_AE, any other code X86::COND_B.
	/// \returns the X86ISD::SETCC node reading the flags of the BT node.
	static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) {
	  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
	  // instruction. Since the shift amount is in-range-or-undefined, we know
	  // that doing a bittest on the i32 value is ok. We extend to i32 because
	  // the encoding for the i16 version is larger than the i32 version.
	  // Also promote i16 to i32 for performance / code size reason.
	  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
	    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

	  // See if we can use the 32-bit instruction instead of the 64-bit one for a
	  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
	  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
	  // known to be zero.
	  if (Src.getValueType() == MVT::i64 &&
	      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
	    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

	  // If the operand types disagree, extend or truncate the bit index to match.
	  // BT ignores the high bits of the index (like shifts), so an any-extend is
	  // sufficient when widening. Truncation is needed when BitNo is wider than
	  // Src (for example after Src was narrowed from i64 to i32 above); an
	  // ANY_EXTEND to a narrower type would be an invalid node.
	  if (Src.getValueType() != BitNo.getValueType())
	    BitNo = DAG.getAnyExtOrTrunc(BitNo, dl, Src.getValueType());

	  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
	  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
	  return getSETCC(Cond, BT, dl, DAG);
	}

	/// Result of 'and' is compared against zero. Change to a BT node if possible.
	///
	/// \returns the bit-test condition built by getBitTestCondition, or an empty
	/// SDValue when \p And does not match one of the recognized shapes.
	static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
	                            const SDLoc &dl, SelectionDAG &DAG) {
	  assert(And.getOpcode() == ISD::AND && "Expected AND node!");

	  // Look through a truncate on either operand.
	  auto StripTruncate = [](SDValue V) -> SDValue {
	    if (V.getOpcode() == ISD::TRUNCATE)
	      return V.getOperand(0);
	    return V;
	  };
	  SDValue First = StripTruncate(And.getOperand(0));
	  SDValue Second = StripTruncate(And.getOperand(1));

	  // Put a shift-left, if any, into the first position.
	  if (Second.getOpcode() == ISD::SHL)
	    std::swap(First, Second);

	  SDValue Src, BitNo;
	  if (First.getOpcode() == ISD::SHL) {
	    // Shape: (and X, (shl 1, N)) - test bit N of X.
	    if (isOneConstant(First.getOperand(0))) {
	      // If a truncate was looked through, make sure it only dropped bits
	      // known to be zero.
	      const unsigned ShlBits = First.getValueSizeInBits();
	      const unsigned AndBits = And.getValueSizeInBits();
	      if (ShlBits > AndBits) {
	        KnownBits Known;
	        DAG.computeKnownBits(First, Known);
	        if (Known.countMinLeadingZeros() < ShlBits - AndBits)
	          return SDValue();
	      }
	      Src = Second;
	      BitNo = First.getOperand(1);
	    }
	  } else if (Second.getOpcode() == ISD::Constant) {
	    const uint64_t MaskVal = cast<ConstantSDNode>(Second)->getZExtValue();

	    // Shape: (and (srl X, N), 1) - test bit N of X.
	    if (MaskVal == 1 && First.getOpcode() == ISD::SRL) {
	      Src = First.getOperand(0);
	      BitNo = First.getOperand(1);
	    }

	    // Use BT if the immediate can't be encoded in a TEST instruction.
	    if (!isUInt<32>(MaskVal) && isPowerOf2_64(MaskVal)) {
	      Src = First;
	      BitNo = DAG.getConstant(Log2_64_Ceil(MaskVal), dl, Src.getValueType());
	    }
	  }

	  if (!Src.getNode())
	    return SDValue();
	  return getBitTestCondition(Src, BitNo, CC, dl, DAG);
	}

	/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
	/// CMPs. \p Op0 and \p Op1 are exchanged in place for the predicates that
	/// are expressed through their mirrored form.
	static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
	                                   SDValue &Op1) {
	  // SSE condition code mapping:
	  //  0 - EQ     4 - NEQ
	  //  1 - LT     5 - NLT
	  //  2 - LE     6 - NLE
	  //  3 - UNORD  7 - ORD
	  // SETUEQ and SETONE are reported as 8 and 12 respectively.
	  unsigned Predicate;
	  bool NeedsSwap = false;

	  switch (SetCCOpcode) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    Predicate = 0;
	    break;
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    NeedsSwap = true;
	    Predicate = 1;
	    break;
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    Predicate = 1;
	    break;
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    NeedsSwap = true;
	    Predicate = 2;
	    break;
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    Predicate = 2;
	    break;
	  case ISD::SETUO:
	    Predicate = 3;
	    break;
	  case ISD::SETUNE:
	  case ISD::SETNE:
	    Predicate = 4;
	    break;
	  case ISD::SETULE:
	    NeedsSwap = true;
	    Predicate = 5;
	    break;
	  case ISD::SETUGE:
	    Predicate = 5;
	    break;
	  case ISD::SETULT:
	    NeedsSwap = true;
	    Predicate = 6;
	    break;
	  case ISD::SETUGT:
	    Predicate = 6;
	    break;
	  case ISD::SETO:
	    Predicate = 7;
	    break;
	  case ISD::SETUEQ:
	    Predicate = 8;
	    break;
	  case ISD::SETONE:
	    Predicate = 12;
	    break;
	  }

	  if (NeedsSwap)
	    std::swap(Op0, Op1);

	  return Predicate;
	}

	/// Break a 256-bit integer VSETCC into two 128-bit ones and concatenate
	/// the two results back into the original type.
	static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
	  SDValue CC = Op.getOperand(2);

	  // Split the left-hand side into its low and high 128-bit halves.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
	  SDValue LHSHi = extract128BitVector(LHS, HalfNumElems, DAG, dl);

	  // Same for the right-hand side.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
	  SDValue RHSHi = extract128BitVector(RHS, HalfNumElems, DAG, dl);

	  // Compare the halves in the half-width type, then glue the results.
	  const MVT HalfVT =
	      MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
	  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSLo, RHSLo, CC);
	  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSHi, RHSHi, CC);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Lower a SETCC whose operands are vectors of i1 into plain logic
	/// operations (XOR / AND / OR) on the operands.
	static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CC = Op.getOperand(2);
	  const MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(LHS.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected type for boolean compare operation");
	  const ISD::CondCode Pred = cast<CondCodeSDNode>(CC)->get();

	  // Bitwise NOT expressed as XOR with all-ones.
	  auto Not = [&](SDValue V) -> SDValue {
	    return DAG.getNode(ISD::XOR, dl, VT, V, DAG.getConstant(-1, dl, VT));
	  };
	  SDValue NotLHS = Not(LHS);
	  SDValue NotRHS = Not(RHS);

	  switch (Pred) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ:
	    // (x == y) -> ~(x ^ y)
	    return Not(DAG.getNode(ISD::XOR, dl, VT, LHS, RHS));
	  case ISD::SETNE:
	    // (x != y) -> (x ^ y)
	    return DAG.getNode(ISD::XOR, dl, VT, LHS, RHS);
	  case ISD::SETUGT:
	  case ISD::SETGT:
	    // (x > y) -> (x & ~y)
	    return DAG.getNode(ISD::AND, dl, VT, LHS, NotRHS);
	  case ISD::SETULT:
	  case ISD::SETLT:
	    // (x < y) -> (~x & y)
	    return DAG.getNode(ISD::AND, dl, VT, NotLHS, RHS);
	  case ISD::SETULE:
	  case ISD::SETLE:
	    // (x <= y) -> (~x | y)
	    return DAG.getNode(ISD::OR, dl, VT, NotLHS, RHS);
	  case ISD::SETUGE:
	  case ISD::SETGE:
	    // (x >= y) -> (x | ~y)
	    return DAG.getNode(ISD::OR, dl, VT, LHS, NotRHS);
	  }
	}

	/// Lower an integer vector SETCC with an i1-vector result to the AVX-512
	/// mask compare nodes (PCMPEQM / PCMPGTM / CMPM / CMPMU), or to
	/// TESTM / TESTNM when an AND is compared against zero for (in)equality.
	static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CC = Op.getOperand(2);
	  const MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(VT.getVectorElementType() == MVT::i1 &&
	         "Cannot set masked compare for this operation");

	  const ISD::CondCode Pred = cast<CondCodeSDNode>(CC)->get();

	  // Either a dedicated opcode is chosen (Opc != 0), or a predicate
	  // immediate (SSECC) for the generic CMPM / CMPMU nodes.
	  unsigned Opc = 0;
	  bool Unsigned = false;
	  bool Swap = false;
	  unsigned SSECC;
	  switch (Pred) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ:
	    Opc = X86ISD::PCMPEQM;
	    break;
	  case ISD::SETGT:
	    Opc = X86ISD::PCMPGTM;
	    break;
	  case ISD::SETLT: // GT with swapped operands
	    Swap = true;
	    Opc = X86ISD::PCMPGTM;
	    break;
	  case ISD::SETNE:
	    SSECC = 4;
	    break;
	  case ISD::SETLE:
	    SSECC = 2;
	    break;
	  case ISD::SETGE: // LE with swapped operands
	    Swap = true;
	    SSECC = 2;
	    break;
	  case ISD::SETULT:
	    SSECC = 1;
	    Unsigned = true;
	    break;
	  case ISD::SETULE:
	    SSECC = 2;
	    Unsigned = true;
	    break;
	  case ISD::SETUGE: // NLT
	    SSECC = 5;
	    Unsigned = true;
	    break;
	  case ISD::SETUGT:
	    SSECC = 6;
	    Unsigned = true;
	    break;
	  }

	  if (Swap)
	    std::swap(LHS, RHS);

	  // CMP(EQ|NEQ, AND(A, B), ZERO) becomes TESTNM / TESTM of A and B.
	  const bool IsEq = Opc == X86ISD::PCMPEQM;
	  if (IsEq || (!Opc && SSECC == 4)) {
	    SDValue Anded = peekThroughBitcasts(LHS);
	    const bool IsAnd =
	        Anded.getOpcode() == ISD::AND || Anded.getOpcode() == X86ISD::FAND;
	    if (IsAnd && ISD::isBuildVectorAllZeros(RHS.getNode())) {
	      const MVT OperandVT = LHS.getSimpleValueType();
	      SDValue TestA = DAG.getBitcast(OperandVT, Anded.getOperand(0));
	      SDValue TestB = DAG.getBitcast(OperandVT, Anded.getOperand(1));
	      const unsigned TestOpc = IsEq ? X86ISD::TESTNM : X86ISD::TESTM;
	      return DAG.getNode(TestOpc, dl, VT, TestA, TestB);
	    }
	  }

	  if (Opc)
	    return DAG.getNode(Opc, dl, VT, LHS, RHS);

	  const unsigned CmpOpc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
	  return DAG.getNode(CmpOpc, dl, VT, LHS, RHS,
	                     DAG.getConstant(SSECC, dl, MVT::i8));
	}

	/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
	/// operand \p Op1: every element of the constant build vector is decremented
	/// by one. Returns an empty value when \p Op1 is not a build vector, when an
	/// element is not a non-opaque constant of the vector's element type, or
	/// when an element is zero.
	static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
	                                      SelectionDAG &DAG) {
	  auto *BuildVec = dyn_cast<BuildVectorSDNode>(Op1.getNode());
	  if (!BuildVec)
	    return SDValue();

	  const MVT VecVT = Op1.getSimpleValueType();
	  const MVT EltVT = VecVT.getVectorElementType();

	  SmallVector<SDValue, 8> Decremented;
	  for (unsigned Idx = 0, NumElts = VecVT.getVectorNumElements();
	       Idx < NumElts; ++Idx) {
	    auto *EltC = dyn_cast<ConstantSDNode>(BuildVec->getOperand(Idx));
	    if (!EltC)
	      return SDValue();
	    if (EltC->isOpaque() || EltC->getSimpleValueType(0) != EltVT)
	      return SDValue();

	    // A zero element cannot be decremented without underflow.
	    APInt EltVal = EltC->getAPIntValue();
	    if (EltVal == 0)
	      return SDValue();

	    Decremented.push_back(DAG.getConstant(EltVal - 1, dl, EltVT));
	  }

	  return DAG.getBuildVector(VecVT, dl, Decremented);
	}

	/// Lower a vector SETCC node (operands: LHS, RHS, condition code).
	///
	/// Floating-point compares become X86ISD::CMPM (AVX-512 with an i1-element
	/// result) or X86ISD::CMPP. Integer compares are dispatched, in order, to the
	/// 256-bit splitter, the AVX-512 mask helpers, XOP VPCOM/VPCOMU, or are
	/// synthesized from PCMPEQ/PCMPGT, UMIN/UMAX and SUBUS, using the local flags
	/// Swap (exchange operands), FlipSigns (XOR the sign bits) and Invert
	/// (logical-not of the result).
	static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	MVT VT = Op.getSimpleValueType();
	ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
	bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
	SDLoc dl(Op);

	if (isFP) {
	#ifndef NDEBUG
	MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
	assert(EltVT == MVT::f32 \|\| EltVT == MVT::f64);
	#endif

	unsigned Opc;
	if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
	assert(VT.getVectorNumElements() <= 16);
	Opc = X86ISD::CMPM;
	} else {
	Opc = X86ISD::CMPP;
	// The SSE/AVX packed FP comparison nodes are defined with a
	// floating-point vector result that matches the operand type. This allows
	// them to work with an SSE1 target (integer vector types are not legal).
	VT = Op0.getSimpleValueType();
	}

	// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
	// emit two comparisons and a logic op to tie them together.
	SDValue Cmp;
	unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
	if (SSECC >= 8 && !Subtarget.hasAVX()) {
	// LLVM predicate is SETUEQ or SETONE.
	unsigned CC0, CC1;
	unsigned CombineOpc;
	if (Cond == ISD::SETUEQ) {
	CC0 = 3; // UNORD
	CC1 = 0; // EQ
	CombineOpc = X86ISD::FOR;
	} else {
	assert(Cond == ISD::SETONE);
	CC0 = 7; // ORD
	CC1 = 4; // NEQ
	CombineOpc = X86ISD::FAND;
	}

	SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC0, dl, MVT::i8));
	SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC1, dl, MVT::i8));
	Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
	} else {
	// Handle all other FP comparisons here.
	Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(SSECC, dl, MVT::i8));
	}

	// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
	// result type of SETCC. The bitcast is expected to be optimized away
	// during combining/isel.
	if (Opc == X86ISD::CMPP)
	Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

	return Cmp;
	}

	// Everything below handles integer (and i1 mask) comparisons.
	MVT VTOp0 = Op0.getSimpleValueType();
	assert(VTOp0 == Op1.getSimpleValueType() &&
	"Expected operands with same type!");
	assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
	"Invalid number of packed elements for source and destination!");

	if (VT.is128BitVector() && VTOp0.is256BitVector()) {
	// On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
	// legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
	// legalizer firstly checks if the first operand in input to the setcc has
	// a legal type. If so, then it promotes the return type to that same type.
	// Otherwise, the return type is promoted to the 'next legal type' which,
	// for a vector of MVT::i1 is always a 128-bit integer vector type.
	//
	// We reach this code only if the following two conditions are met:
	// 1. Both return type and operand type have been promoted to wider types
	// by the type legalizer.
	// 2. The original operand type has been promoted to a 256-bit vector.
	//
	// Note that condition 2. only applies for AVX targets.
	SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
	return DAG.getZExtOrTrunc(NewOp, dl, VT);
	}

	// The non-AVX512 code below works under the assumption that source and
	// destination types are the same.
	assert((Subtarget.hasAVX512() \|\| (VT == VTOp0)) &&
	"Value types for source and destination must be the same!");

	// Break 256-bit integer vector compare into smaller ones.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntVSETCC(Op, DAG);

	// Operands are boolean (vectors of i1)
	MVT OpVT = Op1.getSimpleValueType();
	if (OpVT.getVectorElementType() == MVT::i1)
	return LowerBoolVSETCC_AVX512(Op, DAG);

	// The result is boolean, but operands are int/float
	if (VT.getVectorElementType() == MVT::i1) {
	// In AVX-512 architecture setcc returns mask with i1 elements,
	// But there is no compare instruction for i8 and i16 elements in KNL.
	// In this case use SSE compare
	bool UseAVX512Inst =
	(OpVT.is512BitVector() \|\|
	OpVT.getScalarSizeInBits() >= 32 \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX()));

	if (UseAVX512Inst)
	return LowerIntVSETCC_AVX512(Op, DAG);

	// Otherwise compare in the operand type and truncate the result to the
	// i1-element mask type.
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
	}

	// Lower using XOP integer comparisons.
	if ((VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\|
	VT == MVT::v4i32 \|\| VT == MVT::v2i64) && Subtarget.hasXOP()) {
	// Translate compare code to XOP PCOM compare mode.
	// Signed and unsigned predicates share a mode; signedness is selected by
	// the opcode (VPCOM vs. VPCOMU) chosen below.
	unsigned CmpMode = 0;
	switch (Cond) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETULT:
	case ISD::SETLT: CmpMode = 0x00; break;
	case ISD::SETULE:
	case ISD::SETLE: CmpMode = 0x01; break;
	case ISD::SETUGT:
	case ISD::SETGT: CmpMode = 0x02; break;
	case ISD::SETUGE:
	case ISD::SETGE: CmpMode = 0x03; break;
	case ISD::SETEQ: CmpMode = 0x04; break;
	case ISD::SETNE: CmpMode = 0x05; break;
	}

	// Are we comparing unsigned or signed integers?
	unsigned Opc =
	ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CmpMode, dl, MVT::i8));
	}

	// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
	// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
	if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
	SDValue BC0 = peekThroughBitcasts(Op0);
	if (BC0.getOpcode() == ISD::AND) {
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(BC0.getOperand(1),
	VT.getScalarSizeInBits(), UndefElts,
	EltBits, false, false)) {
	if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
	Cond = ISD::SETEQ;
	Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
	}
	}
	}
	}

	// We are handling one of the integer comparisons here. Since SSE only has
	// GT and EQ comparisons for integer, swapping operands and multiple
	// operations may be required for some comparisons.
	unsigned Opc = (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) ? X86ISD::PCMPEQ
	: X86ISD::PCMPGT;
	// LT/GE are computed as GT/LE with the operands exchanged.
	bool Swap = Cond == ISD::SETLT \|\| Cond == ISD::SETULT \|\|
	Cond == ISD::SETGE \|\| Cond == ISD::SETUGE;
	// NE and the "or equal" predicates are the logical-not of EQ / GT.
	bool Invert = Cond == ISD::SETNE \|\|
	(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

	// If both operands are known non-negative, then an unsigned compare is the
	// same as a signed compare and there's no need to flip signbits.
	// TODO: We could check for more general simplifications here since we're
	// computing known bits.
	bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
	!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

	// Special case: Use min/max operations for SETULE/SETUGE
	MVT VET = VT.getVectorElementType();
	bool HasMinMax =
	(Subtarget.hasAVX512() && VET == MVT::i64) \|\|
	(Subtarget.hasSSE41() && (VET == MVT::i16 \|\| VET == MVT::i32)) \|\|
	(Subtarget.hasSSE2() && (VET == MVT::i8));
	bool MinMax = false;
	if (HasMinMax) {
	switch (Cond) {
	default: break;
	case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
	case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
	}

	// The min/max form compares the result against Op0 directly (see the end
	// of this function), so none of the generic fix-ups apply.
	if (MinMax)
	Swap = Invert = FlipSigns = false;
	}

	bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 \|\| VET == MVT::i16);
	bool Subus = false;
	if (!MinMax && HasSubus) {
	// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
	// Op0 u<= Op1:
	// t = psubus Op0, Op1
	// pcmpeq t, <0..0>
	switch (Cond) {
	default: break;
	case ISD::SETULT: {
	// If the comparison is against a constant we can turn this into a
	// setule. With psubus, setule does not require a swap. This is
	// beneficial because the constant in the register is no longer
	// destructed as the destination so it can be hoisted out of a loop.
	// Only do this pre-AVX since vpcmp* is no longer destructive.
	if (Subtarget.hasAVX())
	break;
	if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
	Op1 = ULEOp1;
	Subus = true; Invert = false; Swap = false;
	}
	break;
	}
	// Psubus is better than flip-sign because it requires no inversion.
	case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
	case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
	}

	if (Subus) {
	Opc = X86ISD::SUBUS;
	FlipSigns = false;
	}
	}

	if (Swap)
	std::swap(Op0, Op1);

	// Check that the operation in question is available (most are plain SSE2,
	// but PCMPGTQ and PCMPEQQ have different requirements).
	if (VT == MVT::v2i64) {
	if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
	assert(Subtarget.hasSSE2() && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations. The lower
	// compare is always unsigned.
	SDValue SB;
	if (FlipSigns) {
	SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
	} else {
	// Signed 64-bit compare: only elements 0 and 2 (the ones selected by
	// MaskLo below) get their sign bit flipped.
	SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
	SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
	SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
	}
	Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
	Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

	// Emulate PCMPGTQ with (hi1 > hi2) \| ((hi1 == hi2) & (lo1 > lo2))
	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

	// Create masks for only the low parts/high parts of the 64 bit integers.
	static const int MaskHi[] = { 1, 1, 3, 3 };
	static const int MaskLo[] = { 0, 0, 2, 2 };
	SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
	SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
	SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
	Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}

	if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
	// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
	// pcmpeqd + pshufd + pand.
	assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Do the compare.
	SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

	// Make sure the lower and upper halves are both all-ones.
	// The mask exchanges the two 32-bit elements within each 64-bit pair.
	static const int Mask[] = { 1, 0, 3, 2 };
	SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
	Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations.
	if (FlipSigns) {
	MVT EltVT = VT.getVectorElementType();
	SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
	VT);
	Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
	Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
	}

	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	// Op0 u<= Op1 iff umin(Op0, Op1) == Op0, and Op0 u>= Op1 iff
	// umax(Op0, Op1) == Op0.
	if (MinMax)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

	// The SUBUS result is compared for equality against the zero vector.
	if (Subus)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
	getZeroVector(VT, Subtarget, DAG, dl));

	return Result;
	}

	/// Lower a SETCC node (operands: LHS, RHS, condition code).
	///
	/// Vector results are forwarded to LowerVSETCC. Scalar results must be i8 and
	/// are lowered, in order of preference, to a BT-based setcc, a reused or
	/// inverted existing X86ISD::SETCC, or a compare followed by getSETCC.
	/// Returns an empty SDValue when the condition code cannot be translated.
	SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

	MVT VT = Op.getSimpleValueType();

	if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

	assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDLoc dl(Op);
	ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	// Optimize to BT if possible.
	// Lower (X & (1 << N)) == 0 to BT(X, N).
	// Lower ((X >>u N) & 1) != 0 to BT(X, N).
	// Lower ((X >>s N) & 1) != 0 to BT(X, N).
	if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
	(CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {
	if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
	return NewSetCC;
	}

	// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
	// these.
	if ((isOneConstant(Op1) \|\| isNullConstant(Op1)) &&
	(CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {

	// If the input is a setcc, then reuse the input setcc or use a new one with
	// the inverted condition.
	if (Op0.getOpcode() == X86ISD::SETCC) {
	X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
	// "X != 0" and "X == 1" keep the existing setcc as is; "X == 0" and
	// "X != 1" need its condition inverted.
	bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
	if (!Invert)
	return Op0;

	CCode = X86::GetOppositeBranchCondition(CCode);
	return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
	}
	}

	// General case: translate the condition, emit the compare and read the
	// result with getSETCC.
	bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
	X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
	if (X86CC == X86::COND_INVALID)
	return SDValue();

	SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
	EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
	return getSETCC(X86CC, EFLAGS, dl, DAG);
	}

	/// Lower a SETCCCARRY node (operands: LHS, RHS, incoming carry, condition
	/// code). The incoming carry value is added to all-ones with X86ISD::ADD, and
	/// result 1 of that node feeds an X86ISD::SBB of LHS and RHS. Result 1 of the
	/// SBB is then handed to getSETCC with the translated integer condition.
	SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CarryIn = Op.getOperand(2);

	  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
	  ISD::CondCode ISDCond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
	  X86::CondCode CC = TranslateIntegerX86CC(ISDCond);

	  // Recreate the carry if needed: add -1 to the incoming carry value and take
	  // the second result of the addition.
	  EVT CarryVT = CarryIn.getValueType();
	  SDValue AllOnes = DAG.getConstant(
	      APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()), DL, CarryVT);
	  SDVTList AddVTs = DAG.getVTList(CarryVT, MVT::i32);
	  SDValue CarryFlag =
	      DAG.getNode(X86ISD::ADD, DL, AddVTs, CarryIn, AllOnes).getValue(1);

	  // Subtract with borrow and read the condition off the SBB's second result.
	  SDVTList SbbVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
	  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, LHS, RHS, CarryFlag);
	  return getSETCC(CC, Sbb.getValue(1), DL, DAG);
	}

	/// Return true if \p Op refers to an X86 logical comparison: either one of the
	/// dedicated compare nodes, or a specific result of an arithmetic/logic node
	/// (result 1 for most of them, result 2 for UMUL).
	static bool isX86LogicalCmp(SDValue Op) {
	  switch (Op.getOpcode()) {
	  // Dedicated compare nodes qualify whichever result is referenced.
	  case X86ISD::CMP:
	  case X86ISD::COMI:
	  case X86ISD::UCOMI:
	  case X86ISD::SAHF:
	    return true;

	  // These nodes qualify only through result number 1.
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::INC:
	  case X86ISD::DEC:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    return Op.getResNo() == 1;

	  // UMUL qualifies only through result number 2.
	  case X86ISD::UMUL:
	    return Op.getResNo() == 2;

	  default:
	    return false;
	  }
	}

	/// Return true when \p V is an ISD::TRUNCATE whose source operand is known to
	/// be zero in every high bit that the truncation discards.
	static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
	  if (V.getOpcode() == ISD::TRUNCATE) {
	    SDValue Src = V.getOperand(0);
	    const unsigned WideBits = Src.getValueSizeInBits();
	    const unsigned NarrowBits = V.getValueSizeInBits();
	    // Mask covering the top (WideBits - NarrowBits) bits of the source.
	    const APInt DroppedBits =
	        APInt::getHighBitsSet(WideBits, WideBits - NarrowBits);
	    return DAG.MaskedValueIsZero(Src, DroppedBits);
	  }
	  return false;
	}

	/// Lower a SELECT node (operands: condition, true value, false value).
	///
	/// The strategies are tried in this order:
	///  1. scalar f32/f64 selects on a SETCC condition: FSETCCM + SELECTS
	///     (AVX-512), or FSETCC followed by a vector select (AVX) or an
	///     FAND/FANDN/FOR sequence;
	///  2. remaining scalar FP selects on AVX-512: SELECTS on a v1i1 condition;
	///  3. i1 mask-vector selects: split v64i1 on 32-bit targets, select on the
	///     scalar integer form, or widen v2i1/v4i1 to v8i1;
	///  4. integer special cases that avoid a CMOV (SETCC_CARRY based forms and
	///     the and/neg mask form);
	///  5. otherwise an X86ISD::CMOV, reusing an existing flag-producing node as
	///     the condition when possible (AddTest == false) or emitting a test.
	SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	// AddTest stays true while no flag-producing node has been found to serve
	// as the CMOV condition; in that case a test against zero is emitted below.
	bool AddTest = true;
	SDValue Cond = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue Op2 = Op.getOperand(2);
	SDLoc DL(Op);
	MVT VT = Op1.getSimpleValueType();
	SDValue CC;

	// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
	// are available or VBLENDV if AVX is available.
	// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
	if (Cond.getOpcode() == ISD::SETCC &&
	((Subtarget.hasSSE2() && (VT == MVT::f32 \|\| VT == MVT::f64)) \|\|
	(Subtarget.hasSSE1() && VT == MVT::f32)) &&
	VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
	SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
	unsigned SSECC = translateX86FSETCC(
	cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

	if (Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
	CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
	assert(!VT.isVector() && "Not a scalar type?");
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	if (SSECC < 8 \|\| Subtarget.hasAVX()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
	DAG.getConstant(SSECC, DL, MVT::i8));

	// If we have AVX, we can use a variable vector select (VBLENDV) instead
	// of 3 logic instructions for size savings and potentially speed.
	// Unfortunately, there is no scalar form of VBLENDV.

	// If either operand is a constant, don't try this. We can expect to
	// optimize away at least one of the logic instructions later in that
	// case, so that sequence would be faster than a variable blend.

	// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
	// uses XMM0 as the selection register. That may need just as many
	// instructions as the AND/ANDN/OR sequence due to register moves, so
	// don't bother.

	if (Subtarget.hasAVX() &&
	!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

	// Convert to vectors, do a VSELECT, and convert back to scalar.
	// All of the conversions should be optimized away.

	MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
	SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
	SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
	SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

	MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
	VCmp = DAG.getBitcast(VCmpVT, VCmp);

	SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	VSel, DAG.getIntPtrConstant(0, DL));
	}
	// Combine FANDN(Cmp, Op2) and FAND(Cmp, Op1) with FOR to form the result.
	SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
	SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
	return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
	}
	}

	// AVX512 fallback is to lower selects of scalar floats to masked moves.
	if ((VT == MVT::f64 \|\| VT == MVT::f32) && Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	+ // For v64i1 without 64-bit support we need to split and rejoin.
	+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	+ assert(Subtarget.hasBWI() && "Expected BWI to be legal");
	+ SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
	+ SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
	+ SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
	+ SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
	+ SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
	+ SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
	+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	+ }
	+
	// Selects between i1 mask vectors: when both operands are available as
	// scalar integers (a constant build vector or the source of a bitcast),
	// select on those scalars and convert the result back to the mask type.
	if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
	SDValue Op1Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
	Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
	else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
	Op1Scalar = Op1.getOperand(0);
	SDValue Op2Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
	Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
	else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
	Op2Scalar = Op2.getOperand(0);
	if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
	SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
	Op1Scalar, Op2Scalar);
	if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, newSelect);
	// Sizes differ: reinterpret the scalar as v8i1 and extract the leading
	// subvector of the requested type.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
	DAG.getIntPtrConstant(0, DL));
	}
	}

	// v2i1/v4i1 selects are performed in v8i1: widen both operands into an
	// undef v8i1, select, then extract the original width again.
	if (VT == MVT::v4i1 \|\| VT == MVT::v2i1) {
	SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
	Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
	Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
	SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
	}

	if (Cond.getOpcode() == ISD::SETCC) {
	if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
	Cond = NewCond;
	// If the condition was updated, it's possible that the operands of the
	// select were also updated (for example, EmitTest has a RAUW). Refresh
	// the local references to the select operands in case they got stale.
	Op1 = Op.getOperand(1);
	Op2 = Op.getOperand(2);
	}
	}

	// (select (x == 0), -1, y) -> (sign_bit (x - 1)) \| y
	// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) \| y
	// (select (x != 0), y, -1) -> (sign_bit (x - 1)) \| y
	// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) \| y
	// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
	// (select (and (x , 0x1) == 0), y, (z \| y) ) -> (-(and (x , 0x1)) & z ) \| y
	if (Cond.getOpcode() == X86ISD::SETCC &&
	Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(Cond.getOperand(1).getOperand(1))) {
	SDValue Cmp = Cond.getOperand(1);
	unsigned CondCode =
	cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

	if ((isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(CondCode == X86::COND_E \|\| CondCode == X86::COND_NE)) {
	// Y is the select operand that is not the all-ones constant.
	SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
	SDValue CmpOp0 = Cmp.getOperand(0);

	// Apply further optimizations for special cases
	// (select (x != 0), -1, 0) -> neg & sbb
	// (select (x == 0), 0, -1) -> neg & sbb
	if (isNullConstant(Y) &&
	(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
	SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
	SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	return Res;
	}

	// Compare x against 1 and materialize the COND_B outcome with SETCC_CARRY.
	Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
	Cmp = ConvertCmpIfNecessary(Cmp, DAG);

	SDValue Res = // Res = 0 or -1.
	DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
	Res = DAG.getNOT(DL, Res, Res.getValueType());

	if (!isNullConstant(Op2))
	Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
	return Res;
	} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
	Cmp.getOperand(0).getOpcode() == ISD::AND &&
	isOneConstant(Cmp.getOperand(0).getOperand(1))) {
	SDValue CmpOp0 = Cmp.getOperand(0);
	SDValue Src1, Src2;
	// true if Op2 is XOR or OR operator and one of its operands
	// is equal to Op1
	// ( a , a op b) \|\| ( b , a op b)
	// On a match, Src1 receives the other operand of Op2 and Src2 receives Op1.
	auto isOrXorPattern = [&]() {
	if ((Op2.getOpcode() == ISD::XOR \|\| Op2.getOpcode() == ISD::OR) &&
	(Op2.getOperand(0) == Op1 \|\| Op2.getOperand(1) == Op1)) {
	Src1 =
	Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
	Src2 = Op1;
	return true;
	}
	return false;
	};

	if (isOrXorPattern()) {
	SDValue Neg;
	unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
	// we need mask of all zeros or ones with same size of the other
	// operands.
	if (CmpSz > VT.getSizeInBits())
	Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
	else if (CmpSz < VT.getSizeInBits())
	Neg = DAG.getNode(ISD::AND, DL, VT,
	DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
	DAG.getConstant(1, DL, VT));
	else
	Neg = CmpOp0;
	SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	Neg); // -(and (x, 0x1))
	SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
	return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
	}
	}
	}

	// Look past (and (setcc_carry (cmp ...)), 1).
	if (Cond.getOpcode() == ISD::AND &&
	Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	isOneConstant(Cond.getOperand(1)))
	Cond = Cond.getOperand(0);

	// If condition flag is set by a X86ISD::CMP, then use it as the condition
	// setting operand in place of the X86ISD::SETCC.
	unsigned CondOpcode = Cond.getOpcode();
	if (CondOpcode == X86ISD::SETCC \|\|
	CondOpcode == X86ISD::SETCC_CARRY) {
	CC = Cond.getOperand(0);

	SDValue Cmp = Cond.getOperand(1);
	unsigned Opc = Cmp.getOpcode();
	// Note: this VT shadows the function-level VT (the type of Op1) with the
	// result type of the select itself.
	MVT VT = Op.getSimpleValueType();

	bool IllegalFPCMov = false;
	if (VT.isFloatingPoint() && !VT.isVector() &&
	!isScalarFPTypeInSSEReg(VT)) // FPStack?
	IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

	if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) \|\|
	Opc == X86ISD::BT) { // FIXME
	Cond = Cmp;
	AddTest = false;
	}
	} else if (CondOpcode == ISD::USUBO \|\| CondOpcode == ISD::SSUBO \|\|
	CondOpcode == ISD::UADDO \|\| CondOpcode == ISD::SADDO \|\|
	((CondOpcode == ISD::UMULO \|\| CondOpcode == ISD::SMULO) &&
	Cond.getOperand(0).getValueType() != MVT::i8)) {
	// The condition is the overflow result of an arithmetic-with-overflow node:
	// re-emit the operation as its X86ISD counterpart and use that node's extra
	// result with the matching condition code.
	SDValue LHS = Cond.getOperand(0);
	SDValue RHS = Cond.getOperand(1);
	unsigned X86Opcode;
	unsigned X86Cond;
	SDVTList VTs;
	switch (CondOpcode) {
	case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	default: llvm_unreachable("unexpected overflowing operator");
	}
	// X86ISD::UMUL is created with two value results plus the i32 result, so
	// the i32 result is number 2 instead of 1.
	if (CondOpcode == ISD::UMULO)
	VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	MVT::i32);
	else
	VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

	if (CondOpcode == ISD::UMULO)
	Cond = X86Op.getValue(2);
	else
	Cond = X86Op.getValue(1);

	CC = DAG.getConstant(X86Cond, DL, MVT::i8);
	AddTest = false;
	}

	if (AddTest) {
	// Look past the truncate if the high bits are known zero.
	if (isTruncWithZeroHighBitsInput(Cond, DAG))
	Cond = Cond.getOperand(0);

	// We know the result of AND is compared against zero. Try to match
	// it to BT.
	if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
	CC = NewSetCC.getOperand(0);
	Cond = NewSetCC.getOperand(1);
	AddTest = false;
	}
	}
	}

	// No reusable flag producer was found: test the condition value itself and
	// select on "not equal".
	if (AddTest) {
	CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
	Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
	}

	// a < b ? -1 : 0 -> RES = ~setcc_carry
	// a < b ? 0 : -1 -> RES = setcc_carry
	// a >= b ? -1 : 0 -> RES = setcc_carry
	// a >= b ? 0 : -1 -> RES = ~setcc_carry
	if (Cond.getOpcode() == X86ISD::SUB) {
	Cond = ConvertCmpIfNecessary(Cond, DAG);
	unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

	if ((CondCode == X86::COND_AE \|\| CondCode == X86::COND_B) &&
	(isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(isNullConstant(Op1) \|\| isNullConstant(Op2))) {
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Cond);
	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
	return DAG.getNOT(DL, Res, Res.getValueType());
	return Res;
	}
	}

	// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
	// widen the cmov and push the truncate through. This avoids introducing a new
	// branch during isel and doesn't add any extensions.
	if (Op.getValueType() == MVT::i8 &&
	Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
	SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
	if (T1.getValueType() == T2.getValueType() &&
	// Blacklist CopyFromReg to avoid partial register stalls.
	T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
	SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
	CC, Cond);
	return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	}
	}

	// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
	// condition is true.
	// Hence the false value (Op2) is passed first and the true value (Op1)
	// second.
	SDValue Ops[] = { Op2, Op1, CC, Cond };
	return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
	}

	/// Lower an extension whose input is a vector of i1 (a mask) to the integer
	/// vector type of \p Op.
	///
	/// The work is done in a possibly wider type: i8/i16 elements are first
	/// promoted to i32 when BWI is unavailable, and the vector is widened to 512
	/// bits when VLX is unavailable. The extension itself is either an
	/// X86ISD::VSEXT or a select between all-ones and zero; the result is then
	/// truncated and/or extracted back to the requested type.
	static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op->getSimpleValueType(0);
	SDValue In = Op->getOperand(0);
	MVT InVT = In.getSimpleValueType();
	assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	MVT VTElt = VT.getVectorElementType();
	SDLoc dl(Op);

	unsigned NumElts = VT.getVectorNumElements();

	// Extend VT if the scalar type is i8/i16 and BWI is not supported.
	MVT ExtVT = VT;
	if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
	ExtVT = MVT::getVectorVT(MVT::i32, NumElts);

	// Widen to 512-bits if VLX is not supported.
	MVT WideVT = ExtVT;
	if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
	// Scale the element count so that ExtVT's element type fills 512 bits, and
	// place the original mask at index 0 of an undef mask of that length.
	NumElts *= 512 / ExtVT.getSizeInBits();
	InVT = MVT::getVectorVT(MVT::i1, NumElts);
	In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
	In, DAG.getIntPtrConstant(0, dl));
	WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
	}

	SDValue V;
	MVT WideEltVT = WideVT.getVectorElementType();
	if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) \|\|
	(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
	V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
	} else {
	// Otherwise select all-ones or zero per element under the mask.
	SDValue NegOne = getOnesVector(WideVT, DAG, dl);
	SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
	V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
	}

	// Truncate if we had to extend i16/i8 above.
	if (VT != ExtVT) {
	// NumElts still holds the (possibly widened) element count here.
	WideVT = MVT::getVectorVT(VTElt, NumElts);
	V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
	}

	// Extract back to 128/256-bit if we widened.
	if (WideVT != VT)
	V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
	DAG.getIntPtrConstant(0, dl));

	return V;
	}

	/// Lower an ANY_EXTEND node. Inputs with i1 elements go through the mask
	/// sign-extension lowering; otherwise, when the subtarget reports hasFp256(),
	/// the AVX extend lowering is attempted. An empty SDValue is returned when
	/// neither path produces a result.
	static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();

	  // Mask (vXi1) sources share the sign-extension path.
	  if (SrcVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  if (Subtarget.hasFp256()) {
	    SDValue Widened = LowerAVXExtend(Op, DAG, Subtarget);
	    if (Widened)
	      return Widened;
	  }

	  return SDValue();
	}

	// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
	// For sign extend this needs to handle all vector sizes and SSE4.1 and
	// non-SSE4.1 targets. For zero extend this should only handle inputs of
	// MVT::v64i8 when BWI is not supported, but AVX512 is.
	// Returns an empty SDValue for element types or vector widths that are not
	// handled here.
	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue In = Op->getOperand(0);
	MVT VT = Op->getSimpleValueType(0);
	MVT InVT = In.getSimpleValueType();
	assert(VT.getSizeInBits() == InVT.getSizeInBits());

	MVT SVT = VT.getVectorElementType();
	MVT InSVT = InVT.getVectorElementType();
	assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

	// Only i8/i16/i32 sources extended to i16/i32/i64 are handled, and only for
	// vector widths the subtarget supports.
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	return SDValue();
	if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	return SDValue();
	if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
	!(VT.is256BitVector() && Subtarget.hasInt256()) &&
	!(VT.is512BitVector() && Subtarget.hasAVX512()))
	return SDValue();

	SDLoc dl(Op);

	// For 256-bit vectors, we only need the lower (128-bit) half of the input.
	// For 512-bit vectors, we need 128-bits or 256-bits.
	if (VT.getSizeInBits() > 128) {
	// Input needs to be at least the same number of elements as output, and
	// at least 128-bits.
	int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
	In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
	}

	assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG \|\|
	InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

	// SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
	// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
	// need to be handled here for 256/512-bit results.
	if (Subtarget.hasInt256()) {
	assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
	unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
	X86ISD::VSEXT : X86ISD::VZEXT;
	return DAG.getNode(ExtOpc, dl, VT, In);
	}

	// We should only get here for sign extend.
	assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
	"Unexpected opcode!");

	// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
	SDValue Curr = In;
	MVT CurrVT = InVT;

	// As SRAI is only available on i16/i32 types, we expand only up to i32
	// and handle i64 separately.
	// Each iteration doubles the element width and halves the element count.
	while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
	Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
	MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
	CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
	Curr = DAG.getBitcast(CurrVT, Curr);
	}

	SDValue SignExt = Curr;
	if (CurrVT != InVT) {
	// Arithmetic shift right by the number of bits gained during unpacking.
	unsigned SignExtShift =
	CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
	SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
	DAG.getConstant(SignExtShift, dl, MVT::i8));
	}

	if (CurrVT == VT)
	return SignExt;

	if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
	// Sign is Curr shifted arithmetically by 31. The shuffle interleaves
	// elements 0 and 1 of SignExt with elements 0 and 1 of Sign, and the
	// result is reinterpreted as v2i64.
	SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
	DAG.getConstant(31, dl, MVT::i8));
	SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
	return DAG.getBitcast(VT, Ext);
	}

	return SDValue();
	}

	/// Custom lowering for vector ISD::SIGN_EXTEND.
	///
	/// i1-element sources are forwarded to LowerSIGN_EXTEND_Mask. Of the remaining
	/// cases only a fixed list of (result, source) type pairs is handled; anything
	/// else returns an empty SDValue. With 256-bit integer support a single
	/// X86ISD::VSEXT is emitted; otherwise the source is split into two halves
	/// that are extended separately and concatenated.
	static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  if (InVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  // Bail out unless (VT, InVT) is one of the supported combinations.
	  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
	      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
	      (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
	      (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
	      (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
	      (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
	      (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
	      (VT != MVT::v32i16 || InVT != MVT::v32i8))
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

	  // Optimize vectors in AVX mode
	  // Sign extend v8i16 to v8i32 and
	  // v4i32 to v4i64
	  //
	  // Divide input vector into two parts
	  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
	  // sign-extend each part in-register (v4i32 -> v2i64; v8i16 -> v4i32)
	  // concat the vectors to original VT

	  unsigned NumElems = InVT.getVectorNumElements();
	  SDValue Undef = DAG.getUNDEF(InVT);

	  // Low half of the input moved to (kept in) the low lanes.
	  SmallVector<int,8> ShufMask1(NumElems, -1);
	  for (unsigned i = 0; i != NumElems/2; ++i)
	    ShufMask1[i] = i;

	  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);

	  // High half of the input moved down into the low lanes.
	  SmallVector<int,8> ShufMask2(NumElems, -1);
	  for (unsigned i = 0; i != NumElems/2; ++i)
	    ShufMask2[i] = i + NumElems/2;

	  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);

	  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
	  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	// Lower truncating store. We need a special lowering to vXi1 vectors.
	//
	// Three strategies, chosen from the subtarget features and element count:
	//  * truncate to the i1 memory type and store it directly;
	//  * for at most 8 elements: widen to 8 elements, truncate to v8i1, bitcast
	//    to i8 and store the byte;
	//  * otherwise (asserted to be v32i8): split into two v16i8 halves and emit
	//    two v16i1 stores two bytes apart, joined by a TokenFactor.
	static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
	  SDLoc dl(St);
	  EVT MemVT = St->getMemoryVT();
	  assert(St->isTruncatingStore() && "We only custom truncating store.");
	  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
	         "Expected truncstore of i1 vector");

	  SDValue Op = St->getValue();
	  MVT OpVT = Op.getValueType().getSimpleVT();
	  unsigned NumElts = OpVT.getVectorNumElements();
	  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
	      NumElts == 16) {
	    // Truncate and store - everything is legal
	    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
	    // Sub-byte mask types are padded out to v8i1 before being stored.
	    if (MemVT.getSizeInBits() < 8)
	      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	                       DAG.getUNDEF(MVT::v8i1), Op,
	                       DAG.getIntPtrConstant(0, dl));
	    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
	                        St->getMemOperand());
	  }

	  // A subset, assume that we have only AVX-512F
	  if (NumElts <= 8) {
	    if (NumElts < 8) {
	      // Extend to 8-elts vector
	      MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
	      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
	                       DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
	    }
	    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
	    Op = DAG.getBitcast(MVT::i8, Op);
	    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
	                        St->getMemOperand());
	  }
	  // v32i8
	  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
	  // Divide the vector into 2 parts and store each part separately
	  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
	                           DAG.getIntPtrConstant(0, dl));
	  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
	  SDValue BasePtr = St->getBasePtr();
	  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
	                              St->getMemOperand());
	  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
	                           DAG.getIntPtrConstant(16, dl));
	  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

	  // The high 16 mask bits live two bytes past the low 16.
	  SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

	  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
	                              BasePtrHi, St->getPointerInfo().getWithOffset(2),
	                              MinAlign(St->getAlignment(), 2U),
	                              St->getMemOperand()->getFlags());
	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
	}

	/// Lower an extending load whose memory type is a vector of i1.
	///
	/// The extension opcode is ISD::ZERO_EXTEND for ZEXTLOAD and ISD::SIGN_EXTEND
	/// for every other extension type. In each path the chain result of the
	/// original load is redirected to the chain of the replacement load(s), and
	/// the extended vector is returned.
	static SDValue LowerExtended1BitVectorLoad(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {

	  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	  SDLoc dl(Ld);
	  EVT MemVT = Ld->getMemoryVT();
	  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
	         "Expected i1 vector load");
	  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
	                           ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	  MVT VT = Op.getValueType().getSimpleVT();
	  unsigned NumElts = VT.getVectorNumElements();

	  if ((Subtarget.hasBWI() && NumElts >= 32) ||
	      (Subtarget.hasDQI() && NumElts < 16) ||
	      NumElts == 16) {
	    // Load and extend - everything is legal
	    if (NumElts < 8) {
	      // Fewer than 8 mask bits: load a whole v8i1 and narrow afterwards.
	      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
	                                 Ld->getBasePtr(),
	                                 Ld->getMemOperand());
	      // Replace chain users with the new chain.
	      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
	      if (Subtarget.hasVLX()) {
	        // Extract to v4i1/v2i1.
	        SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
	                                      DAG.getIntPtrConstant(0, dl));
	        // Finally, extend (sign or zero, per ExtOpcode) to the desired register.
	        return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
	      }

	      // Without VLX: extend all 8 elements, then keep the low NumElts.
	      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	                         DAG.getIntPtrConstant(0, dl));
	    }
	    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
	                               Ld->getBasePtr(),
	                               Ld->getMemOperand());
	    // Replace chain users with the new chain.
	    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	    // Finally, extend (sign or zero, per ExtOpcode) to the desired register.
	    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
	  }

	  if (NumElts <= 8) {
	    // A subset, assume that we have only AVX-512F
	    // Load the mask as a plain byte and reinterpret it as v8i1.
	    SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
	                               Ld->getBasePtr(),
	                               Ld->getMemOperand());
	    // Replace chain users with the new chain.
	    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	    SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);

	    if (NumElts == 8)
	      return DAG.getNode(ExtOpcode, dl, VT, BitVec);

	    if (Subtarget.hasVLX()) {
	      // Extract to v4i1/v2i1.
	      SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
	                                    DAG.getIntPtrConstant(0, dl));
	      // Finally, extend (sign or zero, per ExtOpcode) to the desired register.
	      return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
	    }

	    // Without VLX: extend all 8 elements, then keep the low NumElts.
	    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(VT == MVT::v32i8 && "Unexpected extload type");

	  // Split into two v16i1 loads two bytes apart, extend each half to v16i8
	  // and concatenate.
	  SDValue BasePtr = Ld->getBasePtr();
	  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
	                               Ld->getBasePtr(),
	                               Ld->getMemOperand());

	  SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

	  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
	                               Ld->getPointerInfo().getWithOffset(2),
	                               MinAlign(Ld->getAlignment(), 2U),
	                               Ld->getMemOperand()->getFlags());

	  // Chain users of the original load must now wait for both halves.
	  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                                 LoadLo.getValue(1), LoadHi.getValue(1));
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

	  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
	  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
	}

	// Lower vector extended loads using a shuffle. If SSSE3 is not available we
	// may emit an illegal shuffle but the expansion is still better than scalar
	// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
	// we'll emit a shuffle and a arithmetic shift.
	// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
	// TODO: It is possible to support ZExt by zeroing the undef values during
	// the shuffle phase or after the shuffle.
	//
	// Loads whose memory element type is i1 are forwarded to
	// LowerExtended1BitVectorLoad. In every path the chain result of the original
	// load is redirected to the chain(s) of the replacement load(s).
	static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  MVT RegVT = Op.getSimpleValueType();
	  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
	  assert(RegVT.isInteger() &&
	         "We only custom lower integer vector sext loads.");

	  // Nothing useful we can do without SSE2 shuffles.
	  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

	  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	  SDLoc dl(Ld);
	  EVT MemVT = Ld->getMemoryVT();
	  if (MemVT.getScalarType() == MVT::i1)
	    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned RegSz = RegVT.getSizeInBits();

	  ISD::LoadExtType Ext = Ld->getExtensionType();

	  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
	         && "Only anyext and sext are currently implemented.");
	  assert(MemVT != RegVT && "Cannot extend to the same type");
	  assert(MemVT.isVector() && "Must load a vector from memory");

	  unsigned NumElems = RegVT.getVectorNumElements();
	  unsigned MemSz = MemVT.getSizeInBits();
	  assert(RegSz > MemSz && "Register size must be greater than the mem size");

	  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
	    // The only way in which we have a legal 256-bit vector result but not the
	    // integer 256-bit operations needed to directly lower a sextload is if we
	    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
	    // a 128-bit vector and a normal sign_extend to 256-bits that should get
	    // correctly legalized. We do this late to allow the canonical form of
	    // sextload to persist throughout the rest of the DAG combiner -- it wants
	    // to fold together any extensions it can, and so will fuse a sign_extend
	    // of an sextload into a sextload targeting a wider value.
	    SDValue Load;
	    if (MemSz == 128) {
	      // Just switch this to a normal load.
	      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
	                                       "it must be a legal 128-bit vector "
	                                       "type!");
	      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
	                         Ld->getPointerInfo(), Ld->getAlignment(),
	                         Ld->getMemOperand()->getFlags());
	    } else {
	      assert(MemSz < 128 &&
	             "Can't extend a type wider than 128 bits to a 256 bit vector!");
	      // Do an sext load to a 128-bit vector type. We want to use the same
	      // number of elements, but elements half as wide. This will end up being
	      // recursively lowered by this routine, but will succeed as we definitely
	      // have all the necessary features if we're using AVX1.
	      EVT HalfEltVT =
	          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
	      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
	      Load =
	          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
	                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
	                         Ld->getMemOperand()->getFlags());
	    }

	    // Replace chain users with the new chain.
	    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	    // Finally, do a normal sign-extend to the desired register.
	    return DAG.getSExtOrTrunc(Load, dl, RegVT);
	  }

	  // All sizes must be a power of two.
	  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
	         "Non-power-of-two elements are not custom lowered!");

	  // Attempt to load the original value using scalar loads.
	  // Find the largest scalar type that divides the total loaded size.
	  MVT SclrLoadTy = MVT::i8;
	  for (MVT Tp : MVT::integer_valuetypes()) {
	    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
	      SclrLoadTy = Tp;
	    }
	  }

	  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
	      (64 <= MemSz))
	    SclrLoadTy = MVT::f64;

	  // Calculate the number of scalar loads that we need to perform
	  // in order to load our vector from memory.
	  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

	  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
	         "Can only lower sext loads with a single scalar load!");

	  // Width of the vector the scalar loads are assembled into. Sign-extending
	  // loads into 256-bit or wider registers are assembled in 128 bits.
	  unsigned LoadRegSize = RegSz;
	  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
	    LoadRegSize = 128;

	  // If we don't have BWI we won't be able to create the shuffle needed for
	  // v8i8->v8i64.
	  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	      MemVT == MVT::v8i8)
	    LoadRegSize = 128;

	  // Represent our vector as a sequence of elements which are the
	  // largest scalar that we can load.
	  EVT LoadUnitVecVT = EVT::getVectorVT(
	      *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());

	  // Represent the data using the same element type that is stored in
	  // memory. In practice, we ''widen'' MemVT.
	  EVT WideVecVT =
	      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                       LoadRegSize / MemVT.getScalarSizeInBits());

	  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
	         "Invalid vector type");

	  // We can't shuffle using an illegal type.
	  assert(TLI.isTypeLegal(WideVecVT) &&
	         "We only lower types that form legal widened vector types");

	  SmallVector<SDValue, 8> Chains;
	  SDValue Ptr = Ld->getBasePtr();
	  const unsigned SclrLoadBytes = SclrLoadTy.getSizeInBits() / 8;
	  SDValue Increment = DAG.getConstant(SclrLoadBytes, dl,
	                                      TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

	  for (unsigned i = 0; i < NumLoads; ++i) {
	    // Perform a single load. Each load reads SclrLoadBytes further into the
	    // original memory location, so its pointer info and alignment must be
	    // adjusted by that byte offset rather than reusing those of the base.
	    const unsigned Offset = i * SclrLoadBytes;
	    SDValue ScalarLoad =
	        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
	                    Ld->getPointerInfo().getWithOffset(Offset),
	                    MinAlign(Ld->getAlignment(), Offset),
	                    Ld->getMemOperand()->getFlags());
	    Chains.push_back(ScalarLoad.getValue(1));
	    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
	    // another round of DAGCombining.
	    if (i == 0)
	      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
	    else
	      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
	                        ScalarLoad, DAG.getIntPtrConstant(i, dl));

	    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
	  }

	  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

	  // Bitcast the loaded value to a vector of the original element type, in
	  // the size of the target vector type.
	  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
	  unsigned SizeRatio = RegSz / MemSz;

	  if (Ext == ISD::SEXTLOAD) {
	    // If we have SSE4.1, we can directly emit a VSEXT node.
	    if (Subtarget.hasSSE41()) {
	      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
	      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	      return Sext;
	    }

	    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
	    // lanes.
	    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
	           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");

	    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	    return Shuff;
	  }

	  // v8i8 -> v8i64 any-extend without BWI: use a zero-extend-in-vector node
	  // instead of the byte shuffle below.
	  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	      MemVT == MVT::v8i8) {
	    SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	    return Sext;
	  }

	  // Redistribute the loaded elements into the different locations.
	  // Element i goes to the first narrow slot of wide lane i; the remaining
	  // slots of each lane stay undefined (-1).
	  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	  for (unsigned i = 0; i != NumElems; ++i)
	    ShuffleVec[i * SizeRatio] = i;

	  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
	                                       DAG.getUNDEF(WideVecVT), ShuffleVec);

	  // Bitcast to the requested type.
	  Shuff = DAG.getBitcast(RegVT, Shuff);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	  return Shuff;
	}

	/// Check whether \p Op is an ISD::AND or ISD::OR whose two operands are both
	/// X86ISD::SETCC nodes with exactly one use each.
	///
	/// \p Opc is always overwritten with the opcode of \p Op, including when the
	/// function returns false.
	static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
	  Opc = Op.getOpcode();
	  const bool IsLogicOp = Opc == ISD::OR || Opc == ISD::AND;
	  if (!IsLogicOp)
	    return false;

	  auto IsSingleUseSetCC = [](SDValue V) {
	    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
	  };
	  if (!IsSingleUseSetCC(Op.getOperand(0)))
	    return false;
	  return IsSingleUseSetCC(Op.getOperand(1));
	}

	/// Check whether \p Op has the form (xor (X86ISD::SETCC ...), 1) where the
	/// SETCC operand has exactly one use.
	static bool isXor1OfSetCC(SDValue Op) {
	  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
	    return false;

	  SDValue SetCC = Op.getOperand(0);
	  return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
	}

	/// Lower a conditional branch to X86ISD::BRCOND.
	///
	/// Operands of \p Op are (chain, condition, destination). The routine tries to
	/// reuse an existing flag-producing node as the branch condition (X86 SETCC
	/// inputs, overflow arithmetic, AND/OR of two SETCCs on the same compare,
	/// xor-with-1 of a SETCC, FP SETOEQ/SETUNE, AND matched to BT). If none of
	/// these applies, an explicit test of the condition against zero is emitted.
	/// Some patterns emit an additional X86ISD::BRCOND on the chain and rewrite
	/// the unconditional ISD::BR that uses this node.
	SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  bool addTest = true;
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);
	  SDValue CC;
	  bool Inverted = false;

	  if (Cond.getOpcode() == ISD::SETCC) {
	    // Check for setcc([su]{add,sub,mul}o == 0).
	    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        Cond.getOperand(0).getResNo() == 1 &&
	        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
	         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
	      // Branching on "overflow bit == 0": use the overflow node itself and
	      // remember to invert the condition code later.
	      Inverted = true;
	      Cond = Cond.getOperand(0);
	    } else {
	      if (SDValue NewCond = LowerSETCC(Cond, DAG))
	        Cond = NewCond;
	    }
	  }
	#if 0
	  // FIXME: LowerXALUO doesn't handle these!!
	  else if (Cond.getOpcode() == X86ISD::ADD  ||
	           Cond.getOpcode() == X86ISD::SUB  ||
	           Cond.getOpcode() == X86ISD::SMUL ||
	           Cond.getOpcode() == X86ISD::UMUL)
	    Cond = LowerXALUO(Cond, DAG);
	#endif

	  // Look past (and (setcc_carry (cmp ...)), 1).
	  if (Cond.getOpcode() == ISD::AND &&
	      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	      isOneConstant(Cond.getOperand(1)))
	    Cond = Cond.getOperand(0);

	  // If condition flag is set by a X86ISD::CMP, then use it as the condition
	  // setting operand in place of the X86ISD::SETCC.
	  unsigned CondOpcode = Cond.getOpcode();
	  if (CondOpcode == X86ISD::SETCC ||
	      CondOpcode == X86ISD::SETCC_CARRY) {
	    CC = Cond.getOperand(0);

	    SDValue Cmp = Cond.getOperand(1);
	    unsigned Opc = Cmp.getOpcode();
	    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
	    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
	      Cond = Cmp;
	      addTest = false;
	    } else {
	      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
	      default: break;
	      case X86::COND_O:
	      case X86::COND_B:
	        // These can only come from an arithmetic instruction with overflow,
	        // e.g. SADDO, UADDO.
	        Cond = Cond.getOperand(1);
	        addTest = false;
	        break;
	      }
	    }
	  }
	  CondOpcode = Cond.getOpcode();
	  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
	      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
	      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
	       Cond.getOperand(0).getValueType() != MVT::i8)) {
	    SDValue LHS = Cond.getOperand(0);
	    SDValue RHS = Cond.getOperand(1);
	    unsigned X86Opcode;
	    unsigned X86Cond;
	    SDVTList VTs;
	    // Keep this in sync with LowerXALUO, otherwise we might create redundant
	    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
	    // X86ISD::INC).
	    switch (CondOpcode) {
	    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	    case ISD::SADDO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	    case ISD::SSUBO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	    default: llvm_unreachable("unexpected overflowing operator");
	    }
	    if (Inverted)
	      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
	    // X86ISD::UMUL produces two value results before the flags result.
	    if (CondOpcode == ISD::UMULO)
	      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	                          MVT::i32);
	    else
	      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

	    // The flags are the last result of the new node.
	    if (CondOpcode == ISD::UMULO)
	      Cond = X86Op.getValue(2);
	    else
	      Cond = X86Op.getValue(1);

	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    addTest = false;
	  } else {
	    unsigned CondOpc;
	    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
	      SDValue Cmp = Cond.getOperand(0).getOperand(1);
	      if (CondOpc == ISD::OR) {
	        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
	        // two branches instead of an explicit OR instruction with a
	        // separate test.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp)) {
	          CC = Cond.getOperand(0).getOperand(0);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = Cond.getOperand(1).getOperand(0);
	          Cond = Cmp;
	          addTest = false;
	        }
	      } else { // ISD::AND
	        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
	        // two branches instead of an explicit AND instruction with a
	        // separate test. However, we only do this if this block doesn't
	        // have a fall-through edge, because this requires an explicit
	        // jmp when the condition is false.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp) &&
	            Op.getNode()->hasOneUse()) {
	          X86::CondCode CCode =
	            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	          CCode = X86::GetOppositeBranchCondition(CCode);
	          CC = DAG.getConstant(CCode, dl, MVT::i8);
	          SDNode *User = *Op.getNode()->use_begin();
	          // Look for an unconditional branch following this conditional branch.
	          // We need this because we need to reverse the successors in order
	          // to implement FCMP_OEQ.
	          if (User->getOpcode() == ISD::BR) {
	            SDValue FalseBB = User->getOperand(1);
	            SDNode *NewBR =
	              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	            assert(NewBR == User);
	            (void)NewBR;
	            Dest = FalseBB;

	            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                                Chain, Dest, CC, Cmp);
	            X86::CondCode CCode =
	              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
	            CCode = X86::GetOppositeBranchCondition(CCode);
	            CC = DAG.getConstant(CCode, dl, MVT::i8);
	            Cond = Cmp;
	            addTest = false;
	          }
	        }
	      }
	    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
	      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
	      // It should be transformed during dag combiner except when the condition
	      // is set by a arithmetics with overflow node.
	      X86::CondCode CCode =
	        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	      CCode = X86::GetOppositeBranchCondition(CCode);
	      CC = DAG.getConstant(CCode, dl, MVT::i8);
	      Cond = Cond.getOperand(0).getOperand(1);
	      addTest = false;
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
	      // For FCMP_OEQ, we can emit
	      // two branches instead of an explicit AND instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_OEQ.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;
	          Dest = FalseBB;

	          // Branch to the false block on "not equal", then (below) on
	          // "parity"; otherwise fall through to the rewritten BR.
	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	        }
	      }
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
	      // For FCMP_UNE, we can emit
	      // two branches instead of an explicit OR instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_UNE.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;

	          // Branch to the original destination on "not equal", then (below)
	          // to the false block on "no parity"; otherwise fall through to the
	          // rewritten BR, which now targets the original destination.
	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	          Dest = FalseBB;
	        }
	      }
	    }
	  }

	  if (addTest) {
	    // Look past the truncate if the high bits are known zero.
	    if (isTruncWithZeroHighBitsInput(Cond, DAG))
	      Cond = Cond.getOperand(0);

	    // We know the result of AND is compared against zero. Try to match
	    // it to BT.
	    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
	        CC = NewSetCC.getOperand(0);
	        Cond = NewSetCC.getOperand(1);
	        addTest = false;
	      }
	    }
	  }

	  // Nothing matched: test the condition value against zero explicitly.
	  if (addTest) {
	    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    Cond = EmitTest(Cond, X86Cond, dl, DAG);
	  }
	  Cond = ConvertCmpIfNecessary(Cond, DAG);
	  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                     Chain, Dest, CC, Cond);
	}

	// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
	// Calls to _alloca are needed to probe the stack when allocating more than 4k
	// bytes in one go. Touching the stack at 4K increments is necessary to ensure
	// that the guard pages used by the OS virtual memory manager are allocated in
	// correct sequence.
	//
	// Three strategies are implemented:
	//  * no special lowering needed: subtract the size from the stack pointer
	//    register and re-align if the requested alignment exceeds the stack's;
	//  * split stacks: emit X86ISD::SEG_ALLOCA;
	//  * otherwise (Windows non-MachO, or a stack probe symbol is set): emit
	//    X86ISD::WIN_ALLOCA and read back / re-align the stack pointer.
	// The result is the pair (allocated address, chain), wrapped in a
	// CALLSEQ_START / CALLSEQ_END pair.
	SDValue
	X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  bool SplitStack = MF.shouldSplitStack();
	  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
	  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
	               SplitStack || EmitStackProbe;
	  SDLoc dl(Op);

	  // Get the inputs.
	  SDNode *Node = Op.getNode();
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size  = Op.getOperand(1);
	  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  EVT VT = Node->getValueType(0);

	  // Chain the dynamic stack allocation so that it doesn't modify the stack
	  // pointer when other instructions are using the stack.
	  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	  bool Is64Bit = Subtarget.is64Bit();
	  MVT SPTy = getPointerTy(DAG.getDataLayout());

	  SDValue Result;
	  if (!Lower) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	                    " not tell us which reg is the stack pointer!");

	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	    Chain = SP.getValue(1);
	    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	    unsigned StackAlign = TFI.getStackAlignment();
	    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
	    // Round down to the requested alignment only when it is stricter than
	    // the stack alignment.
	    if (Align > StackAlign)
	      Result = DAG.getNode(ISD::AND, dl, VT, Result,
	                           DAG.getConstant(-(uint64_t)Align, dl, VT));
	    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
	  } else if (SplitStack) {
	    MachineRegisterInfo &MRI = MF.getRegInfo();

	    if (Is64Bit) {
	      // The 64 bit implementation of segmented stacks needs to clobber both r10
	      // r11. This makes it impossible to use it along with nested parameters.
	      const Function &F = MF.getFunction();
	      for (const auto &A : F.args()) {
	        if (A.hasNestAttr())
	          report_fatal_error("Cannot use segmented stacks with functions that "
	                             "have nested arguments.");
	      }
	    }

	    // Pass the size through a fresh virtual register.
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
	    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
	                         DAG.getRegister(Vreg, SPTy));
	  } else {
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
	    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

	    // Read the stack pointer back after the allocation node.
	    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	    unsigned SPReg = RegInfo->getStackRegister();
	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
	    Chain = SP.getValue(1);

	    if (Align) {
	      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	                       DAG.getConstant(-(uint64_t)Align, dl, VT));
	      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
	    }

	    Result = SP;
	  }

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  SDValue Ops[2] = {Result, Chain};
	  return DAG.getMergeValues(Ops, dl);
	}

	/// Lower ISD::VASTART.
	///
	/// For 32-bit targets and the Win64 calling convention a single store of the
	/// varargs frame-index address into the va_list is emitted. Otherwise the
	/// four fields of __va_list_tag (gp_offset, fp_offset, overflow_arg_area,
	/// reg_save_area) are stored one after another, each on the incoming chain,
	/// and joined with a TokenFactor.
	SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDLoc DL(Op);

	  if (!Subtarget.is64Bit() ||
	      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
	    // vastart just stores the address of the VarArgsFrameIndex slot into the
	    // memory location argument.
	    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
	                        MachinePointerInfo(SV));
	  }

	  // __va_list_tag:
	  //   gp_offset         (0 - 6 * 8)
	  //   fp_offset         (48 - 48 + 8 * 16)
	  //   overflow_arg_area (point to parameters coming in memory).
	  //   reg_save_area
	  SmallVector<SDValue, 8> MemOps;
	  SDValue FIN = Op.getOperand(1);
	  // Store gp_offset
	  SDValue Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV));
	  MemOps.push_back(Store);

	  // Store fp_offset
	  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV, 4));
	  MemOps.push_back(Store);

	  // Store ptr to overflow_arg_area
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
	  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	  Store =
	      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
	  MemOps.push_back(Store);

	  // Store ptr to reg_save_area.
	  // The previous field is pointer-sized: 8 bytes for LP64, 4 otherwise.
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
	      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
	  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL, RSFIN, FIN,
	      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
	  MemOps.push_back(Store);
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Lower a va_arg node for 64-bit targets.
	///
	/// Operand 0 is the chain, operand 1 the va_list address, operand 2 the
	/// SrcValue for that memory and operand 3 a constant alignment.
	///
	/// Win64 calling conventions are delegated to SelectionDAG::expandVAArg.
	/// Otherwise an X86ISD::VAARG_64 node is created; it yields the address of
	/// the next argument plus a chain, and the argument is then loaded from that
	/// address.
	SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.is64Bit() &&
	         "LowerVAARG only handles 64-bit va_arg!");
	  assert(Op.getNumOperands() == 4);

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
	    // The Win64 ABI uses char* instead of a structure.
	    return DAG.expandVAArg(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue SrcPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  unsigned Align = Op.getConstantOperandVal(3);
	  SDLoc dl(Op);

	  EVT ArgVT = Op.getNode()->getValueType(0);
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
	  uint8_t ArgMode;

	  // Decide which area this value should be read from.
	  // TODO: Implement the AMD64 ABI in its entirety. This simple
	  // selection mechanism works only for the basic types.
	  if (ArgVT == MVT::f80) {
	    llvm_unreachable("va_arg for f80 not yet implemented");
	  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
	    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
	  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
	    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
	  } else {
	    llvm_unreachable("Unhandled argument type in LowerVAARG");
	  }

	  if (ArgMode == 2) {
	    // Sanity Check: Make sure using fp_offset makes sense.
	    assert(!Subtarget.useSoftFloat() &&
	           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
	           Subtarget.hasSSE1());
	  }

	  // Insert VAARG_64 node into the DAG
	  // VAARG_64 returns two values: Variable Argument Address, Chain
	  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
	                       DAG.getConstant(ArgMode, dl, MVT::i8),
	                       DAG.getConstant(Align, dl, MVT::i32)};
	  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
	  // The node is flagged as both a load and a store of the va_list memory.
	  SDValue VAARG = DAG.getMemIntrinsicNode(
	      X86ISD::VAARG_64, dl,
	      VTs, InstOps, MVT::i64,
	      MachinePointerInfo(SV),
	      /*Align=*/0,
	      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
	  Chain = VAARG.getValue(1);

	  // Load the next argument and return it
	  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
	}

	/// Lower a va_copy node for 64-bit targets.
	///
	/// Operand 0 is the chain, operands 1 and 2 the destination and source
	/// va_list addresses, operands 3 and 4 their SrcValues. Win64 calling
	/// conventions are delegated to SelectionDAG::expandVACopy; otherwise the
	/// va_list is copied with a 24-byte, 8-byte-aligned memcpy.
	static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
	  // where a va_list is still an i8*.
	  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
	  if (Subtarget.isCallingConvWin64(
	          DAG.getMachineFunction().getFunction().getCallingConv()))
	    // Probably a Win64 va_copy.
	    return DAG.expandVACopy(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
	                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
	                       false, false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	/// Handle vector element shifts where the shift amount is a constant.
	/// Takes immediate version of shift as input.
	///
	/// \p Opc must be X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI. \p SrcOp is
	/// bitcast to \p VT when its type differs. A zero shift returns the source
	/// unchanged; a shift of at least the element width yields a zero vector for
	/// the logical shifts and is clamped to (element width - 1) for VSRAI. When
	/// \p SrcOp is a build vector of constants/undefs the shift is folded into a
	/// new build vector; otherwise an \p Opc node with an i8 immediate is
	/// returned.
	static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                          SDValue SrcOp, uint64_t ShiftAmt,
	                                          SelectionDAG &DAG) {
	  MVT ElementType = VT.getVectorElementType();

	  // Bitcast the source vector to the output type, this is mainly necessary for
	  // vXi8/vXi64 shifts.
	  if (VT != SrcOp.getSimpleValueType())
	    SrcOp = DAG.getBitcast(VT, SrcOp);

	  // Fold this packed shift into its first operand if ShiftAmt is 0.
	  if (ShiftAmt == 0)
	    return SrcOp;

	  // Check for ShiftAmt >= element width
	  if (ShiftAmt >= ElementType.getSizeInBits()) {
	    if (Opc == X86ISD::VSRAI)
	      ShiftAmt = ElementType.getSizeInBits() - 1;
	    else
	      return DAG.getConstant(0, dl, VT);
	  }

	  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
	         && "Unknown target vector shift-by-constant node");

	  // Fold this packed vector shift into a build vector if SrcOp is a
	  // vector of Constants or UNDEFs.
	  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
	    SmallVector<SDValue, 8> Elts;
	    unsigned NumElts = SrcOp->getNumOperands();
	    ConstantSDNode *ND;

	    // Undef elements are passed through untouched; constant elements are
	    // shifted with the APInt operation matching the opcode.
	    switch(Opc) {
	    default: llvm_unreachable("Unknown opcode!");
	    case X86ISD::VSHLI:
	      for (unsigned i=0; i!=NumElts; ++i) {
	        SDValue CurrentOp = SrcOp->getOperand(i);
	        if (CurrentOp->isUndef()) {
	          Elts.push_back(CurrentOp);
	          continue;
	        }
	        ND = cast<ConstantSDNode>(CurrentOp);
	        const APInt &C = ND->getAPIntValue();
	        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
	      }
	      break;
	    case X86ISD::VSRLI:
	      for (unsigned i=0; i!=NumElts; ++i) {
	        SDValue CurrentOp = SrcOp->getOperand(i);
	        if (CurrentOp->isUndef()) {
	          Elts.push_back(CurrentOp);
	          continue;
	        }
	        ND = cast<ConstantSDNode>(CurrentOp);
	        const APInt &C = ND->getAPIntValue();
	        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
	      }
	      break;
	    case X86ISD::VSRAI:
	      for (unsigned i=0; i!=NumElts; ++i) {
	        SDValue CurrentOp = SrcOp->getOperand(i);
	        if (CurrentOp->isUndef()) {
	          Elts.push_back(CurrentOp);
	          continue;
	        }
	        ND = cast<ConstantSDNode>(CurrentOp);
	        const APInt &C = ND->getAPIntValue();
	        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
	      }
	      break;
	    }

	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  return DAG.getNode(Opc, dl, VT, SrcOp,
	                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
	}

	/// Handle vector element shifts where the shift amount may or may not be a
	/// constant. Takes immediate version of shift as input.
	///
	/// \p ShAmt must be an i32 or i64 scalar. A constant amount is forwarded to
	/// getTargetVShiftByConstNode. Otherwise \p Opc is switched to its
	/// non-immediate counterpart (VSHL/VSRL/VSRA) and the amount is placed in a
	/// 128-bit vector, which is bitcast to a vector with the same element type
	/// as \p VT and used as the second operand of the shift.
	static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                   SDValue SrcOp, SDValue ShAmt,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT SVT = ShAmt.getSimpleValueType();
	  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

	  // Catch shift-by-constant.
	  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
	    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
	                                      CShAmt->getZExtValue(), DAG);

	  // Change opcode to non-immediate version
	  switch (Opc) {
	    default: llvm_unreachable("Unknown target vector shift node");
	    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
	    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
	    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
	  }

	  // Need to build a vector containing shift amount.
	  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
	  // +=================+============+=======================================+
	  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
	  // +=================+============+=======================================+
	  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
	  // | i32             | Yes        | zero-extend in-reg                    |
	  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
	  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
	  // +=================+============+=======================================+

	  if (SVT == MVT::i64)
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
	  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
	           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
	    // Look through the zext and extend the original i16 inside the vector.
	    ShAmt = ShAmt.getOperand(0);
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
	    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	  } else if (Subtarget.hasSSE41() &&
	             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
	    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	  } else {
	    // Explicit zero in element 1 clears the upper half of the low 64 bits.
	    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
	                        DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
	    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
	  }

	  // The return type has to be a 128-bit type with the same element
	  // type as the input type.
	  MVT EltVT = VT.getVectorElementType();
	  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

	  ShAmt = DAG.getBitcast(ShVT, ShAmt);
	  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
	}

	/// \brief Convert the scalar \p Mask operand of a masking intrinsic into a
	/// value of the vXi1 type \p MaskVT, extending, splitting or truncating the
	/// integer as required.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                           const SDLoc &dl) {
	  // Constant all-ones and zero masks map directly to vXi1 constants.
	  if (isAllOnesConstant(Mask))
	    return DAG.getConstant(1, dl, MaskVT);
	  if (X86::isZeroNode(Mask))
	    return DAG.getConstant(0, dl, MaskVT);

	  const unsigned MaskVTBits = MaskVT.getSizeInBits();

	  // A mask narrower than MaskVT is any-extended to MaskVT's bit width first.
	  if (MaskVT.bitsGT(Mask.getSimpleValueType()))
	    Mask = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::getIntegerVT(MaskVTBits),
	                       Mask);

	  const bool IsI64MaskOn32Bit =
	      Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit();

	  if (!IsI64MaskOn32Bit) {
	    // Common case: reinterpret the integer as a vector of i1 and take the
	    // leading MaskVT-sized piece. When MaskVT equals v2i1 or v4i1, the low 2
	    // or 4 elements are extracted by EXTRACT_SUBVECTOR.
	    MVT WholeMaskVT = MVT::getVectorVT(
	        MVT::i1, Mask.getSimpleValueType().getSizeInBits());
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
	                       DAG.getBitcast(WholeMaskVT, Mask),
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (MaskVT != MVT::v64i1) {
	    // MaskVT needs fewer than 64 bits: truncate the i64 (should succeed in
	    // any case) and bitcast the narrower integer.
	    MVT NarrowIntVT = MVT::getIntegerVT(MaskVTBits);
	    return DAG.getBitcast(MaskVT,
	                          DAG.getNode(ISD::TRUNCATE, dl, NarrowIntVT, Mask));
	  }

	  // v64i1 in 32-bit mode: bitcasting an i64 is illegal, so split the mask
	  // into two i32 halves, bitcast each to v32i1 and concatenate them.
	  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(0, dl, MVT::i32));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(1, dl, MVT::i32));
	  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
	  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	}

	/// \brief Return (and \p Op, \p Mask) for instructions that produce a mask
	/// result (compares, VPSHUFBITQMB, VFPCLASS) or
	/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
	/// necessary casting or extending for \p Mask when lowering masking intrinsics
	///
	/// An all-ones constant \p Mask returns \p Op unchanged. An undef
	/// \p PreservedSrc is replaced by a zero vector of \p Op's type.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	  unsigned OpcodeSelect = ISD::VSELECT;
	  SDLoc dl(Op);

	  if (isAllOnesConstant(Mask))
	    return Op;

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  switch (Op.getOpcode()) {
	  default: break;
	  case X86ISD::CMPM:
	  case X86ISD::CMPM_RND:
	  case X86ISD::CMPMU:
	  case X86ISD::VPSHUFBITQMB:
	  case X86ISD::VFPCLASS:
	    // A write-mask can only clear bits of a mask result, so the mask is
	    // ANDed in. VFPCLASS previously used OR here, which let lanes with a
	    // zero mask bit report the unmasked classification (and, for a
	    // constant-zero mask, folded the masking away entirely).
	    // NOTE(review): the instruction-selection patterns for the masked
	    // VFPCLASS form live outside this block - confirm they match an AND.
	    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
	  case X86ISD::VTRUNC:
	  case X86ISD::VTRUNCS:
	  case X86ISD::VTRUNCUS:
	  case X86ISD::CVTPS2PH:
	    // We can't use ISD::VSELECT here because it is not always "Legal"
	    // for the destination type. For example vpmovqb require only AVX512
	    // and vselect that can operate on byte element type require BWI
	    OpcodeSelect = X86ISD::SELECT;
	    break;
	  }
	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
	}

	/// \brief Creates an SDNode for a predicated scalar operation.
	/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc), or
	/// (and \p Op, \p Mask) when \p Op produces a mask result (FSETCCM,
	/// FSETCCM_RND, VFPCLASSS).
	/// The mask is coming as MVT::i8 and it should be transformed
	/// to MVT::v1i1 while lowering masking intrinsics.
	/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
	/// "X86select" instead of "vselect". We just can't create the "vselect" node
	/// for a scalar instruction.
	///
	/// A constant \p Mask with bit 0 set returns \p Op unchanged. An undef
	/// \p PreservedSrc is replaced by a zero vector of \p Op's type.
	static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {

	  // Only bit 0 of the mask matters for a scalar operation.
	  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
	    if (MaskConst->getZExtValue() & 0x1)
	      return Op;

	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);

	  // A write-mask can only clear the single result bit of a mask-producing
	  // node, so the mask is ANDed in. VFPCLASSS previously used OR here, which
	  // returned the unmasked classification whenever the mask bit was zero.
	  // NOTE(review): the instruction-selection patterns for the masked
	  // VFPCLASSS form live outside this block - confirm they match an AND.
	  if (Op.getOpcode() == X86ISD::FSETCCM ||
	      Op.getOpcode() == X86ISD::FSETCCM_RND ||
	      Op.getOpcode() == X86ISD::VFPCLASSS)
	    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);

	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
	}

	static int getSEHRegistrationNodeSize(const Function *Fn) {
	if (!Fn->hasPersonalityFn())
	report_fatal_error(
	"querying registration node size for function without personality");
	// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
	// WinEHStatePass for the full struct definition.
	switch (classifyEHPersonality(Fn->getPersonalityFn())) {
	case EHPersonality::MSVC_X86SEH: return 24;
	case EHPersonality::MSVC_CXX: return 16;
	default: break;
	}
	report_fatal_error(
	"can only recover FP for 32-bit MSVC EH personality functions");
	}

	/// Recover the parent function's frame pointer from the EBP value that is
	/// live when the MSVC runtime transfers control to us (either into an
	/// outlined function or back into a parent frame after catching an
	/// exception).
	///
	/// On 32-bit targets the computation is:
	///   RegNodeBase = EntryEBP - RegNodeSize
	///   ParentFP = RegNodeBase - ParentFrameOffset
	/// Subtracting RegNodeSize takes us to the offset of the registration node,
	/// and subtracting the offset (negative on x86) takes us back to the parent
	/// FP. On 64-bit targets the result is simply EntryEBP + ParentFrameOffset.
	/// If \p Fn has no personality function, \p EntryEBP is returned unchanged.
	static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
	                                   SDValue EntryEBP) {
	  // It's possible that the parent function no longer has a personality
	  // function if the exceptional code was optimized away, in which case we
	  // just return the incoming EBP.
	  if (!Fn->hasPersonalityFn())
	    return EntryEBP;

	  SDLoc dl;
	  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  // Look up the MCSymbol that will ultimately resolve to the parent frame
	  // offset and wrap it in a LOCAL_RECOVER node.
	  auto &Ctx = DAG.getMachineFunction().getMMI().getContext();
	  MCSymbol *FrameOffsetSym = Ctx.getOrCreateParentFrameOffsetSymbol(
	      GlobalValue::dropLLVMManglingEscape(Fn->getName()));
	  SDValue FrameOffsetSymVal = DAG.getMCSymbol(FrameOffsetSym, PtrVT);
	  SDValue ParentFrameOffset =
	      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, FrameOffsetSymVal);

	  // x64: EntryEBP + ParentFrameOffset adjusts from RSP after the prologue
	  // to RBP in the parent function.
	  const auto &ST = static_cast<const X86Subtarget &>(DAG.getSubtarget());
	  if (ST.is64Bit())
	    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

	  // x86: step back over the registration node, then over the frame offset.
	  SDValue RegNodeSizeVal =
	      DAG.getConstant(getSEHRegistrationNodeSize(Fn), dl, PtrVT);
	  SDValue RegNodeBase =
	      DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, RegNodeSizeVal);
	  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
	}

	SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	// Helper to detect if the operand is CUR_DIRECTION rounding mode.
	auto isRoundModeCurDirection = [](SDValue Rnd) {
	if (!isa<ConstantSDNode>(Rnd))
	return false;

	unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
	return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
	};

	SDLoc dl(Op);
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	MVT VT = Op.getSimpleValueType();
	const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
	if (IntrData) {
	switch(IntrData->Type) {
	case INTR_TYPE_1OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
	case INTR_TYPE_2OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2));
	case INTR_TYPE_3OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3));
	case INTR_TYPE_4OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
	case INTR_TYPE_1OP_MASK_RM: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue RoundingMode;
	// We always add rounding mode to the Node.
	// If the rounding mode is not specified, we add the
	// "current direction" mode.
	if (Op.getNumOperands() == 4)
	RoundingMode =
	DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	else
	RoundingMode = Op.getOperand(4);
	assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	RoundingMode),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_1OP_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	bool HasRounding = IntrWithRoundingModeOpcode != 0;
	if (Op.getNumOperands() == (5U + HasRounding)) {
	if (HasRounding) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Rnd),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2),
	Mask, passThru, Subtarget, DAG);
	}

	assert(Op.getNumOperands() == (6U + HasRounding) &&
	"Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	if (HasRounding) {
	SDValue Sae = Op.getOperand(6);
	if (!isRoundModeCurDirection(Sae))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, RoundingMode),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src0 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	if (Op.getNumOperands() == 6) {
	SDValue Sae = Op.getOperand(5);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	Sae),
	Mask, Src0, Subtarget, DAG);
	}
	assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, Src0, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK:
	case INTR_TYPE_2OP_IMM8_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
	Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	// TODO: Intrinsics should have fast-math-flags to propagate.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (6 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 6)
	Rnd = Op.getOperand(5);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Imm = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Imm, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_IMM8_MASK:
	case INTR_TYPE_3OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
	Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_2OP_MASK : {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_3OP_MASKZ:
	case VPERM_3OP_MASK:{
	MVT VT = Op.getSimpleValueType();
	// Src2 is the PassThru
	SDValue Src1 = Op.getOperand(1);
	// PassThru needs to be the same type as the destination in order
	// to pattern match correctly.
	SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == VPERM_3OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else
	PassThru = Src2;

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src1, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_MASK3:
	case FMA_OP_MASKZ:
	case FMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_SCALAR_MASK:
	case FMA_OP_SCALAR_MASK3:
	case FMA_OP_SCALAR_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
	Op.getValueType(), Src1, Src2,
	Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}

	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
	Op.getValueType(), Src1, Src2,
	Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case IFMA_OP_MASKZ:
	case IFMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;

	// set PassThru element
	if (IntrData->Type == IFMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	// Node we need to swizzle the operands to pass the multiply operands
	// first.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src3, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case TERLOG_OP_MASK:
	case TERLOG_OP_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
	SDValue Mask = Op.getOperand(5);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;
	// Set PassThru element.
	if (IntrData->Type == TERLOG_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Src4),
	Mask, PassThru, Subtarget, DAG);
	}
	case CVTPD2PS:
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
	DAG.getIntPtrConstant(0, dl));
	case CVTPD2PS_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	DAG.getIntPtrConstant(0, dl)),
	Mask, PassThru, Subtarget, DAG);
	}
	case FPCLASS: {
	// FPclass intrinsics with mask
	SDValue Src1 = Op.getOperand(1);
	MVT VT = Src1.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
	SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case FPCLASSS: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
	SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case CMP_MASK:
	case CMP_MASK_CC: {
	// Comparison intrinsics with masks.
	// Example of transformation:
	// (i8 (int_x86_avx512_mask_pcmpeq_q_128
	// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
	// (i8 (bitcast
	// (v8i1 (insert_subvector undef,
	// (v2i1 (and (PCMPEQM %a, %b),
	// (extract_subvector
	// (v8i1 (bitcast %mask)), 0))), 0))))
	MVT VT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue Cmp;
	if (IntrData->Type == CMP_MASK_CC) {
	SDValue CC = Op.getOperand(3);
	CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC);

	} else {
	assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2));
	}
	SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CmpMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case CMP_MASK_SCALAR_CC: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
	SDValue Mask = Op.getOperand(4);

	SDValue Cmp;
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

	SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case COMI: { // Comparison intrinsics
	ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
	SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
	SDValue SetCC;
	switch (CC) {
	case ISD::SETEQ: { // (ZF = 0 and PF = 0)
	SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
	SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
	break;
	}
	case ISD::SETNE: { // (ZF = 1 or PF = 1)
	SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
	SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
	break;
	}
	case ISD::SETGT: // (CF = 0 and ZF = 0)
	SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
	break;
	case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
	SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
	break;
	}
	case ISD::SETGE: // CF = 0
	SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
	break;
	case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
	SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
	break;
	default:
	llvm_unreachable("Unexpected illegal condition!");
	}
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case COMI_RM: { // Comparison intrinsics with Sae
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	SDValue Sae = Op.getOperand(4);

	SDValue FCmp;
	if (isRoundModeCurDirection(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8));
	else
	FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8), Sae);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
	DAG.getIntPtrConstant(0, dl));
	}
	case VSHIFT:
	return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
	Op.getOperand(1), Op.getOperand(2), Subtarget,
	DAG);
	case COMPRESS_EXPAND_IN_REG: {
	SDValue Mask = Op.getOperand(3);
	SDValue DataToCompress = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	if (isAllOnesConstant(Mask)) // return data as is
	return Op.getOperand(1);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	DataToCompress),
	Mask, PassThru, Subtarget, DAG);
	}
	case BROADCASTM: {
	SDValue Mask = Op.getOperand(1);
	MVT MaskVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	Mask = DAG.getBitcast(MaskVT, Mask);
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
	}
	case MASK_BINOP: {
	MVT VT = Op.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
	SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
	SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
	return DAG.getBitcast(VT, Res);
	}
	case FIXUPIMMS:
	case FIXUPIMMS_MASKZ:
	case FIXUPIMM:
	case FIXUPIMM_MASKZ:{
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Imm = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Passthru = (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMMS ) ?
	Src1 : getZeroVector(VT, Subtarget, DAG, dl);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	if (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMM_MASKZ)
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	}
	case CONVERT_TO_MASK: {
	MVT SrcVT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
	Op.getOperand(1));
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CvtMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case ROUNDP: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(2),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), RoundingMode);
	}
	case ROUNDS: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(3),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Op.getOperand(2), RoundingMode);
	}
	default:
	break;
	}
	}

	switch (IntNo) {
	default: return SDValue(); // Don't custom lower most intrinsics.

	case Intrinsic::x86_avx2_permd:
	case Intrinsic::x86_avx2_permps:
	// Operands intentionally swapped. Mask is last operand to intrinsic,
	// but second operand for node/instruction.
	return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
	Op.getOperand(2), Op.getOperand(1));

	// ptest and testp intrinsics. The intrinsic these come from are designed to
	// return an integer value, not just an instruction so lower it to the ptest
	// or testp pattern and a setcc for the result.
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestz_256:
	case Intrinsic::x86_avx_ptestc_256:
	case Intrinsic::x86_avx_ptestnzc_256:
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256: {
	bool IsTestPacked = false;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_avx_ptestz_256:
	// ZF = 1
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_avx_ptestc_256:
	// CF = 1
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestnzc_256:
	// ZF and CF = 0
	X86CC = X86::COND_A;
	break;
	}

	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
	SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case Intrinsic::x86_avx512_kortestz_w:
	case Intrinsic::x86_avx512_kortestc_w: {
	X86::CondCode X86CC =
	(IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_avx512_knot_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kandn_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	// Invert LHS for the not.
	LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
	DAG.getConstant(1, dl, MVT::v16i1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kxnor_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	// Invert result for the not.
	Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
	DAG.getConstant(1, dl, MVT::v16i1));
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_sse42_pcmpistria128:
	case Intrinsic::x86_sse42_pcmpestria128:
	case Intrinsic::x86_sse42_pcmpistric128:
	case Intrinsic::x86_sse42_pcmpestric128:
	case Intrinsic::x86_sse42_pcmpistrio128:
	case Intrinsic::x86_sse42_pcmpestrio128:
	case Intrinsic::x86_sse42_pcmpistris128:
	case Intrinsic::x86_sse42_pcmpestris128:
	case Intrinsic::x86_sse42_pcmpistriz128:
	case Intrinsic::x86_sse42_pcmpestriz128: {
	unsigned Opcode;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_sse42_pcmpistria128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpestria128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpistric128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpestric128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpistrio128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpestrio128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpistris128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpestris128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpistriz128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_sse42_pcmpestriz128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_E;
	break;
	}
	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
	SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_sse42_pcmpistri128:
	case Intrinsic::x86_sse42_pcmpestri128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
	Opcode = X86ISD::PCMPISTRI;
	else
	Opcode = X86ISD::PCMPESTRI;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps);
	}

	case Intrinsic::eh_sjlj_lsda: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	auto &Context = MF.getMMI().getContext();
	MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
	Twine(MF.getFunctionNumber()));
	return DAG.getNode(getGlobalWrapperKind(), dl, VT,
	DAG.getMCSymbol(S, PtrVT));
	}

	case Intrinsic::x86_seh_lsda: {
	// Compute the symbol for the LSDA. We know it'll get emitted later.
	MachineFunction &MF = DAG.getMachineFunction();
	SDValue Op1 = Op.getOperand(1);
	auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
	MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()));

	// Generate a simple absolute symbol reference. This intrinsic is only
	// supported on 32-bit Windows, which isn't PIC.
	SDValue Result = DAG.getMCSymbol(LSDASym, VT);
	return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
	}

	case Intrinsic::x86_seh_recoverfp: {
	SDValue FnOp = Op.getOperand(1);
	SDValue IncomingFPOp = Op.getOperand(2);
	GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	if (!Fn)
	report_fatal_error(
	"llvm.x86.seh.recoverfp must take a function as the first argument");
	return recoverFramePointer(DAG, Fn, IncomingFPOp);
	}

	case Intrinsic::localaddress: {
	// Returns one of the stack, base, or frame pointer registers, depending on
	// which is used to reference local variables.
	MachineFunction &MF = DAG.getMachineFunction();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned Reg;
	if (RegInfo->hasBasePointer(MF))
	Reg = RegInfo->getBaseRegister();
	else // This function handles the SP or FP case.
	Reg = RegInfo->getPtrSizedFrameRegister(MF);
	return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
	}
	}
	}

	/// Build the machine node for an AVX2 gather intrinsic.
	///
	/// \p Opc is the machine opcode to emit; \p Op is the intrinsic node being
	/// lowered and supplies both the debug location and the result value type.
	/// \p Mask is passed to the instruction unchanged, and its type is also the
	/// type of the node's second result.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant. Otherwise
	/// returns the merge of result 0 (typed as \p Op's value type) and result 2
	/// (MVT::Other) of the new machine node.
	static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                                 SDValue Src, SDValue Mask, SDValue Base,
	                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
	                                 const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
	  // Scale must be constant.
	  if (!C)
	    return SDValue();
	  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
	  EVT MaskVT = Mask.getValueType();
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);
	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
	  // Operand order used for this node: pass-through source, the memory
	  // reference pieces (base, scale, index, displacement, segment), the mask,
	  // and finally the chain.
	  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
	  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
	  // Result 1 (the MaskVT-typed output) is not forwarded to the caller.
	  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Build the machine node for a masked gather intrinsic whose mask is
	/// converted to a vXi1 vector.
	///
	/// \p Opc is the machine opcode to emit; \p Op is the intrinsic node being
	/// lowered and supplies both the debug location and the result value type.
	/// \p Mask is converted with getMaskNode to a vector of i1 that has one
	/// element per element of \p Index.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant. Otherwise
	/// returns the merge of result 0 (typed as \p Op's value type) and result 2
	/// (MVT::Other) of the new machine node.
	static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                             SDValue Src, SDValue Mask, SDValue Base,
	                             SDValue Index, SDValue ScaleOp, SDValue Chain,
	                             const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
	  // Scale must be constant.
	  if (!C)
	    return SDValue();
	  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
	  MVT MaskVT = MVT::getVectorVT(MVT::i1,
	                                Index.getSimpleValueType().getVectorNumElements());

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);
	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
	  // Operand order used for this node: pass-through source, the mask, the
	  // memory reference pieces (base, scale, index, displacement, segment), and
	  // finally the chain.
	  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
	  // Result 1 (the MaskVT-typed output) is not forwarded to the caller.
	  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Build the machine node for a masked scatter intrinsic.
	///
	/// \p Opc is the machine opcode to emit and \p Op supplies the debug
	/// location. \p Mask is converted with getMaskNode to a vector of i1 that has
	/// one element per element of \p Index.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant; otherwise
	/// result 1 (MVT::Other) of the new machine node.
	static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                              SDValue Src, SDValue Mask, SDValue Base,
	                              SDValue Index, SDValue ScaleOp, SDValue Chain,
	                              const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Only a constant scale can be lowered here.
	  ConstantSDNode *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  // Pieces of the memory reference.
	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  // One i1 mask bit per index element.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDVTList ResultTys = DAG.getVTList(MaskVT, MVT::Other);
	  SDValue NodeOps[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
	  SDNode *Scatter = DAG.getMachineNode(Opc, dl, ResultTys, NodeOps);
	  return SDValue(Scatter, 1);
	}

	/// Build the machine node for a masked gather/scatter prefetch intrinsic.
	///
	/// \p Opc is the machine opcode to emit and \p Op supplies the debug
	/// location. \p Mask is converted with getMaskNode to a vector of i1 that has
	/// one element per element of \p Index.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant; otherwise
	/// result 0 of the new machine node, whose only result type is MVT::Other.
	static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                               SDValue Mask, SDValue Base, SDValue Index,
	                               SDValue ScaleOp, SDValue Chain,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Only a constant scale can be lowered here.
	  ConstantSDNode *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  // Pieces of the memory reference.
	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  // One i1 mask bit per index element.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDValue NodeOps[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Prefetch = DAG.getMachineNode(Opc, dl, MVT::Other, NodeOps);
	  return SDValue(Prefetch, 0);
	}

	/// Lowers the builtin intrinsic that returns the value of an extended control
	/// register. Two values are appended to \p Results: the 64-bit register
	/// contents, then the output chain.
	static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget,
	                                       SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	  const bool Is64Bit = Subtarget.is64Bit();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // Operand 2 is the index of the XCR register to read; it is passed to the
	  // instruction in ECX.
	  SDValue InChain =
	      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
	  SDNode *XGetBV = DAG.getMachineNode(X86::XGETBV, DL, NodeTys, InChain);

	  // The XCR contents come back in EDX:EAX. On 64-bit subtargets the copies
	  // are made from the full-width RAX/RDX registers as i64 values.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo = DAG.getCopyFromReg(SDValue(XGetBV, 0), DL, LoReg, RegVT,
	                                  SDValue(XGetBV, 1));
	  SDValue Hi = DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, RegVT,
	                                  Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  SDValue Merged;
	  if (Is64Bit) {
	    // Combine the halves as (HI << 32) | LO.
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to join the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Lowers the builtin intrinsics that read performance monitor counters
	/// (x86_rdpmc). Two values are appended to \p Results: the 64-bit counter
	/// value, then the output chain.
	static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
	                                      SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget,
	                                      SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	  const bool Is64Bit = Subtarget.is64Bit();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // Operand 2 is the index of the performance counter to read; it is passed
	  // to the instruction in ECX.
	  SDValue InChain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
	                                     N->getOperand(2));
	  SDValue RdPmc = DAG.getNode(X86ISD::RDPMC_DAG, DL, NodeTys, InChain);

	  // The counter comes back in EDX:EAX. On 64-bit subtargets the copies are
	  // made from the full-width RAX/RDX registers as i64 values.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo = DAG.getCopyFromReg(RdPmc, DL, LoReg, RegVT, RdPmc.getValue(1));
	  SDValue Hi = DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, RegVT,
	                                  Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  SDValue Merged;
	  if (Is64Bit) {
	    // EAX holds the low-order 32 bits and EDX the supported high-order bits
	    // of the counter; combine them as (HI << 32) | LO.
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to join the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Lowers the builtin intrinsics that read the time stamp counter (x86_rdtsc
	/// and x86_rdtscp); also used to custom lower READCYCLECOUNTER nodes. Two
	/// values are appended to \p Results: the 64-bit counter value, then the
	/// output chain. For X86ISD::RDTSCP_DAG the value of ECX is additionally
	/// stored to the address given by operand 2 of \p N.
	static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
	                                    SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    SmallVectorImpl<SDValue> &Results) {
	  const bool Is64Bit = Subtarget.is64Bit();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue ReadTSC = DAG.getNode(Opcode, DL, NodeTys, N->getOperand(0));

	  // The processor's time-stamp counter (a 64-bit MSR) is returned in EDX:EAX:
	  // high-order 32 bits in EDX, low-order 32 bits in EAX. On 64-bit subtargets
	  // the copies are made from the full-width RAX/RDX registers as i64 values.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo =
	      DAG.getCopyFromReg(ReadTSC, DL, LoReg, RegVT, ReadTSC.getValue(1));
	  SDValue Hi = DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, RegVT,
	                                  Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  if (Opcode == X86ISD::RDTSCP_DAG) {
	    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

	    // RDTSCP also loads the IA32:TSC_AUX_MSR (address C000_0103H) into ECX,
	    // so read ECX explicitly on the chain.
	    SDValue AuxValue = DAG.getCopyFromReg(OutChain, DL, X86::ECX, MVT::i32,
	                                          Hi.getValue(2));
	    // Store ECX to the location passed as input to the 'rdtscp' intrinsic.
	    OutChain = DAG.getStore(AuxValue.getValue(1), DL, AuxValue,
	                            N->getOperand(2), MachinePointerInfo());
	  }

	  SDValue Merged;
	  if (Is64Bit) {
	    // Combine the halves as (HI << 32) | LO.
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to join the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Custom lowering for READCYCLECOUNTER: reads the time stamp counter with
	/// X86ISD::RDTSC_DAG and returns the produced values merged into one node.
	static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  // Filled in by getReadTimeStampCounter.
	  SmallVector<SDValue, 2> CounterAndChain;
	  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                          CounterAndChain);
	  return DAG.getMergeValues(CounterAndChain, DL);
	}

	/// Records the frame index of the EH registration node (operand 2 of \p Op)
	/// in the function's WinEHFuncInfo. No DAG nodes are created; the incoming
	/// chain (operand 0) is returned unchanged.
	///
	/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
	/// is not a FrameIndexSDNode.
	static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue RegNodeOp = Op.getOperand(2);

	  WinEHFuncInfo *FuncInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (FuncInfo == nullptr)
	    report_fatal_error("EH registrations only live in functions using WinEH");

	  // The operand must be a frame index; remember it.
	  FrameIndexSDNode *FrameIdx = dyn_cast<FrameIndexSDNode>(RegNodeOp);
	  if (FrameIdx == nullptr)
	    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
	  FuncInfo->EHRegNodeFrameIndex = FrameIdx->getIndex();

	  return InChain;
	}

	/// Records the frame index of the EH guard slot (operand 2 of \p Op) in the
	/// function's WinEHFuncInfo. No DAG nodes are created; the incoming chain
	/// (operand 0) is returned unchanged.
	///
	/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
	/// is not a FrameIndexSDNode.
	static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue GuardOp = Op.getOperand(2);

	  WinEHFuncInfo *FuncInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (FuncInfo == nullptr)
	    report_fatal_error("EHGuard only live in functions using WinEH");

	  // The operand must be a frame index; remember it.
	  FrameIndexSDNode *FrameIdx = dyn_cast<FrameIndexSDNode>(GuardOp);
	  if (FrameIdx == nullptr)
	    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
	  FuncInfo->EHGuardFrameIndex = FrameIdx->getIndex();

	  return InChain;
	}

	/// Emit a truncating store with saturation: a TruncSStoreSDNode when
	/// \p SignedSat is true, a TruncUSStoreSDNode otherwise. The node's operands
	/// are the chain, the value, the pointer and an undef of the pointer's type.
	static SDValue
	EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
	                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
	                SelectionDAG &DAG) {
	  SDVTList ResultTys = DAG.getVTList(MVT::Other);
	  SDValue UndefOp = DAG.getUNDEF(Ptr.getValueType());
	  SDValue StoreOps[] = { Chain, Val, Ptr, UndefOp };

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ResultTys, StoreOps, Dl,
	                                                     MemVT, MMO);
	  return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ResultTys, StoreOps, Dl,
	                                                    MemVT, MMO);
	}

	/// Emit a masked truncating store with saturation: a MaskedTruncSStoreSDNode
	/// when \p SignedSat is true, a MaskedTruncUSStoreSDNode otherwise. The
	/// node's operands are the chain, the pointer, the mask and the value.
	static SDValue
	EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
	                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
	                      MachineMemOperand *MMO, SelectionDAG &DAG) {
	  SDVTList ResultTys = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = { Chain, Ptr, Mask, Val };

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(ResultTys, StoreOps,
	                                                           Dl, MemVT, MMO);
	  return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(ResultTys, StoreOps,
	                                                          Dl, MemVT, MMO);
	}

	static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

	const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
	if (!IntrData) {
	switch (IntNo) {
	case llvm::Intrinsic::x86_seh_ehregnode:
	return MarkEHRegistrationNode(Op, DAG);
	case llvm::Intrinsic::x86_seh_ehguard:
	return MarkEHGuard(Op, DAG);
	case llvm::Intrinsic::x86_flags_read_u32:
	case llvm::Intrinsic::x86_flags_read_u64:
	case llvm::Intrinsic::x86_flags_write_u32:
	case llvm::Intrinsic::x86_flags_write_u64: {
	// We need a frame pointer because this will get lowered to a PUSH/POP
	// sequence.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setHasCopyImplyingStackAdjustment(true);
	// Don't do anything here, we will expand these intrinsics out later
	// during ExpandISelPseudos in EmitInstrWithCustomInserter.
	return SDValue();
	}
	case Intrinsic::x86_lwpins32:
	case Intrinsic::x86_lwpins64: {
	SDLoc dl(Op);
	SDValue Chain = Op->getOperand(0);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	SDValue LwpIns =
	DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
	Op->getOperand(3), Op->getOperand(4));
	SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
	SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
	LwpIns.getValue(1));
	}
	}
	return SDValue();
	}

	SDLoc dl(Op);
	switch(IntrData->Type) {
	default: llvm_unreachable("Unknown Intrinsic Type");
	case RDSEED:
	case RDRAND: {
	// Emit the node with the right value type.
	SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
	SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
	// Otherwise return the value from Rand, which is always 0, casted to i32.
	SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
	DAG.getConstant(1, dl, Op->getValueType(1)),
	DAG.getConstant(X86::COND_B, dl, MVT::i8),
	SDValue(Result.getNode(), 1) };
	SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

	// Return { result, isValid, chain }.
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
	SDValue(Result.getNode(), 2));
	}
	case GATHER_AVX2: {
	SDValue Chain = Op.getOperand(0);
	SDValue Src = Op.getOperand(2);
	SDValue Base = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	Scale, Chain, Subtarget);
	}
	case GATHER: {
	//gather(v1, mask, index, base, scale);
	SDValue Chain = Op.getOperand(0);
	SDValue Src = Op.getOperand(2);
	SDValue Base = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
	Chain, Subtarget);
	}
	case SCATTER: {
	//scatter(base, mask, index, v1, scale);
	SDValue Chain = Op.getOperand(0);
	SDValue Base = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Src = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	Scale, Chain, Subtarget);
	}
	case PREFETCH: {
	SDValue Hint = Op.getOperand(6);
	unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
	assert((HintVal == 2 \|\| HintVal == 3) &&
	"Wrong prefetch hint in intrinsic: should be 2 or 3");
	unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
	SDValue Chain = Op.getOperand(0);
	SDValue Mask = Op.getOperand(2);
	SDValue Index = Op.getOperand(3);
	SDValue Base = Op.getOperand(4);
	SDValue Scale = Op.getOperand(5);
	return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
	Subtarget);
	}
	// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
	case RDTSC: {
	SmallVector<SDValue, 2> Results;
	getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
	Results);
	return DAG.getMergeValues(Results, dl);
	}
	// Read Performance Monitoring Counters.
	case RDPMC: {
	SmallVector<SDValue, 2> Results;
	getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
	return DAG.getMergeValues(Results, dl);
	}
	// Get Extended Control Register.
	case XGETBV: {
	SmallVector<SDValue, 2> Results;
	getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
	return DAG.getMergeValues(Results, dl);
	}
	// XTEST intrinsics.
	case XTEST: {
	SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
	SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
	Ret, SDValue(InTrans.getNode(), 1));
	}
	// ADC/ADCX/SBB
	case ADX: {
	SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
	SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
	SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
	DAG.getConstant(-1, dl, MVT::i8));
	SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
	Op.getOperand(4), GenCF.getValue(1));
	SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
	Op.getOperand(5), MachinePointerInfo());
	SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
	SDValue Results[] = { SetCC, Store };
	return DAG.getMergeValues(Results, dl);
	}
	case COMPRESS_TO_MEM: {
	SDValue Mask = Op.getOperand(4);
	SDValue DataToCompress = Op.getOperand(3);
	SDValue Addr = Op.getOperand(2);
	SDValue Chain = Op.getOperand(0);
	MVT VT = DataToCompress.getSimpleValueType();

	MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	assert(MemIntr && "Expected MemIntrinsicSDNode!");

	if (isAllOnesConstant(Mask)) // return just a store
	return DAG.getStore(Chain, dl, DataToCompress, Addr,
	MemIntr->getMemOperand());

	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
	MemIntr->getMemOperand(),
	false /* truncating /, true / compressing */);
	}
	case TRUNCATE_TO_MEM_VI8:
	case TRUNCATE_TO_MEM_VI16:
	case TRUNCATE_TO_MEM_VI32: {
	SDValue Mask = Op.getOperand(4);
	SDValue DataToTruncate = Op.getOperand(3);
	SDValue Addr = Op.getOperand(2);
	SDValue Chain = Op.getOperand(0);

	MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	assert(MemIntr && "Expected MemIntrinsicSDNode!");

	EVT MemVT = MemIntr->getMemoryVT();

	uint16_t TruncationOp = IntrData->Opc0;
	switch (TruncationOp) {
	case X86ISD::VTRUNC: {
	if (isAllOnesConstant(Mask)) // return just a truncate store
	return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
	MemIntr->getMemOperand());

	MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
	MemIntr->getMemOperand(), true /* truncating */);
	}
	case X86ISD::VTRUNCUS:
	case X86ISD::VTRUNCS: {
	bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
	if (isAllOnesConstant(Mask))
	return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
	MemIntr->getMemOperand(), DAG);

	MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
	VMask, MemVT, MemIntr->getMemOperand(), DAG);
	}
	default:
	llvm_unreachable("Unsupported truncstore intrinsic");
	}
	}

	case EXPAND_FROM_MEM: {
	SDValue Mask = Op.getOperand(4);
	SDValue PassThru = Op.getOperand(3);
	SDValue Addr = Op.getOperand(2);
	SDValue Chain = Op.getOperand(0);
	MVT VT = Op.getSimpleValueType();

	MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	assert(MemIntr && "Expected MemIntrinsicSDNode!");

	if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
	return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
	if (X86::isZeroNode(Mask))
	return DAG.getUNDEF(VT);

	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
	MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
	true /* expanding */);
	}
	}
	}

	/// Lower a return-address query whose frame depth is the constant held in
	/// operand 0 of \p Op.
	///
	/// The function's frame info is always marked as having its return address
	/// taken. Depth 0 loads through the value returned by
	/// getReturnAddressFrameIndex; a non-zero depth loads from
	/// LowerFRAMEADDR(Op) plus one slot size. An empty SDValue is returned when
	/// verifyReturnAddressArgumentIsConstant returns true.
	SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  const unsigned FrameDepth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc Loc(Op);
	  EVT PointerVT = getPointerTy(DAG.getDataLayout());

	  // Depth 0: just load the return address through its frame index.
	  if (FrameDepth == 0) {
	    SDValue ReturnSlot = getReturnAddressFrameIndex(DAG);
	    return DAG.getLoad(PointerVT, Loc, DAG.getEntryNode(), ReturnSlot,
	                       MachinePointerInfo());
	  }

	  // Deeper frame: the load address is the frame address plus one slot.
	  SDValue FrameBase = LowerFRAMEADDR(Op, DAG);
	  const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
	  SDValue SlotOffset = DAG.getConstant(RI->getSlotSize(), Loc, PointerVT);
	  SDValue ReturnSlot =
	      DAG.getNode(ISD::ADD, Loc, PointerVT, FrameBase, SlotOffset);
	  return DAG.getLoad(PointerVT, Loc, DAG.getEntryNode(), ReturnSlot,
	                     MachinePointerInfo());
	}

	/// Mark the function's return address as taken and return the value produced
	/// by getReturnAddressFrameIndex. \p Op itself is not inspected.
	SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setReturnAddressIsTaken(true);

	  SDValue ReturnSlot = getReturnAddressFrameIndex(DAG);
	  return ReturnSlot;
	}

	/// Lower a frame-address query for the frame at the constant depth held in
	/// operand 0 of \p Op.
	///
	/// The function's frame info is always marked as having its frame address
	/// taken. When the target uses Windows CFI the depth is ignored and a frame
	/// index (created on first use and cached in X86MachineFunctionInfo) is
	/// returned. Otherwise the pointer-sized frame register is copied out and
	/// dereferenced once per level of depth.
	SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  EVT VT = Op.getValueType();

	  MFI.setFrameAddressIsTaken(true);

	  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
	    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
	    // is not possible to crawl up the stack without looking at the unwind codes
	    // simultaneously.
	    int FrameAddrIndex = FuncInfo->getFAIndex();
	    if (!FrameAddrIndex) {
	      // Set up a frame object for the frame address and remember its index so
	      // later queries reuse it.
	      unsigned SlotSize = RegInfo->getSlotSize();
	      FrameAddrIndex = MFI.CreateFixedObject(
	          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
	      FuncInfo->setFAIndex(FrameAddrIndex);
	    }
	    return DAG.getFrameIndex(FrameAddrIndex, VT);
	  }

	  unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
	  SDLoc dl(Op); // FIXME probably not meaningful
	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
	          (FrameReg == X86::EBP && VT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	  // Each level of depth loads through the current frame address.
	  while (Depth--)
	    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	                            MachinePointerInfo());
	  return FrameAddr;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	/// Translate a register name into an X86 register number.
	///
	/// Only "esp", "rsp", "ebp" and "rbp" are recognised. Asking for ebp/rbp in a
	/// function without a frame pointer is a fatal error, as is any unrecognised
	/// name. \p VT is not consulted.
	unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  const MachineFunction &MF = DAG.getMachineFunction();

	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                     .Case("esp", X86::ESP)
	                     .Case("rsp", X86::RSP)
	                     .Case("ebp", X86::EBP)
	                     .Case("rbp", X86::RBP)
	                     .Default(0);

	  if (Reg == X86::EBP || Reg == X86::RBP) {
	    if (!TFI.hasFP(MF))
	      report_fatal_error("register " + StringRef(RegName) +
	                         " is allocatable: function has no frame pointer");
	#ifndef NDEBUG
	    else {
	      // Debug-only sanity check on the register reported as frame register.
	      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	      unsigned FrameReg =
	          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
	             "Invalid Frame Register!");
	    }
	#endif
	  }

	  if (Reg)
	    return Reg;

	  // The StringSwitch default (0) means the name was not recognised.
	  report_fatal_error("Invalid register name global variable");
	}

	/// Return an intptr constant equal to twice the register slot size.
	SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  const unsigned SlotBytes = Subtarget.getRegisterInfo()->getSlotSize();
	  return DAG.getIntPtrConstant(2 * SlotBytes, Loc);
	}

	/// Pick the exception pointer register for \p PersonalityFn: RDX/EDX for the
	/// CoreCLR personality, RAX/EAX otherwise. The 64-bit register is used when
	/// the subtarget reports isTarget64BitLP64().
	unsigned X86TargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  const bool IsCoreCLR =
	      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  unsigned Reg;
	  if (IsCoreCLR)
	    Reg = IsLP64 ? X86::RDX : X86::EDX;
	  else
	    Reg = IsLP64 ? X86::RAX : X86::EAX;
	  return Reg;
	}

	/// Return the exception selector register: RDX when the subtarget reports
	/// isTarget64BitLP64(), EDX otherwise.
	unsigned X86TargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  // Funclet personalities have no selector; the runtime does the selection,
	  // so this must never be asked for one of them.
	  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));

	  if (Subtarget.isTarget64BitLP64())
	    return X86::RDX;
	  return X86::EDX;
	}

	/// Fixed catch objects are needed exactly when the subtarget is Win64.
	bool X86TargetLowering::needsFixedCatchObjects() const {
	  const bool IsWin64 = Subtarget.isTargetWin64();
	  return IsWin64;
	}

	/// Lower EH_RETURN.
	///
	/// Operands of \p Op: 0 = chain, 1 = offset, 2 = handler. The handler is
	/// stored at (frame register + slot size + offset), that same address is
	/// copied into RCX/ECX (chosen by pointer width), and an X86ISD::EH_RETURN
	/// node taking the chain and that register is returned.
	SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Offset = Op.getOperand(1);
	  SDValue Handler = Op.getOperand(2);
	  SDLoc dl(Op);

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
	  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
	          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
	  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

	  // StoreAddr = Frame + SlotSize + Offset.
	  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
	                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
	                                                        dl));
	  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
	  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
	  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

	  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
	                     DAG.getRegister(StoreAddrReg, PtrVT));
	}

	/// Build an X86ISD::EH_SJLJ_SETJMP node with result types (i32, Other) from
	/// the first two operands of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc DL(Op);

	  // On a non-64-bit subtarget the global base register may be needed once the
	  // isel pseudo is expanded, i.e. after the CGBR pass has already run.
	  // Requesting it here lets that pass insert the defining code in case it is
	  // needed; otherwise a virtual register could be referenced without ever
	  // being defined.
	  const bool Is64Bit = Subtarget.is64Bit();
	  if (!Is64Bit)
	    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(
	        &DAG.getMachineFunction());

	  SDVTList ResultTypes = DAG.getVTList(MVT::i32, MVT::Other);
	  SDValue In0 = Op.getOperand(0);
	  SDValue In1 = Op.getOperand(1);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultTypes, In0, In1);
	}

	/// Build an X86ISD::EH_SJLJ_LONGJMP node (result type Other) from the first
	/// two operands of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue In0 = Op.getOperand(0);
	  SDValue In1 = Op.getOperand(1);
	  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, In0, In1);
	}

	/// Build an X86ISD::EH_SJLJ_SETUP_DISPATCH node (result type Other) from the
	/// first operand of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue In0 = Op.getOperand(0);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, In0);
	}

	/// No adjustment is performed: the first operand of \p Op is returned as is.
	/// \p DAG is unused.
	static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
	  SDValue Unadjusted = Op.getOperand(0);
	  return Unadjusted;
	}

	/// Lower INIT_TRAMPOLINE by emitting the stores that write the trampoline's
	/// machine code.
	///
	/// Operands of \p Op: 0 = chain, 1 = trampoline address, 2 = nested function,
	/// 3 = 'nest' parameter value, 4 = source value for the trampoline address,
	/// 5 = source value naming the nested Function (read on 32-bit only).
	///
	/// 64-bit layout (byte offsets from the trampoline address):
	///    0: i16 opcode word for "movabsq r11"    2: nested function pointer
	///   10: i16 opcode word for "movabsq r10"   12: 'nest' value
	///   20: i16 opcode word for "jmpq *..."     22: i8 ModRM selecting r11
	///
	/// 32-bit layout:
	///    0: i8 MOV32ri opcode | nest register    1: 'nest' value
	///    5: i8 jmp opcode (0xE9)                 6: FPtr - (Trmp + 10)
	///
	/// All stores hang off the incoming chain and are joined by a TokenFactor.
	SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Root = Op.getOperand(0);
	  SDValue Trmp = Op.getOperand(1); // trampoline
	  SDValue FPtr = Op.getOperand(2); // nested function
	  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	  SDLoc dl(Op);

	  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  if (Subtarget.is64Bit()) {
	    SDValue OutChains[6];

	    // Large code-model.
	    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
	    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

	    // Only the low three encoding bits go into the opcode / ModRM byte.
	    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
	    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

	    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

	    // Load the pointer to the nested function into R11.
	    // The i16 constant holds the REX prefix in its low byte and the opcode in
	    // its high byte.
	    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
	    SDValue Addr = Trmp;
	    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(2, dl, MVT::i64));
	    OutChains[1] =
	        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
	                     /* Alignment = */ 2);

	    // Load the 'nest' parameter value into R10.
	    // R10 is specified in X86CallingConv.td
	    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(10, dl, MVT::i64));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 10));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(12, dl, MVT::i64));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
	                     /* Alignment = */ 2);

	    // Jump to the nested function.
	    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(20, dl, MVT::i64));
	    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 20));

	    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(22, dl, MVT::i64));
	    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 22));

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  } else {
	    const Function *Func =
	        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
	    CallingConv::ID CC = Func->getCallingConv();
	    unsigned NestReg;

	    switch (CC) {
	    default:
	      llvm_unreachable("Unsupported calling convention");
	    case CallingConv::C:
	    case CallingConv::X86_StdCall: {
	      // Pass 'nest' parameter in ECX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::ECX;

	      // Check that ECX wasn't needed by an 'inreg' parameter.
	      FunctionType *FTy = Func->getFunctionType();
	      const AttributeList &Attrs = Func->getAttributes();

	      if (!Attrs.isEmpty() && !Func->isVarArg()) {
	        unsigned InRegCount = 0;
	        unsigned Idx = 1;

	        // Count the 32-bit registers taken by 'inreg' parameters; Idx tracks
	        // the attribute index, which starts at 1 for the first parameter.
	        for (FunctionType::param_iterator I = FTy->param_begin(),
	             E = FTy->param_end(); I != E; ++I, ++Idx)
	          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
	            auto &DL = DAG.getDataLayout();
	            // FIXME: should only count parameters that are lowered to integers.
	            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
	          }

	        if (InRegCount > 2) {
	          report_fatal_error("Nest register in use - reduce number of inreg"
	                             " parameters!");
	        }
	      }
	      break;
	    }
	    case CallingConv::X86_FastCall:
	    case CallingConv::X86_ThisCall:
	    case CallingConv::Fast:
	      // Pass 'nest' parameter in EAX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::EAX;
	      break;
	    }

	    SDValue OutChains[4];
	    SDValue Addr, Disp;

	    // The jmp displacement is taken relative to Trmp + 10.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(10, dl, MVT::i32));
	    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

	    // This is storing the opcode for MOV32ri.
	    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
	    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
	    OutChains[0] =
	        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
	                     Trmp, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(1, dl, MVT::i32));
	    OutChains[1] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
	                     /* Alignment = */ 1);

	    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(5, dl, MVT::i32));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 5),
	                                /* Alignment = */ 1);

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(6, dl, MVT::i32));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
	                     /* Alignment = */ 1);

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  }
	}

	/// Lower FLT_ROUNDS_ by storing the FP control word to a fresh stack slot,
	/// loading it back as an i16 and remapping its rounding field.
	///
	/// The rounding mode is in bits 11:10 of FPSR, and has the following
	/// settings:
	///   00 Round to nearest
	///   01 Round to -inf
	///   10 Round to +inf
	///   11 Round to 0
	///
	/// FLT_ROUNDS, on the other hand, expects the following:
	///   -1 Undefined
	///    0 Round to 0
	///    1 Round to nearest
	///    2 Round to +inf
	///    3 Round to -inf
	///
	/// To perform the conversion, we do:
	///   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MVT ResultVT = Op.getSimpleValueType();
	  MachineFunction &MF = DAG.getMachineFunction();
	  const TargetFrameLowering &FrameLowering = *Subtarget.getFrameLowering();

	  // Reserve a 2-byte stack slot that will receive the control word.
	  unsigned SlotAlign = FrameLowering.getStackAlignment();
	  int SlotFI = MF.getFrameInfo().CreateStackObject(2, SlotAlign, false);
	  SDValue Slot = DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));

	  // Store the FP control word into the slot.
	  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(MF, SlotFI),
	      MachineMemOperand::MOStore, 2, 2);
	  SDValue StoreOps[] = { DAG.getEntryNode(), Slot };
	  SDValue StoreChain =
	      DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other),
	                              StoreOps, MVT::i16, StoreMMO);

	  // Read it back, ordered after the store through its chain.
	  SDValue ControlWord =
	      DAG.getLoad(MVT::i16, DL, StoreChain, Slot, MachinePointerInfo());

	  // Bit 11 moves down to bit 0 and bit 10 moves down to bit 1.
	  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, ControlWord,
	                              DAG.getConstant(0x800, DL, MVT::i16));
	  SDValue NewBit0 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
	                                DAG.getConstant(11, DL, MVT::i8));
	  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, ControlWord,
	                              DAG.getConstant(0x400, DL, MVT::i16));
	  SDValue NewBit1 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
	                                DAG.getConstant(9, DL, MVT::i8));

	  // (swapped bits + 1) & 3 yields the FLT_ROUNDS encoding.
	  SDValue Swapped = DAG.getNode(ISD::OR, DL, MVT::i16, NewBit0, NewBit1);
	  SDValue Bumped = DAG.getNode(ISD::ADD, DL, MVT::i16, Swapped,
	                               DAG.getConstant(1, DL, MVT::i16));
	  SDValue Mode = DAG.getNode(ISD::AND, DL, MVT::i16, Bumped,
	                             DAG.getConstant(3, DL, MVT::i16));

	  // Resize the i16 result to the requested type.
	  unsigned ResizeOpc = ISD::ZERO_EXTEND;
	  if (ResultVT.getSizeInBits() < 16)
	    ResizeOpc = ISD::TRUNCATE;
	  return DAG.getNode(ResizeOpc, DL, ResultVT, Mode);
	}

	// Split a unary integer vector op into two half-sized ops of the same opcode
	// and concatenate their results back to the original type.
	static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT FullVT = Op.getSimpleValueType();
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfElts = FullVT.getVectorNumElements() / 2;
	  const unsigned HalfBits = FullVT.getSizeInBits() / 2;
	  const MVT HalfVT = MVT::getVectorVT(FullVT.getVectorElementType(), HalfElts);

	  // Pull the low and high halves out of the single input operand.
	  SDValue Input = Op.getOperand(0);
	  SDValue LoIn = extractSubVector(Input, 0, DAG, dl, HalfBits);
	  SDValue HiIn = extractSubVector(Input, HalfElts, DAG, dl, HalfBits);

	  // Apply the op to each half, then glue the halves together again.
	  SDValue LoOut = DAG.getNode(Opcode, dl, HalfVT, LoIn);
	  SDValue HiOut = DAG.getNode(Opcode, dl, HalfVT, HiIn);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, FullVT, LoOut, HiOut);
	}

	// Split a 256-bit integer unary op into two 128-bit ops via
	// LowerVectorIntUnary. The type requirement is checked by assertion only.
	static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  (void)VT; // Only read by the assert above.
	  return LowerVectorIntUnary(Op, DAG);
	}

	// Split a 512-bit integer unary op into two 256-bit ops via
	// LowerVectorIntUnary. The type requirement is checked by assertion only.
	static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Only handle AVX 512-bit vector integer operation");
	  (void)VT; // Only read by the assert above.
	  return LowerVectorIntUnary(Op, DAG);
	}

	/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
	//
	// i8/i16 vector implemented using dword LZCNT vector instruction
	// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
	// split the vector, perform the operation on its Lo and Hi parts and
	// concatenate the results.
	static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::CTLZ);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElems = VT.getVectorNumElements();

	  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
	         "Unsupported element type");

	  // Split vector, its Lo and Hi parts will be handled in next iteration.
	  if (16 < NumElems)
	    return LowerVectorIntUnary(Op, DAG);

	  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
	         "Unsupported value type for operation");

	  // Use native supported vector instruction vplzcntd.
	  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
	  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
	  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
	  // The zero extension added (32 - element width) leading zeros to every
	  // element; subtract them again after truncating back.
	  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

	  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
	}

	// Lower CTLZ using a PSHUFB lookup table implementation.
	//
	// The input is treated as a vector of bytes. Each byte's leading-zero count
	// is built from per-nibble table lookups, and the per-byte counts are then
	// merged pairwise, doubling the element width until it matches the type of
	// \p Op.
	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
	  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

	  // Per-nibble leading zero PSHUFB lookup table: LUT[n] is the number of
	  // leading zero bits in the 4-bit value n.
	  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
	                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
	                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

	  // Repeat the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumBytes; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

	  // Begin by bitcasting the input to byte vector, then split those bytes
	  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
	  // If the hi input nibble is zero then we add both results together, otherwise
	  // we just take the hi result (by masking the lo result to zero before the
	  // add).
	  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
	  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

	  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
	  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
	  SDValue HiZ;
	  if (CurrVT.is512BitVector()) {
	    // 512-bit compares produce an i1 mask vector; sign extend it back to a
	    // full-width byte mask.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
	    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	  } else {
	    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
	  }

	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
	  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
	  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

	  // Merge result back from vXi8 back to VT, working on the lo/hi halves
	  // of the current vector width in the same way we did for the nibbles.
	  // If the upper half of the input element is zero then add the halves'
	  // leading zero counts together, otherwise just use the upper half's.
	  // Double the width of the result until we are at target width.
	  while (CurrVT != VT) {
	    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
	    int CurrNumElts = CurrVT.getVectorNumElements();
	    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
	    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
	    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

	    // Check if the upper half of the input element is zero.
	    if (CurrVT.is512BitVector()) {
	      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	    } else {
	      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	    }
	    HiZ = DAG.getBitcast(NextVT, HiZ);

	    // Move the upper/lower halves to the lower bits as we'll be extending to
	    // NextVT. Mask the lower result to zero if HiZ is true and add the results
	    // together.
	    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
	    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
	    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
	    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
	    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
	    CurrVT = NextVT;
	  }

	  return Res;
	}

	/// Choose a lowering strategy for a vector CTLZ, in priority order:
	///   1. the AVX512CDI path when the subtarget has CDI;
	///   2. splitting 256-bit vectors in half when Int256 is unavailable;
	///   3. splitting 512-bit vectors in half when BWI is unavailable;
	///   4. the in-register PSHUFB lookup table (asserts SSSE3).
	static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
	                               const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  if (Subtarget.hasCDI())
	    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

	  const MVT VT = Op.getSimpleValueType();

	  // 256-bit ops become two 128-bit ops.
	  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
	  if (Split256)
	    return Lower256IntUnary(Op, DAG);

	  // 512-bit ops become two 256-bit ops.
	  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
	  if (Split512)
	    return Lower512IntUnary(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
	  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
	}

	/// Lower a count-leading-zeros node. Vector types are forwarded to
	/// LowerVectorCTLZ; scalars are lowered to a BSR followed by an XOR with
	/// (width - 1). Only the ISD::CTLZ opcode gets the extra CMOV that handles a
	/// zero input.
	static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT VT = Op.getSimpleValueType();

	  if (VT.isVector())
	    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

	  const unsigned Opcode = Op.getOpcode();
	  const unsigned BitWidth = VT.getSizeInBits();
	  const bool IsByte = VT == MVT::i8;

	  // There is no i8 bsr, so an i8 input is zero extended to i32 first.
	  const MVT ScanVT = IsByte ? MVT(MVT::i32) : VT;
	  SDValue Src = Op.getOperand(0);
	  if (IsByte)
	    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, ScanVT, Src);

	  // bsr scans bits in reverse; its second (i32) result is EFLAGS.
	  SDValue Scan =
	      DAG.getNode(X86ISD::BSR, dl, DAG.getVTList(ScanVT, MVT::i32), Src);
	  SDValue Result = Scan;

	  if (Opcode == ISD::CTLZ) {
	    // When the source is zero (bsr sets ZF) select 2*width-1 instead, which
	    // the xor below turns into width.
	    SDValue CMovOps[] = {
	      Scan,
	      DAG.getConstant(BitWidth + BitWidth - 1, dl, ScanVT),
	      DAG.getConstant(X86::COND_E, dl, MVT::i8),
	      Scan.getValue(1)
	    };
	    Result = DAG.getNode(X86ISD::CMOV, dl, ScanVT, CMovOps);
	  }

	  // Convert the bit index into a leading-zero count: xor with width-1.
	  Result = DAG.getNode(ISD::XOR, dl, ScanVT, Result,
	                       DAG.getConstant(BitWidth - 1, dl, ScanVT));

	  if (IsByte)
	    Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Result);
	  return Result;
	}

	/// Lower a count-trailing-zeros node.
	///
	/// Scalars (ISD::CTTZ only, asserted) become a BSF plus a CMOV that selects
	/// the bit width for a zero input. Vectors isolate the lowest set bit and
	/// then use either CTLZ (for CTTZ_ZERO_UNDEF) or CTPOP.
	static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT VT = Op.getSimpleValueType();
	  const unsigned EltBits = VT.getScalarSizeInBits();

	  if (!VT.isVector()) {
	    assert(Op.getOpcode() == ISD::CTTZ &&
	           "Only scalar CTTZ requires custom lowering");

	    // bsf scans bits forward; its second (i32) result is EFLAGS.
	    SDValue Scan = DAG.getNode(X86ISD::BSF, dl, DAG.getVTList(VT, MVT::i32),
	                               Op.getOperand(0));

	    // If the source is zero (bsf sets ZF) the result is the bit width.
	    SDValue CMovOps[] = {
	      Scan,
	      DAG.getConstant(EltBits, dl, VT),
	      DAG.getConstant(X86::COND_E, dl, MVT::i8),
	      Scan.getValue(1)
	    };
	    return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
	  }

	  SDValue Src = Op.getOperand(0);
	  SDValue AllZero = DAG.getConstant(0, dl, VT);

	  // lsb(x) = (x & -x)
	  SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, AllZero, Src);
	  SDValue LowestBit = DAG.getNode(ISD::AND, dl, VT, Src, Negated);

	  // cttz_undef(x) = (width - 1) - ctlz(lsb)
	  if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
	    SDValue TopIndex = DAG.getConstant(EltBits - 1, dl, VT);
	    SDValue Leading = DAG.getNode(ISD::CTLZ, dl, VT, LowestBit);
	    return DAG.getNode(ISD::SUB, dl, VT, TopIndex, Leading);
	  }

	  // cttz(x) = ctpop(lsb - 1)
	  SDValue AllOne = DAG.getConstant(1, dl, VT);
	  SDValue BelowLowest = DAG.getNode(ISD::SUB, dl, VT, LowestBit, AllOne);
	  return DAG.getNode(ISD::CTPOP, dl, VT, BelowLowest);
	}

	/// Split a 256-bit binary integer operation into two 128-bit operations of
	/// the same opcode and concatenate the two results.
	static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
	  const MVT WideVT = Op.getSimpleValueType();
	  assert(WideVT.is256BitVector() && WideVT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfElts = WideVT.getVectorNumElements() / 2;
	  const MVT HalfVT = MVT::getVectorVT(WideVT.getVectorElementType(), HalfElts);

	  // Low and high 128-bit halves of the left-hand operand.
	  SDValue Left = Op.getOperand(0);
	  SDValue LeftLo = extract128BitVector(Left, 0, DAG, dl);
	  SDValue LeftHi = extract128BitVector(Left, HalfElts, DAG, dl);

	  // Low and high 128-bit halves of the right-hand operand.
	  SDValue Right = Op.getOperand(1);
	  SDValue RightLo = extract128BitVector(Right, 0, DAG, dl);
	  SDValue RightHi = extract128BitVector(Right, HalfElts, DAG, dl);

	  SDValue ResLo = DAG.getNode(Opcode, dl, HalfVT, LeftLo, RightLo);
	  SDValue ResHi = DAG.getNode(Opcode, dl, HalfVT, LeftHi, RightHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, ResLo, ResHi);
	}

	/// Split a 512-bit binary integer operation into two 256-bit operations of
	/// the same opcode and concatenate the two results.
	static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
	  const MVT WideVT = Op.getSimpleValueType();
	  assert(WideVT.is512BitVector() && WideVT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfElts = WideVT.getVectorNumElements() / 2;
	  const MVT HalfVT = MVT::getVectorVT(WideVT.getVectorElementType(), HalfElts);

	  // Low and high 256-bit halves of the left-hand operand.
	  SDValue Left = Op.getOperand(0);
	  SDValue LeftLo = extract256BitVector(Left, 0, DAG, dl);
	  SDValue LeftHi = extract256BitVector(Left, HalfElts, DAG, dl);

	  // Low and high 256-bit halves of the right-hand operand.
	  SDValue Right = Op.getOperand(1);
	  SDValue RightLo = extract256BitVector(Right, 0, DAG, dl);
	  SDValue RightHi = extract256BitVector(Right, HalfElts, DAG, dl);

	  SDValue ResLo = DAG.getNode(Opcode, dl, HalfVT, LeftLo, RightLo);
	  SDValue ResHi = DAG.getNode(Opcode, dl, HalfVT, LeftHi, RightHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, ResLo, ResHi);
	}

	/// Lower an add/sub node: types whose scalar type is i1 become an XOR of the
	/// two operands; anything else must be a 256-bit integer vector (asserted)
	/// and is split in half by Lower256IntArith.
	static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  const bool IsI1Based = VT.getScalarType() == MVT::i1;

	  if (!IsI1Based) {
	    assert(VT.is256BitVector() && VT.isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return Lower256IntArith(Op, DAG);
	  }

	  SDLoc dl(Op);
	  SDValue Left = Op.getOperand(0);
	  SDValue Right = Op.getOperand(1);
	  return DAG.getNode(ISD::XOR, dl, VT, Left, Right);
	}

	/// Lower an ABS node.
	///
	/// Scalar i16/i32/i64 inputs become a flag-producing negate (0 - x) and a
	/// CMOV with condition COND_GE over { x, -x }. Any other type must be a
	/// 256-bit integer vector (asserted) and is split in half by
	/// Lower256IntUnary.
	static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
	    // Since X86 does not have CMOV for 8-bit integer, we don't convert
	    // 8-bit integer abs to NEG and CMOV.
	    SDLoc DL(Op);
	    SDValue N0 = Op.getOperand(0);
	    // Result 0 of the X86ISD::SUB is the negated value; result 1 (i32) is the
	    // flags value consumed by the CMOV below.
	    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
	                              DAG.getConstant(0, DL, VT), N0);
	    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
	                     SDValue(Neg.getNode(), 1)};
	    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
	  }

	  assert(Op.getSimpleValueType().is256BitVector() &&
	         Op.getSimpleValueType().isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return Lower256IntUnary(Op, DAG);
	}

	/// Lower a min/max node on a 256-bit integer vector (asserted) by splitting
	/// it into two 128-bit operations with Lower256IntArith.
	static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  (void)VT; // Only read by the assert above.
	  return Lower256IntArith(Op, DAG);
	}

	/// Custom lowering for a vector integer multiply of Op's operands 0 and 1.
	///
	/// Cases handled, in order:
	///  - i1 element vectors: emitted as ISD::AND.
	///  - 256-bit types when !Subtarget.hasInt256(): forwarded to
	///    Lower256IntArith.
	///  - v16i8/v32i8/v64i8: sign-extended to i16 lanes, multiplied, narrowed.
	///  - v4i32: VPMADDWD when the upper 17 bits of both inputs are known zero,
	///    otherwise two PMULUDQ plus shuffles.
	///  - v2i64/v4i64/v8i64: PMULDQ, Op returned unchanged (DQI), or a sum of
	///    PMULUDQ partial products.
	static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();

	// A multiply of i1 elements is emitted as a bitwise AND.
	if (VT.getScalarType() == MVT::i1)
	return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

	// Decompose 256-bit ops into smaller 128-bit ops.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntArith(Op, DAG);

	SDValue A = Op.getOperand(0);
	SDValue B = Op.getOperand(1);

	// Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
	// vector pairs, multiply and truncate.
	if (VT == MVT::v16i8 \|\| VT == MVT::v32i8 \|\| VT == MVT::v64i8) {
	if (Subtarget.hasInt256()) {
	// For 512-bit vectors, split into 256-bit vectors to allow the
	// sign-extension to occur.
	if (VT == MVT::v64i8)
	return Lower512IntArith(Op, DAG);

	// For 256-bit vectors, split into 128-bit vectors to allow the
	// sign-extension to occur. We don't need this on AVX512BW as we can
	// safely sign-extend to v32i16.
	if (VT == MVT::v32i8 && !Subtarget.hasBWI())
	return Lower256IntArith(Op, DAG);

	// Widen every i8 lane to i16, multiply, then truncate back to VT.
	MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	return DAG.getNode(
	ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::MUL, dl, ExVT,
	DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
	DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
	}

	assert(VT == MVT::v16i8 &&
	"Pre-AVX2 support only supports v16i8 multiplication");
	MVT ExVT = MVT::v8i16;

	// Extract the lo parts and sign extend to i16
	SDValue ALo, BLo;
	if (Subtarget.hasSSE41()) {
	ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
	BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
	} else {
	// Without SSE4.1: move source bytes 0..7 into the odd byte slots (even
	// slots undef), reinterpret as v8i16 and arithmetic-shift right by 8.
	const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	-1, 4, -1, 5, -1, 6, -1, 7};
	ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	ALo = DAG.getBitcast(ExVT, ALo);
	BLo = DAG.getBitcast(ExVT, BLo);
	ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	}

	// Extract the hi parts and sign extend to i16
	SDValue AHi, BHi;
	if (Subtarget.hasSSE41()) {
	// Move bytes 8..15 down to positions 0..7 so the in-register extend
	// picks them up.
	const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
	-1, -1, -1, -1, -1, -1, -1, -1};
	AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
	BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
	} else {
	// Same shuffle-and-shift scheme as for the lo half, using bytes 8..15.
	const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
	-1, 12, -1, 13, -1, 14, -1, 15};
	AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	AHi = DAG.getBitcast(ExVT, AHi);
	BHi = DAG.getBitcast(ExVT, BHi);
	AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	}

	// Multiply, mask the lower 8bits of the lo/hi results and pack
	SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
	RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
	if (VT == MVT::v4i32) {
	assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
	"Should not custom lower when pmulld is available!");

	// If the upper 17 bits of each element are zero then we can use PMADD.
	APInt Mask17 = APInt::getHighBitsSet(32, 17);
	if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
	return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
	DAG.getBitcast(MVT::v8i16, A),
	DAG.getBitcast(MVT::v8i16, B));

	// Extract the odd parts.
	static const int UnpackMask[] = { 1, -1, 3, -1 };
	SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
	SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

	// Multiply the even parts.
	SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
	// Now multiply odd parts.
	SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

	Evens = DAG.getBitcast(VT, Evens);
	Odds = DAG.getBitcast(VT, Odds);

	// Merge the two vectors back together with a shuffle. This expands into 2
	// shuffles.
	static const int ShufMask[] = { 0, 4, 2, 6 };
	return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
	}

	assert((VT == MVT::v2i64 \|\| VT == MVT::v4i64 \|\| VT == MVT::v8i64) &&
	"Only know how to lower V2I64/V4I64/V8I64 multiply");

	// 32-bit vector types used for MULDQ/MULUDQ.
	MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

	// MULDQ returns the 64-bit result of the signed multiplication of the lower
	// 32-bits. We can lower with this if the sign bits stretch that far.
	if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
	DAG.ComputeNumSignBits(B) > 32) {
	return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
	DAG.getBitcast(MulVT, B));
	}

	// Ahi = psrlqi(a, 32);
	// Bhi = psrlqi(b, 32);
	//
	// AloBlo = pmuludq(a, b);
	// AloBhi = pmuludq(a, Bhi);
	// AhiBlo = pmuludq(Ahi, b);
	//
	// Hi = psllqi(AloBhi + AhiBlo, 32);
	// return AloBlo + Hi;
	// Record which 32-bit halves of each operand are known to be zero so the
	// matching partial products can be skipped below.
	APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
	bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
	bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

	APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
	bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
	bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

	// If DQI is supported we can use MULLQ, but MULUDQ is still better if
	// the high bits are known to be zero.
	// Returning Op itself leaves the multiply node untouched.
	if (Subtarget.hasDQI() && (!AHiIsZero \|\| !BHiIsZero))
	return Op;

	// Bit cast to 32-bit vectors for MULUDQ.
	SDValue Alo = DAG.getBitcast(MulVT, A);
	SDValue Blo = DAG.getBitcast(MulVT, B);

	SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

	// Only multiply lo/hi halves that aren't known to be zero.
	SDValue AloBlo = Zero;
	if (!ALoIsZero && !BLoIsZero)
	AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

	SDValue AloBhi = Zero;
	if (!ALoIsZero && !BHiIsZero) {
	// Shift B's upper 32 bits down into the lower half of each 64-bit lane.
	SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
	Bhi = DAG.getBitcast(MulVT, Bhi);
	AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
	}

	SDValue AhiBlo = Zero;
	if (!AHiIsZero && !BLoIsZero) {
	// Shift A's upper 32 bits down into the lower half of each 64-bit lane.
	SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
	Ahi = DAG.getBitcast(MulVT, Ahi);
	AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
	}

	// Sum the cross products, shift them up by 32 and add the low product.
	SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
	Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

	return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
	}

	/// Custom lowering for a vector multiply-high node. ISD::MULHU selects the
	/// unsigned (zero-extend / logical shift) variants; any other opcode uses the
	/// signed ones.
	///
	/// 256-bit types without Subtarget.hasInt256() are forwarded to
	/// Lower256IntArith and v64i8 to Lower512IntArith. The remaining i8 vector
	/// types are widened to i16 lanes, multiplied, shifted right by 8 and
	/// narrowed back to VT.
	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();

	// Decompose 256-bit ops into smaller 128-bit ops.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntArith(Op, DAG);

	// Only i8 vectors should need custom lowering after this.
	assert((VT == MVT::v16i8 \|\| (VT == MVT::v32i8 && Subtarget.hasInt256()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
	"Unsupported vector type");

	// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
	// logical shift down the upper half and pack back to i8.
	SDValue A = Op.getOperand(0);
	SDValue B = Op.getOperand(1);

	// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
	// and then ashr/lshr the upper bits down to the lower bits before multiply.
	unsigned Opcode = Op.getOpcode();
	unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
	unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

	// For 512-bit vectors, split into 256-bit vectors to allow the
	// sign-extension to occur.
	if (VT == MVT::v64i8)
	return Lower512IntArith(Op, DAG);

	// AVX2 implementations - extend xmm subvectors to ymm.
	if (Subtarget.hasInt256()) {
	unsigned NumElems = VT.getVectorNumElements();
	// Lo/Hi start out as subvector index constants (0 and NumElems / 2). The
	// v32i8 path below reuses the same variables for the shifted products.
	SDValue Lo = DAG.getIntPtrConstant(0, dl);
	SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

	if (VT == MVT::v32i8) {
	if (Subtarget.hasBWI()) {
	// With BWI: extend to v32i16, multiply, shift the high byte down and
	// truncate.
	SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
	SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
	SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
	Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
	DAG.getConstant(8, dl, MVT::v32i16));
	return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	}
	// Without BWI: handle each 128-bit half separately as v16i16.
	SDValue ALo = extract128BitVector(A, 0, DAG, dl);
	SDValue BLo = extract128BitVector(B, 0, DAG, dl);
	SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
	SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
	ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
	BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
	AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
	BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
	Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
	DAG.getConstant(8, dl, MVT::v16i16));
	Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
	DAG.getConstant(8, dl, MVT::v16i16));
	// The ymm variant of PACKUS treats the 128-bit lanes separately, so before
	// using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
	const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
	16, 17, 18, 19, 20, 21, 22, 23};
	const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
	24, 25, 26, 27, 28, 29, 30, 31};
	return DAG.getNode(X86ISD::PACKUS, dl, VT,
	DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
	DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
	}

	assert(VT == MVT::v16i8 && "Unexpected VT");

	// v16i8 with AVX2: extend to a single v16i16, multiply and shift the high
	// byte of each product down.
	SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
	SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
	SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
	Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
	DAG.getConstant(8, dl, MVT::v16i16));
	// If we have BWI we can use truncate instruction.
	if (Subtarget.hasBWI())
	return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	// Otherwise split the product into two v8i16 halves (Lo/Hi are still the
	// index constants here) and pack them.
	Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
	Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	assert(VT == MVT::v16i8 &&
	"Pre-AVX2 support only supports v16i8 multiplication");
	MVT ExVT = MVT::v8i16;
	unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

	// Extract the lo parts and zero/sign extend to i16.
	SDValue ALo, BLo;
	if (Subtarget.hasSSE41()) {
	ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
	BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
	} else {
	// Without SSE4.1: move source bytes 0..7 into the odd byte slots (even
	// slots undef), reinterpret as v8i16 and shift right by 8 with ExShift.
	const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	-1, 4, -1, 5, -1, 6, -1, 7};
	ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	ALo = DAG.getBitcast(ExVT, ALo);
	BLo = DAG.getBitcast(ExVT, BLo);
	ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	}

	// Extract the hi parts and zero/sign extend to i16.
	SDValue AHi, BHi;
	if (Subtarget.hasSSE41()) {
	// Move bytes 8..15 down to positions 0..7 before extending.
	const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
	-1, -1, -1, -1, -1, -1, -1, -1};
	AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
	BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
	} else {
	// Same shuffle-and-shift scheme as for the lo half, using bytes 8..15.
	const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
	-1, 12, -1, 13, -1, 14, -1, 15};
	AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	AHi = DAG.getBitcast(ExVT, AHi);
	BHi = DAG.getBitcast(ExVT, BHi);
	AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	}

	// Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
	// pack back to v16i8.
	SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
	RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	/// Lowers a 128-bit integer division/remainder node on Win64 to a runtime
	/// library call.
	///
	/// Each operand is stored to its own 16-byte aligned stack temporary and the
	/// callee is handed a pointer to that slot. The call is declared as returning
	/// v2i64 and the result is bitcast back to the node's value type. Only the
	/// first element of the pair returned by LowerCallTo is used.
	///
	/// \param Op  An SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM node with a 128-bit
	///            integer type; any other opcode is llvm_unreachable.
	/// \param DAG The SelectionDAG the replacement nodes are created in.
	/// \returns The call result bitcast to Op's value type.
	SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.isTargetWin64() && "Unexpected target");
	  EVT VT = Op.getValueType();
	  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
	         "Unexpected return type for lowering");

	  // Pick the library routine and signedness that match the opcode.
	  RTLIB::Libcall LC;
	  bool isSigned;
	  switch (Op->getOpcode()) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
	  case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
	  case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
	  case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
	  case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
	  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
	  }

	  SDLoc dl(Op);
	  SDValue InChain = DAG.getEntryNode();

	  // Spill every operand to the stack and pass its address instead of its
	  // value. The stores are chained so they all precede the call.
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Op->getOperand(i).getValueType();
	    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
	           "Unexpected argument type for lowering");
	    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
	    Entry.Node = StackPtr;
	    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
	                           MachinePointerInfo(), /* Alignment = */ 16);
	    // getTypeForEVT takes the context by reference and yields a Type pointer,
	    // which is what PointerType::get expects (same call form as for the
	    // return type below).
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Ty = PointerType::get(ArgTy, 0);
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  // NOTE(review): the v2i64 return type together with setInRegister()
	  // presumably models the 128-bit result coming back in a vector register;
	  // confirm against the Win64 calling convention.
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(
	          getLibcallCallingConv(LC),
	          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
	          std::move(Args))
	      .setInRegister()
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return DAG.getBitcast(VT, CallInfo.first);
	}

	/// Custom lowering for a vector widening multiply that yields separate low
	/// and high halves. ISD::SMUL_LOHI is treated as signed; any other opcode is
	/// treated as unsigned.
	///
	/// 256-bit types without Subtarget.hasInt256() are split into two 128-bit
	/// nodes of the same opcode and concatenated. Otherwise the i32 lanes are
	/// multiplied with two PMULUDQ/PMULDQ nodes (even and odd lanes) and the
	/// results shuffled into a low vector and a high vector.
	///
	/// \returns A merge-values node holding {Lows, Highs}.
	static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
	MVT VT = Op0.getSimpleValueType();
	SDLoc dl(Op);

	// Decompose 256-bit ops into smaller 128-bit ops.
	if (VT.is256BitVector() && !Subtarget.hasInt256()) {
	unsigned Opcode = Op.getOpcode();
	unsigned NumElems = VT.getVectorNumElements();
	MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
	SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
	SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
	SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
	SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
	// Each half-width node produces two results (value 0 and value 1); glue
	// the matching results of both halves back together.
	SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
	SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
	SDValue Ops[] = {
	DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
	DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
	};
	return DAG.getMergeValues(Ops, dl);
	}

	assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) \|\|
	(VT == MVT::v8i32 && Subtarget.hasInt256()) \|\|
	(VT == MVT::v16i32 && Subtarget.hasAVX512()));

	int NumElts = VT.getVectorNumElements();

	// PMULxD operations multiply each even value (starting at 0) of LHS with
	// the related value of RHS and produce a widened result.
	// E.g., PMULUDQ <4 x i32> <a\|b\|c\|d>, <4 x i32> <e\|f\|g\|h>
	// => <2 x i64> <ae\|cg>
	//
	// In other words, to have all the results, we need to perform two PMULxD:
	// 1. one with the even values.
	// 2. one with the odd values.
	// To achieve #2, we need to place the odd values at an even position.
	//
	// Place the odd value at an even position (basically, shift all values 1
	// step to the left):
	// The mask is sized for the widest type; only its first NumElts entries
	// are passed to the shuffle.
	const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
	// <a\|b\|c\|d> => <b\|undef\|d\|undef>
	SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
	makeArrayRef(&Mask[0], NumElts));
	// <e\|f\|g\|h> => <f\|undef\|h\|undef>
	SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
	makeArrayRef(&Mask[0], NumElts));

	// Emit two multiplies, one for the lower 2 ints and one for the higher 2
	// ints.
	MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
	bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
	// The signed multiply node is only used when signed and SSE4.1 is present;
	// otherwise the unsigned one is used and fixed up further down.
	unsigned Opcode =
	(!IsSigned \|\| !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
	// PMULUDQ <4 x i32> <a\|b\|c\|d>, <4 x i32> <e\|f\|g\|h>
	// => <2 x i64> <ae\|cg>
	SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
	// PMULUDQ <4 x i32> <b\|undef\|d\|undef>, <4 x i32> <f\|undef\|h\|undef>
	// => <2 x i64> <bf\|dh>
	SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

	// Shuffle it back into the right order.
	// For result lane i, even i reads from Mul1 and odd i reads from Mul2
	// (adding NumElts selects the second shuffle operand). LowMask picks
	// element 2 * (i / 2) of the chosen product vector, HighMask the element
	// right after it.
	SmallVector<int, 16> HighMask(NumElts);
	SmallVector<int, 16> LowMask(NumElts);
	for (int i = 0; i != NumElts; ++i) {
	HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
	LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
	}

	SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
	SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);

	// If we have a signed multiply but no PMULDQ fix up the high parts of an
	// unsigned multiply.
	if (IsSigned && !Subtarget.hasSSE41()) {
	SDValue ShAmt = DAG.getConstant(
	31, dl,
	DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
	// T1 is Op1 in lanes where Op0 is negative and zero elsewhere; T2 is the
	// mirror image. Their sum is subtracted from the unsigned high halves.
	SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
	SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

	SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
	Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
	}

	// The first result of MUL_LOHI is actually the low value, followed by the
	// high value.
	SDValue Ops[] = {Lows, Highs};
	return DAG.getMergeValues(Ops, dl);
	}

	// Reports whether the Subtarget natively provides the shift-by-immediate form
	// selected by Opcode for vectors of type VT.
	static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
	                                        unsigned Opcode) {
	  const unsigned EltBits = VT.getScalarSizeInBits();

	  // Elements narrower than 16 bits are never supported.
	  if (EltBits < 16)
	    return false;

	  // 512-bit vectors: every opcode is accepted with AVX-512, except that
	  // 16-bit elements additionally require BWI.
	  if (VT.is512BitVector() && Subtarget.hasAVX512()) {
	    if (EltBits > 16)
	      return true;
	    if (Subtarget.hasBWI())
	      return true;
	  }

	  // Logical shifts: 128-bit vectors need SSE2, 256-bit vectors need Int256.
	  bool HasLogicalForm = false;
	  if (VT.is128BitVector())
	    HasLogicalForm = Subtarget.hasSSE2();
	  else if (VT.is256BitVector())
	    HasLogicalForm = Subtarget.hasInt256();

	  if (Opcode != ISD::SRA)
	    return HasLogicalForm;

	  // Arithmetic right shift: same requirement as the logical forms, and
	  // v2i64/v4i64 are only accepted with AVX-512.
	  if (!HasLogicalForm)
	    return false;
	  if (Subtarget.hasAVX512())
	    return true;
	  return VT != MVT::v2i64 && VT != MVT::v4i64;
	}

	// Variable shift amount that is shared by every vector lane. These
	// instructions are defined together with the shift-immediate forms, so the
	// answer is the same as for SupportedVectorShiftWithImm.
	static bool SupportedVectorShiftWithBaseAmnt(MVT VT,
	                                             const X86Subtarget &Subtarget,
	                                             unsigned Opcode) {
	  const bool HasImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
	  return HasImmForm;
	}

	// Reports whether the Subtarget natively provides the per-lane variable shift
	// selected by Opcode for vectors of type VT.
	static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
	                                    unsigned Opcode) {
	  // Nothing is supported without Int256.
	  if (!Subtarget.hasInt256())
	    return false;

	  // Elements narrower than 16 bits are never supported.
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits < 16)
	    return false;

	  // vXi16 supported only on AVX-512, BWI
	  if (EltBits == 16 && !Subtarget.hasBWI())
	    return false;

	  if (Subtarget.hasAVX512())
	    return true;

	  // Without AVX-512 only 128-bit and 256-bit vectors qualify.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return false;

	  // Logical shifts are fine at this point.
	  if (Opcode != ISD::SRA)
	    return true;

	  // Arithmetic right shift excludes the 64-bit element types.
	  return VT != MVT::v2i64 && VT != MVT::v4i64;
	}

	/// Tries to lower a vector shift (SHL, SRL, or otherwise treated as SRA)
	/// whose amount is the same constant in every lane.
	///
	/// Two amount shapes are recognised: a BUILD_VECTOR constant splat, and a
	/// 64-bit amount given as a BITCAST of a BUILD_VECTOR of narrower constants
	/// (optionally behind an EXTRACT_SUBVECTOR and/or a splat shuffle).
	///
	/// \returns The lowered node, or an empty SDValue when no case applies.
	static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);

	// Target shift-by-immediate opcode matching the generic opcode.
	unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	// Builds a 64-bit arithmetic shift right by ShiftAmt out of 32-bit lane
	// shifts (on the vector viewed as twice as many i32 lanes) plus a shuffle.
	auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
	assert((VT == MVT::v2i64 \|\| VT == MVT::v4i64) && "Unexpected SRA type");
	MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	SDValue Ex = DAG.getBitcast(ExVT, R);

	// ashr(R, 63) === cmp_slt(R, 0)
	if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
	assert((VT != MVT::v4i64 \|\| Subtarget.hasInt256()) &&
	"Unsupported PCMPGT op");
	return DAG.getNode(X86ISD::PCMPGT, dl, VT,
	getZeroVector(VT, Subtarget, DAG, dl), R);
	}

	if (ShiftAmt >= 32) {
	// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
	SDValue Upper =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
	SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	ShiftAmt - 32, DAG);
	// Shuffle indices at or above the ExVT element count select from Lower.
	if (VT == MVT::v2i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
	if (VT == MVT::v4i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	{9, 1, 11, 3, 13, 5, 15, 7});
	} else {
	// SRA upper i32, SHL whole i64 and select lower i32.
	SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	ShiftAmt, DAG);
	SDValue Lower =
	getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
	Lower = DAG.getBitcast(ExVT, Lower);
	// Shuffle indices at or above the ExVT element count select from Lower.
	if (VT == MVT::v2i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
	if (VT == MVT::v4i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	{8, 1, 10, 3, 12, 5, 14, 7});
	}
	return DAG.getBitcast(VT, Ex);
	};

	// Optimize shl/srl/sra with constant shift amount.
	if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
	uint64_t ShiftAmt = ShiftConst->getZExtValue();

	// Natively supported: emit the target shift-by-immediate directly.
	if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	// i64 SRA needs to be performed as partial shifts.
	if (((!Subtarget.hasXOP() && VT == MVT::v2i64) \|\|
	(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
	Op.getOpcode() == ISD::SRA)
	return ArithmeticShiftRight64(ShiftAmt);

	// i8 element vectors: shift as i16 lanes and mask, with a few shortcuts.
	if (VT == MVT::v16i8 \|\|
	(Subtarget.hasInt256() && VT == MVT::v32i8) \|\|
	VT == MVT::v64i8) {
	unsigned NumElts = VT.getVectorNumElements();
	MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	// Simple i8 add case
	if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
	return DAG.getNode(ISD::ADD, dl, VT, R, R);

	// ashr(R, 7) === cmp_slt(R, 0)
	if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
	SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
	if (VT.is512BitVector()) {
	// 512-bit: compare into a v64i1 mask, then sign-extend it back to VT.
	assert(VT == MVT::v64i8 && "Unexpected element type!");
	SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
	return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
	}
	return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
	}

	// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
	if (VT == MVT::v16i8 && Subtarget.hasXOP())
	return SDValue();

	// NOTE(review): the masks below shift a 32-bit or 8-bit constant by
	// ShiftAmt, and nothing in this block bounds ShiftAmt to the element
	// width - confirm callers never pass an out-of-range amount.
	if (Op.getOpcode() == ISD::SHL) {
	// Make a large shift.
	SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
	R, ShiftAmt, DAG);
	SHL = DAG.getBitcast(VT, SHL);
	// Zero out the rightmost bits.
	return DAG.getNode(ISD::AND, dl, VT, SHL,
	DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
	}
	if (Op.getOpcode() == ISD::SRL) {
	// Make a large shift.
	SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
	R, ShiftAmt, DAG);
	SRL = DAG.getBitcast(VT, SRL);
	// Zero out the leftmost bits.
	return DAG.getNode(ISD::AND, dl, VT, SRL,
	DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
	}
	if (Op.getOpcode() == ISD::SRA) {
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
	// The inner shift is emitted as a generic ISD::SRL with the original
	// amount vector; Mask is the sign bit (128) moved down by ShiftAmt.
	SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

	SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
	Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
	Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
	return Res;
	}
	llvm_unreachable("Unknown shift opcode.");
	}
	}
	}

	// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	// TODO: Replace constant extraction with getTargetConstantBitsFromNode.
	if (!Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| (Subtarget.hasInt256() && VT == MVT::v4i64) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v8i64))) {

	// AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
	unsigned SubVectorScale = 1;
	if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	SubVectorScale =
	Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
	Amt = Amt.getOperand(0);
	}

	// Peek through any splat that was introduced for i64 shift vectorization.
	int SplatIndex = -1;
	if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
	if (SVN->isSplat()) {
	SplatIndex = SVN->getSplatIndex();
	Amt = Amt.getOperand(0);
	assert(SplatIndex < (int)VT.getVectorNumElements() &&
	"Splat shuffle referencing second operand");
	}

	// Only a bitcast of a BUILD_VECTOR is understood from here on.
	if (Amt.getOpcode() != ISD::BITCAST \|\|
	Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	Amt = Amt.getOperand(0);
	// Ratio is the number of BUILD_VECTOR operands that make up one 64-bit
	// shift amount.
	unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	(SubVectorScale * VT.getVectorNumElements());
	unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
	uint64_t ShiftAmt = 0;
	// With a splat, read the pieces of the splatted lane; otherwise lane 0.
	unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
	// Reassemble the amount: piece i is placed i * 2^(6 - RatioInLog2) bits up.
	for (unsigned i = 0; i != Ratio; ++i) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
	if (!C)
	return SDValue();
	// 6 == Log2(64)
	ShiftAmt \|= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
	}

	// Check remaining shift amounts (if not a splat).
	if (SplatIndex < 0) {
	for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	uint64_t ShAmt = 0;
	for (unsigned j = 0; j != Ratio; ++j) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
	if (!C)
	return SDValue();
	// 6 == Log2(64)
	ShAmt \|= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
	}
	// Every lane must carry the same amount as the first one.
	if (ShAmt != ShiftAmt)
	return SDValue();
	}
	}

	if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	if (Op.getOpcode() == ISD::SRA)
	return ArithmeticShiftRight64(ShiftAmt);
	}

	return SDValue();
	}

	/// Tries to lower a vector shift (SHL, SRL, or otherwise treated as SRA)
	/// whose amount is the same value in every lane but need not be a constant.
	///
	/// The shared amount is looked for in a BUILD_VECTOR splat or a splat
	/// shuffle (optionally behind an EXTRACT_SUBVECTOR). A v2i64 amount given as
	/// a BITCAST of a BUILD_VECTOR with identical lanes is handled separately.
	///
	/// \returns The lowered node, or an empty SDValue when no case applies.
	static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);

	// Target opcode passed to getTargetVShiftNode together with a scalar
	// amount.
	unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	// Target opcode used with the amount kept as a vector operand.
	unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;

	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
	SDValue BaseShAmt;
	MVT EltVT = VT.getVectorElementType();

	if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
	// Check if this build_vector node is doing a splat.
	// If so, then set BaseShAmt equal to the splat value.
	BaseShAmt = BV->getSplatValue();
	// An undef splat value is treated as "no base amount found".
	if (BaseShAmt && BaseShAmt.isUndef())
	BaseShAmt = SDValue();
	} else {
	// Look through a subvector extract. Note that this rebinds Amt for the
	// rest of the function.
	if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
	Amt = Amt.getOperand(0);

	ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
	if (SVN && SVN->isSplat()) {
	unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
	SDValue InVec = Amt.getOperand(0);
	if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
	assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
	"Unexpected shuffle index found!");
	BaseShAmt = InVec.getOperand(SplatIdx);
	} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
	// Use the inserted scalar when it was inserted at the splatted index.
	if (ConstantSDNode *C =
	dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
	if (C->getZExtValue() == SplatIdx)
	BaseShAmt = InVec.getOperand(1);
	}
	}

	if (!BaseShAmt)
	// Avoid introducing an extract element from a shuffle.
	BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
	DAG.getIntPtrConstant(SplatIdx, dl));
	}
	}

	if (BaseShAmt.getNode()) {
	assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
	// Widen the scalar amount: element types wider than i32 (other than
	// i64) are zero-extended to i64, types narrower than i32 to i32.
	if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
	else if (EltVT.bitsLT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
	}
	}

	// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
	Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
	Amt = Amt.getOperand(0);
	// Ratio is the number of BUILD_VECTOR operands per 64-bit lane.
	unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	VT.getVectorNumElements();
	// Remember the pieces of lane 0 and require every later lane to match.
	std::vector<SDValue> Vals(Ratio);
	for (unsigned i = 0; i != Ratio; ++i)
	Vals[i] = Amt.getOperand(i);
	for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	for (unsigned j = 0; j != Ratio; ++j)
	if (Vals[j] != Amt.getOperand(i + j))
	return SDValue();
	}

	// The node is built with Op's original amount operand, not the
	// BUILD_VECTOR that Amt now refers to.
	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
	return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
	}
	return SDValue();
	}

	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	assert(VT.isVector() && "Custom lowering only for vector shifts!");
	assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

	if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
	return V;

	if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
	return V;

	if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
	return Op;

	// XOP has 128-bit variable logical/arithmetic shifts.
	// +ve/-ve Amt = shift left/right.
	if (Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| VT == MVT::v4i32 \|\|
	VT == MVT::v8i16 \|\| VT == MVT::v16i8)) {
	if (Op.getOpcode() == ISD::SRL \|\| Op.getOpcode() == ISD::SRA) {
	SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
	Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
	}
	if (Op.getOpcode() == ISD::SHL \|\| Op.getOpcode() == ISD::SRL)
	return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
	if (Op.getOpcode() == ISD::SRA)
	return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
	}

	// 2i64 vector logical shifts can efficiently avoid scalarization - do the
	// shifts per-lane and then shuffle the partial results back together.
	if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
	// Splat the shift amounts so the scalar shifts above will catch it.
	SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
	SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
	SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
	return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
	}

	// i64 vector arithmetic shift can be emulated with the transform:
	// M = lshr(SIGN_MASK, Amt)
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
	Op.getOpcode() == ISD::SRA) {
	SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
	SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
	R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
	R = DAG.getNode(ISD::XOR, dl, VT, R, M);
	R = DAG.getNode(ISD::SUB, dl, VT, R, M);
	return R;
	}

	// If possible, lower this packed shift into a vector multiply instead of
	// expanding it into a sequence of scalar shifts.
	// Do this only if the vector shift count is a constant build_vector.
	if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
	(VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
	(Subtarget.hasInt256() && VT == MVT::v16i16))) {
	SmallVector<SDValue, 8> Elts;
	MVT SVT = VT.getVectorElementType();
	unsigned SVTBits = SVT.getSizeInBits();
	APInt One(SVTBits, 1);
	unsigned NumElems = VT.getVectorNumElements();

	for (unsigned i=0; i !=NumElems; ++i) {
	SDValue Op = Amt->getOperand(i);
	if (Op->isUndef()) {
	Elts.push_back(Op);
	continue;
	}

	ConstantSDNode *ND = cast<ConstantSDNode>(Op);
	APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
	uint64_t ShAmt = C.getZExtValue();
	if (ShAmt >= SVTBits) {
	Elts.push_back(DAG.getUNDEF(SVT));
	continue;
	}
	Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
	}
	SDValue BV = DAG.getBuildVector(VT, dl, Elts);
	return DAG.getNode(ISD::MUL, dl, VT, R, BV);
	}

	// Lower SHL with variable shift amount.
	if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
	Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

	Op = DAG.getNode(ISD::ADD, dl, VT, Op,
	DAG.getConstant(0x3f800000U, dl, VT));
	Op = DAG.getBitcast(MVT::v4f32, Op);
	Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
	return DAG.getNode(ISD::MUL, dl, VT, Op, R);
	}

	// If possible, lower this shift as a sequence of two shifts by
	// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
	// Example:
	// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
	//
	// Could be rewritten as:
	// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
	//
	// The advantage is that the two shifts from the example would be
	// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
	// the vector shift into four scalar shifts plus four pairs of vector
	// insert/extract.
	if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) {
	bool UseMOVSD = false;
	bool CanBeSimplified;
	// The splat value for the first packed shift (the 'X' from the example).
	SDValue Amt1 = Amt->getOperand(0);
	// The splat value for the second packed shift (the 'Y' from the example).
	SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

	// See if it is possible to replace this node with a sequence of
	// two shifts followed by a MOVSS/MOVSD/PBLEND.
	if (VT == MVT::v4i32) {
	// Check if it is legal to use a MOVSS.
	CanBeSimplified = Amt2 == Amt->getOperand(2) &&
	Amt2 == Amt->getOperand(3);
	if (!CanBeSimplified) {
	// Otherwise, check if we can still simplify this node using a MOVSD.
	CanBeSimplified = Amt1 == Amt->getOperand(1) &&
	Amt->getOperand(2) == Amt->getOperand(3);
	UseMOVSD = true;
	Amt2 = Amt->getOperand(2);
	}
	} else {
	// Do similar checks for the case where the machine value type
	// is MVT::v8i16.
	CanBeSimplified = Amt1 == Amt->getOperand(1);
	for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
	CanBeSimplified = Amt2 == Amt->getOperand(i);

	if (!CanBeSimplified) {
	UseMOVSD = true;
	CanBeSimplified = true;
	Amt2 = Amt->getOperand(4);
	for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
	CanBeSimplified = Amt1 == Amt->getOperand(i);
	for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
	CanBeSimplified = Amt2 == Amt->getOperand(j);
	}
	}

	if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
	isa<ConstantSDNode>(Amt2)) {
	// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
	SDValue Splat1 =
	DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
	SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
	SDValue Splat2 =
	DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
	SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
	SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
	SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
	if (UseMOVSD)
	return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
	BitCast2, {0, 1, 6, 7}));
	return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
	BitCast2, {0, 5, 6, 7}));
	}
	}

	// v4i32 Non Uniform Shifts.
	// If the shift amount is constant we can shift each lane using the SSE2
	// immediate shifts, else we need to zero-extend each lane to the lower i64
	// and shift using the SSE2 variable shifts.
	// The separate results can then be blended together.
	if (VT == MVT::v4i32) {
	unsigned Opc = Op.getOpcode();
	SDValue Amt0, Amt1, Amt2, Amt3;
	if (ConstantAmt) {
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
	} else {
	// ISD::SHL is handled above but we include it here for completeness.
	switch (Opc) {
	default:
	llvm_unreachable("Unknown target vector shift node");
	case ISD::SHL:
	Opc = X86ISD::VSHL;
	break;
	case ISD::SRL:
	Opc = X86ISD::VSRL;
	break;
	case ISD::SRA:
	Opc = X86ISD::VSRA;
	break;
	}
	// The SSE2 shifts use the lower i64 as the same shift amount for
	// all lanes and the upper i64 is ignored. These shuffle masks
	// optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
	}

	SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
	SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
	SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
	SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
	SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
	}

	// It's worth extending once and using the vXi16/vXi32 shifts for smaller
	// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
	// make the existing SSE solution better.
	if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i8) \|\|
	(Subtarget.hasBWI() && VT == MVT::v32i8)) {
	assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&
	"Unexpected vector type");
	MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
	MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
	unsigned ExtOpc =
	Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	R = DAG.getNode(ExtOpc, dl, ExtVT, R);
	Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
	}

	if (VT == MVT::v16i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) {
	MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
	unsigned ShiftOpcode = Op->getOpcode();

	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (VT.is512BitVector()) {
	// On AVX512BW targets we make use of the fact that VSELECT lowers
	// to a masked blend which selects bytes based just on the sign bit
	// extracted to a mask.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	} else if (Subtarget.hasSSE41()) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
	return DAG.getSelect(dl, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
	Amt = DAG.getBitcast(VT, Amt);

	if (Op->getOpcode() == ISD::SHL \|\| Op->getOpcode() == ISD::SRL) {
	// r = VSELECT(r, shift(r, 4), a);
	SDValue M =
	DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);
	return R;
	}

	if (Op->getOpcode() == ISD::SRA) {
	// For SRA we need to unpack each byte to the higher byte of a i16 vector
	// so we can correctly sign extend. We don't care what happens to the
	// lower byte.
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);

	// r = VSELECT(r, shift(r, 4), a);
	SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(4, dl, ExtVT));
	SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(4, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 2), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(2, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(2, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 1), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(1, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(1, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// Logical shift the result back to the lower byte, leaving a zero upper
	// byte
	// meaning that we can safely pack with PACKUSWB.
	RLo =
	DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
	RHi =
	DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}
	}

	if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
	MVT ExtVT = MVT::v8i32;
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);
	SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
	SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
	Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
	Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	if (VT == MVT::v8i16) {
	unsigned ShiftOpcode = Op->getOpcode();

	// If we have a constant shift amount, the non-SSE41 path is best as
	// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
	bool UseSSE41 = Subtarget.hasSSE41() &&
	!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	if (UseSSE41) {
	MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
	V0 = DAG.getBitcast(ExtVT, V0);
	V1 = DAG.getBitcast(ExtVT, V1);
	Sel = DAG.getBitcast(ExtVT, Sel);
	return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we splat the sign bit - a negative value will
	// set all bits of the lanes to true and VSELECT uses that in
	// its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue C =
	DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
	return DAG.getSelect(dl, VT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
	if (UseSSE41) {
	// On SSE41 targets we need to replicate the shift mask in both
	// bytes for PBLENDVB.
	Amt = DAG.getNode(
	ISD::OR, dl, VT,
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
	} else {
	Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
	}

	// r = VSELECT(r, shift(r, 8), a);
	SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 4), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(Amt, M, R);
	return R;
	}

	// Decompose 256-bit shifts into smaller 128-bit shifts.
	if (VT.is256BitVector())
	return Lower256IntArith(Op, DAG);

	return SDValue();
	}

	/// Custom lowering for vector rotates (ISD::ROTL / ISD::ROTR).
	///
	/// AVX512 path: a rotate whose per-element amounts are all defined and equal
	/// is emitted as X86ISD::VROTLI / X86ISD::VROTRI with the amount reduced
	/// modulo the element width; every other rotate is returned unchanged.
	///
	/// Non-AVX512 path: XOP is asserted and only ISD::ROTL is accepted. 256-bit
	/// vectors are handed to Lower256IntArith, a constant-splat amount is emitted
	/// as X86ISD::VROTLI, and any other 128-bit rotate is returned unchanged.
	static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned Opcode = Op.getOpcode();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();

	  if (Subtarget.hasAVX512()) {
	    // Attempt to rotate by immediate.
	    APInt UndefElts;
	    SmallVector<APInt, 16> EltBits;
	    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
	      // The predicate only reads EltBits, so capture it by reference instead
	      // of copying the whole vector into the closure.
	      if (!UndefElts && llvm::all_of(EltBits, [&EltBits](const APInt &V) {
	            return EltBits[0] == V;
	          })) {
	        // Deliberately not named 'Op': that would shadow the node that the
	        // fall-back path below returns.
	        unsigned RotOpc =
	            (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
	        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
	        return DAG.getNode(RotOpc, DL, VT, R,
	                           DAG.getConstant(RotateAmt, DL, MVT::i8));
	      }
	    }

	    // Else, fall-back on VPROLV/VPRORV.
	    return Op;
	  }

	  assert(VT.isVector() && "Custom lowering only for vector rotates!");
	  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
	  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

	  // XOP has 128-bit vector variable + immediate rotates.
	  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.

	  // Split 256-bit integers.
	  if (VT.is256BitVector())
	    return Lower256IntArith(Op, DAG);

	  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

	  // Attempt to rotate by immediate.
	  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
	      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
	      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
	      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
	                         DAG.getConstant(RotateAmt, DL, MVT::i8));
	    }
	  }

	  // Use general rotate by variable (per-element).
	  return Op;
	}

	/// Lower an "add/sub/mul with overflow" node into the matching X86
	/// arithmetic node plus a SETCC that reads the flags result of that node.
	///
	/// The "brcond" lowering looks for this combination and may remove the SETCC
	/// when it has only one use. The two results are returned as a MERGE_VALUES
	/// node with the value types of the original node.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  SDLoc DL(Op);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT ResVT = N->getValueType(0);

	  unsigned ArithOpc = 0;
	  X86::CondCode OverflowCC;
	  // Result number of the flags value on the arithmetic node. The wide UMUL
	  // produces two data results, which pushes the flags to result 2.
	  unsigned FlagsResNo = 1;

	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown ovf instruction!");
	  case ISD::SADDO:
	    // An add of one will be selected as an INC. Note that INC doesn't
	    // set CF, so we can't do this for UADDO.
	    ArithOpc = isOneConstant(RHS) ? X86ISD::INC : X86ISD::ADD;
	    OverflowCC = X86::COND_O;
	    break;
	  case ISD::UADDO:
	    ArithOpc = X86ISD::ADD;
	    OverflowCC = X86::COND_B;
	    break;
	  case ISD::SSUBO:
	    // A subtract of one will be selected as a DEC. Note that DEC doesn't
	    // set CF, so we can't do this for USUBO.
	    ArithOpc = isOneConstant(RHS) ? X86ISD::DEC : X86ISD::SUB;
	    OverflowCC = X86::COND_O;
	    break;
	  case ISD::USUBO:
	    ArithOpc = X86ISD::SUB;
	    OverflowCC = X86::COND_B;
	    break;
	  case ISD::SMULO:
	    ArithOpc = ResVT == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
	    OverflowCC = X86::COND_O;
	    break;
	  case ISD::UMULO:
	    // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
	    if (ResVT == MVT::i8) {
	      ArithOpc = X86ISD::UMUL8;
	    } else {
	      ArithOpc = X86ISD::UMUL;
	      FlagsResNo = 2;
	    }
	    OverflowCC = X86::COND_O;
	    break;
	  }

	  // The arithmetic node also sets EFLAGS (modelled as an extra i32 result).
	  SDVTList VTs = FlagsResNo == 2 ? DAG.getVTList(ResVT, ResVT, MVT::i32)
	                                 : DAG.getVTList(ResVT, MVT::i32);
	  SDValue Sum = DAG.getNode(ArithOpc, DL, VTs, LHS, RHS);

	  SDValue SetCC =
	      getSETCC(OverflowCC, SDValue(Sum.getNode(), FlagsResNo), DL, DAG);

	  if (N->getValueType(1) == MVT::i1)
	    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
	}

	/// Decide whether an atomic access of type \p MemType has to be expanded
	/// with cmpxchg8b / cmpxchg16b.
	///
	/// True for a 64-bit type on a non-64-bit subtarget, and for a 128-bit type
	/// when the subtarget reports cmpxchg16b; false for every other width.
	/// Callers use this when expanding atomic operations (otherwise the
	/// operations are left alone to become __sync_fetch_and_... calls).
	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
	  const unsigned WidthInBits = MemType->getPrimitiveSizeInBits();

	  switch (WidthInBits) {
	  case 64:
	    // FIXME this should be Subtarget.hasCmpxchg8b
	    return !Subtarget.is64Bit();
	  case 128:
	    return Subtarget.hasCmpxchg16b();
	  default:
	    return false;
	  }
	}

	/// An atomic store is expanded in IR exactly when the type of the stored
	/// value needs cmpxchg8b/16b (see needsCmpXchgNb).
	bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  return needsCmpXchgNb(StoredTy);
	}

	// Atomic loads whose pointee type needs cmpxchg8b/16b are expanded to a
	// cmpxchg; all other atomic loads are left untouched.
	// Note: this turns large loads into lock cmpxchg8b/16b.
	// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  Type *LoadedTy =
	      cast<PointerType>(LI->getPointerOperandType())->getElementType();
	  if (needsCmpXchgNb(LoadedTy))
	    return AtomicExpansionKind::CmpXChg;
	  return AtomicExpansionKind::None;
	}

	/// Choose how an atomicrmw instruction is expanded in IR.
	///
	/// Types wider than the native width (64 bits on 64-bit subtargets, 32
	/// otherwise) use a cmpxchg loop only when needsCmpXchgNb allows it. For
	/// native-width types the decision depends on the operation, as detailed in
	/// the switch below.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();

	  // If the operand is too big, we must see if cmpxchg8/16b is available
	  // and default to library calls otherwise.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
	    if (needsCmpXchgNb(MemType))
	      return AtomicExpansionKind::CmpXChg;
	    return AtomicExpansionKind::None;
	  }

	  switch (AI->getOperation()) {
	  default:
	    llvm_unreachable("Unknown atomic operation");
	  case AtomicRMWInst::Xchg:
	  case AtomicRMWInst::Add:
	  case AtomicRMWInst::Sub:
	    // It's better to use xadd, xsub or xchg for these in all cases.
	    return AtomicExpansionKind::None;
	  case AtomicRMWInst::Or:
	  case AtomicRMWInst::And:
	  case AtomicRMWInst::Xor:
	    // If the atomicrmw's result isn't actually used, we can just add a "lock"
	    // prefix to a normal instruction for these operations.
	    if (AI->use_empty())
	      return AtomicExpansionKind::None;
	    return AtomicExpansionKind::CmpXChg;
	  case AtomicRMWInst::Nand:
	  case AtomicRMWInst::Max:
	  case AtomicRMWInst::Min:
	  case AtomicRMWInst::UMax:
	  case AtomicRMWInst::UMin:
	    // These always require a non-trivial set of data operations on x86. We
	    // must use a cmpxchg loop.
	    return AtomicExpansionKind::CmpXChg;
	  }
	}

	/// Replace an idempotent atomicrmw with an mfence followed by an atomic load.
	///
	/// Returns the new load after replacing all uses of \p AI and erasing it.
	/// Returns nullptr, leaving \p AI untouched, when the accessed type is wider
	/// than the native width, when the operation is single-thread scoped, or
	/// when the subtarget has no mfence.
	LoadInst *
	X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
	  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();
	  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
	  // there is no benefit in turning such RMWs into loads, and it is actually
	  // harmful as it introduces a mfence.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
	    return nullptr;

	  auto Builder = IRBuilder<>(AI);
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  auto SSID = AI->getSyncScopeID();
	  // We must restrict the ordering to avoid generating loads with Release or
	  // AcquireRelease orderings.
	  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
	  auto Ptr = AI->getPointerOperand();

	  // Before the load we need a fence. Here is an example lifted from
	  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
	  // is required:
	  // Thread 0:
	  //   x.store(1, relaxed);
	  //   r1 = y.fetch_add(0, release);
	  // Thread 1:
	  //   y.fetch_add(42, acquire);
	  //   r2 = x.load(relaxed);
	  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
	  // lowered to just a load without a fence. A mfence flushes the store buffer,
	  // making the optimization clearly correct.
	  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
	  // otherwise, we might be able to be more aggressive on relaxed idempotent
	  // rmw. In practice, they do not look useful, so we don't try to be
	  // especially clever.
	  if (SSID == SyncScope::SingleThread)
	    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
	    // the IR level, so we must wrap it in an intrinsic.
	    return nullptr;

	  if (!Subtarget.hasMFence())
	    // FIXME: it might make sense to use a locked operation here but on a
	    // different cache-line to prevent cache-line bouncing. In practice it
	    // is probably a small win, and x86 processors without mfence are rare
	    // enough that we do not bother.
	    return nullptr;

	  Function *MFence =
	      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
	  Builder.CreateCall(MFence, {});

	  // Finally we can emit the atomic load. CreateAlignedLoad takes the
	  // alignment in bytes, so convert the bit width; passing the bit count would
	  // claim an alignment eight times larger than the access guarantees.
	  unsigned AlignInBytes = AI->getType()->getPrimitiveSizeInBits() / 8;
	  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AlignInBytes);
	  Loaded->setAtomic(Order, SSID);
	  AI->replaceAllUsesWith(Loaded);
	  AI->eraseFromParent();
	  return Loaded;
	}

	/// Lower an atomic fence node.
	///
	/// Operand 0 is the chain, operand 1 the ordering and operand 2 the
	/// synchronization scope. Only a sequentially-consistent, system-scope fence
	/// produces an instruction: MFENCE when the subtarget has it, otherwise a
	/// locked OR of zero into the memory addressed by ESP. Every other fence
	/// becomes an X86ISD::MEMBARRIER.
	static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);
	  auto Ordering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
	  auto Scope = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // The only fence that needs an instruction is a sequentially-consistent
	  // cross-thread fence.
	  const bool NeedsInstruction =
	      Ordering == AtomicOrdering::SequentiallyConsistent &&
	      Scope == SyncScope::System;

	  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
	  if (!NeedsInstruction)
	    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Chain);

	  if (Subtarget.hasMFence())
	    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Chain);

	  // No MFENCE available: use a locked OR on the memory addressed by ESP.
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	  SDValue Ops[] = {
	    DAG.getRegister(X86::ESP, MVT::i32),     // Base
	    DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
	    DAG.getRegister(0, MVT::i32),            // Index
	    DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
	    DAG.getRegister(0, MVT::i32),            // Segment.
	    Zero,
	    Chain
	  };
	  SDNode *LockedOr =
	      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
	  return SDValue(LockedOr, 0);
	}

	/// Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG.
	///
	/// Operand 2 of the node is copied into the accumulator register matching
	/// the value type (AL/AX/EAX/RAX). The three results of the original node
	/// are then rewired to the accumulator read-back, a COND_E SETCC on EFLAGS,
	/// and the outgoing chain. Always returns an empty SDValue because the uses
	/// are replaced in place.
	static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  MVT T = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  // Pick the accumulator register and the operand size for this type.
	  unsigned AccReg = 0;
	  unsigned SizeInBytes = 0;
	  switch (T.SimpleTy) {
	  default:
	    llvm_unreachable("Invalid value type!");
	  case MVT::i8:
	    AccReg = X86::AL;
	    SizeInBytes = 1;
	    break;
	  case MVT::i16:
	    AccReg = X86::AX;
	    SizeInBytes = 2;
	    break;
	  case MVT::i32:
	    AccReg = X86::EAX;
	    SizeInBytes = 4;
	    break;
	  case MVT::i64:
	    assert(Subtarget.is64Bit() && "Node not type legal!");
	    AccReg = X86::RAX;
	    SizeInBytes = 8;
	    break;
	  }

	  // Glue the accumulator copy to the cmpxchg node.
	  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
	                                    Op.getOperand(2), SDValue());
	  SDValue Ops[] = { CopyIn.getValue(0),
	                    Op.getOperand(1),
	                    Op.getOperand(3),
	                    DAG.getTargetConstant(SizeInBytes, DL, MVT::i8),
	                    CopyIn.getValue(1) };
	  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
	  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
	                                            Ops, T, MMO);

	  // Read back the accumulator, then EFLAGS; equality signals success.
	  SDValue CopyOut =
	      DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, T,
	                         CmpXchg.getValue(1));
	  SDValue EFLAGS = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
	                                      MVT::i32, CopyOut.getValue(2));
	  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), CopyOut);
	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
	  return SDValue();
	}

	/// Custom lowering for ISD::BITCAST.
	///
	/// For a v2i32, v4i16, v8i8 or i64 source, only a bitcast to f64 is handled:
	/// the source elements are placed in the low half of a vector with twice as
	/// many elements (upper half undef), the vector is bitcast to v2f64 and
	/// element 0 is extracted. Any other destination returns an empty SDValue so
	/// that the conversion is expanded.
	///
	/// All remaining cases are asserted to be 64-bit MMX conversions: i64 <=> MMX
	/// vector and MMX <=> MMX are returned unchanged as legal, anything else
	/// returns an empty SDValue.
	static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
	      SrcVT == MVT::i64) {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    if (DstVT != MVT::f64)
	      // This conversion needs to be expanded.
	      return SDValue();

	    SDValue Op0 = Op->getOperand(0);
	    SmallVector<SDValue, 16> Elts;
	    SDLoc dl(Op);
	    unsigned NumElts;
	    MVT SVT;
	    if (SrcVT.isVector()) {
	      NumElts = SrcVT.getVectorNumElements();
	      SVT = SrcVT.getVectorElementType();

	      // Widen the vector in input in the case of MVT::v2i32.
	      // Example: from MVT::v2i32 to MVT::v4i32.
	      for (unsigned i = 0, e = NumElts; i != e; ++i)
	        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
	                                   DAG.getIntPtrConstant(i, dl)));
	    } else {
	      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
	             "Unexpected source type in LowerBITCAST");
	      // Split the i64 into its two i32 halves.
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(0, dl)));
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(1, dl)));
	      NumElts = 2;
	      SVT = MVT::i32;
	    }
	    // Explicitly mark the extra elements as Undef.
	    Elts.append(NumElts, DAG.getUNDEF(SVT));

	    // The widened vector has twice the element count: the original elements
	    // followed by the undef padding appended above.
	    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
	    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
	    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
	         Subtarget.hasMMX() && "Unexpected custom BITCAST");
	  assert((DstVT == MVT::i64 ||
	          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
	         "Unexpected custom BITCAST");
	  // i64 <=> MMX conversions are Legal.
	  if (SrcVT==MVT::i64 && DstVT.isVector())
	    return Op;
	  if (DstVT==MVT::i64 && SrcVT.isVector())
	    return Op;
	  // MMX <=> MMX conversions are Legal.
	  if (SrcVT.isVector() && DstVT.isVector())
	    return Op;
	  // All other conversions need to be expanded.
	  return SDValue();
	}

	/// Horizontally add the bytes of \p V within each element of \p VT.
	///
	/// \p V must be a vector of i8 with the same total size as \p VT, and \p VT
	/// must have elements wider than i8 (i16, i32 or i64). Each result element
	/// holds the sum of the bytes of \p V that overlap that element.
	static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  SDLoc DL(V);
	  MVT ByteVecVT = V.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
	         "Expected value to have byte element type.");
	  assert(EltVT != MVT::i8 &&
	         "Horizontal byte sum only makes sense for wider elements!");
	  unsigned VecSize = VT.getSizeInBits();
	  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

	  // i64 elements: PSADBW against zero already adds all bytes of every i64
	  // chunk, which directly yields the result for v2i64 and v4i64.
	  if (EltVT == MVT::i64) {
	    SDValue ZeroBytes = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
	    MVT SadVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	    SDValue Sums = DAG.getNode(X86ISD::PSADBW, DL, SadVT, V, ZeroBytes);
	    return DAG.getBitcast(VT, Sums);
	  }

	  // i32 elements: interleave the low and high halves with zeros so that each
	  // i32 sits alone in an i64 chunk, sum both halves with PSADBW, and then
	  // shrink and concatenate the two partial results with a PACKUS.
	  if (EltVT == MVT::i32) {
	    SDValue ZeroElts = getZeroVector(VT, Subtarget, DAG, DL);
	    SDValue AsElts = DAG.getBitcast(VT, V);
	    SDValue LoHalf = DAG.getNode(X86ISD::UNPCKL, DL, VT, AsElts, ZeroElts);
	    SDValue HiHalf = DAG.getNode(X86ISD::UNPCKH, DL, VT, AsElts, ZeroElts);

	    // Do the horizontal sums into two i64 vectors.
	    SDValue ZeroBytes = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
	    MVT SadVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	    LoHalf = DAG.getNode(X86ISD::PSADBW, DL, SadVT,
	                         DAG.getBitcast(ByteVecVT, LoHalf), ZeroBytes);
	    HiHalf = DAG.getNode(X86ISD::PSADBW, DL, SadVT,
	                         DAG.getBitcast(ByteVecVT, HiHalf), ZeroBytes);

	    // Merge them together.
	    MVT WordVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
	    SDValue Packed = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
	                                 DAG.getBitcast(WordVecVT, LoHalf),
	                                 DAG.getBitcast(WordVecVT, HiHalf));

	    return DAG.getBitcast(VT, Packed);
	  }

	  // The only element type left is i16.
	  assert(EltVT == MVT::i16 && "Unknown how to handle type");

	  // Shift the i16s left by 8, add to the unshifted input as i8s, then shift
	  // the i16s right by 8. The shifts are done on i16 elements because i8
	  // vector shifts are not directly supported.
	  SDValue Eight = DAG.getConstant(8, DL, VT);
	  SDValue ShiftedUp =
	      DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Eight);
	  SDValue ByteSum =
	      DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, ShiftedUp),
	                  DAG.getBitcast(ByteVecVT, V));
	  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, ByteSum), Eight);
	}

	/// Lower a vector population count with an in-register lookup table.
	///
	/// \p Op is bitcast to a byte vector; the high and the low nibble of every
	/// byte each index a 16-entry pop count table through PSHUFB, and the two
	/// lookups are added. For i8 elements that per-byte count is returned
	/// directly, otherwise it is widened by LowerHorizontalByteSum.
	static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned VecSize = VT.getSizeInBits();

	  // Implement a lookup table in register by using an algorithm based on:
	  // http://wm.ite.pl/articles/sse-popcount.html
	  //
	  // The general idea is that every lower byte nibble in the input vector is an
	  // index into a in-register pre-computed pop count table. We then split up the
	  // input vector in two new ones: (1) a vector with only the shifted-right
	  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
	  // masked out higher ones) for each byte. PSHUFB is used separately with both
	  // to index the in-register table. Next, both are added and the result is a
	  // i8 vector where each element contains the pop count for input byte.
	  //
	  // To obtain the pop count for elements != i8, we follow up with the same
	  // approach and use additional tricks as described below.
	  //
	  // LUT[n] is the number of set bits in the nibble value n.
	  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

	  int NumByteElts = VecSize / 8;
	  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
	  SDValue In = DAG.getBitcast(ByteVecVT, Op);
	  // Repeat the 16-entry table to fill the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumByteElts; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
	  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

	  // High nibbles
	  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
	  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

	  // Low nibbles
	  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

	  // The input vector is used as the shuffle mask that index elements into the
	  // LUT. After counting low and high nibbles, add the vector to obtain the
	  // final pop count per i8 element.
	  SDValue HighPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
	  SDValue LowPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
	  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

	  if (EltVT == MVT::i8)
	    return PopCnt;

	  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
	}

	/// Lower a 128-bit vector CTPOP using only shifts, masks, adds and subtracts.
	///
	/// \p Op is the value whose bits are counted (not the CTPOP node itself).
	/// Returns a value of the same type as \p Op holding the per-element counts.
	static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitmath lowering supported.");

	  MVT EltVT = VT.getVectorElementType();
	  int EltBits = EltVT.getSizeInBits();
	  int TotalBits = VT.getSizeInBits();

	  // This is the vectorized version of the "best" algorithm from
	  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
	  // with a minor tweak to use a series of adds + shifts instead of vector
	  // multiplications. Implemented for all integer vector types. We only use
	  // this when we don't have SSSE3 which allows a LUT-based lowering that is
	  // much faster, even faster than using native popcnt instructions.

	  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
	  // x86, so the shifts are done on elements at least i16 wide. This is
	  // correct because every shift below is immediately followed by a mask that
	  // clears any bits that sneak into the high bits of the byte elements.
	  MVT SrlVT = EltBits > 8 ? VT : MVT::getVectorVT(MVT::i16, TotalBits / 16);

	  // Logical right shift of X by Amt, performed in SrlVT and cast back to VT.
	  auto ShiftRight = [&](SDValue X, int Amt) {
	    SDValue Wide = DAG.getBitcast(SrlVT, X);
	    SDValue AmtV = DAG.getConstant(Amt, DL, SrlVT);
	    SDValue Shifted = DAG.getNode(ISD::SRL, DL, SrlVT, Wide, AmtV);
	    return DAG.getBitcast(VT, Shifted);
	  };
	  // AND X with Byte replicated across every byte of every element.
	  auto MaskBytes = [&](SDValue X, unsigned Byte) {
	    APInt Pattern = APInt::getSplat(EltBits, APInt(8, Byte));
	    SDValue PatternV = DAG.getConstant(Pattern, DL, VT);
	    return DAG.getNode(ISD::AND, DL, VT, X, PatternV);
	  };

	  // v = v - ((v >> 1) & 0x55555555...)
	  SDValue OddBits = MaskBytes(ShiftRight(Op, 1), 0x55);
	  SDValue PairCounts = DAG.getNode(ISD::SUB, DL, VT, Op, OddBits);

	  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	  SDValue LowPairs = MaskBytes(PairCounts, 0x33);
	  SDValue HighPairs = MaskBytes(ShiftRight(PairCounts, 2), 0x33);
	  SDValue NibbleCounts = DAG.getNode(ISD::ADD, DL, VT, LowPairs, HighPairs);

	  // v = (v + (v >> 4)) & 0x0F0F0F0F...
	  SDValue HighNibbles = ShiftRight(NibbleCounts, 4);
	  SDValue NibbleSum =
	      DAG.getNode(ISD::ADD, DL, VT, NibbleCounts, HighNibbles);
	  SDValue ByteCounts = MaskBytes(NibbleSum, 0x0F);

	  // ByteCounts now holds the byte-wise population count; wider elements only
	  // need a horizontal sum of their bytes.
	  if (EltVT == MVT::i8)
	    return ByteCounts;

	  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, TotalBits / 8);
	  return LowerHorizontalByteSum(DAG.getBitcast(ByteVecVT, ByteCounts), VT,
	                                Subtarget, DAG);
	}

	// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
	// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
	//
	// Lowers a 128-, 256- or 512-bit vector CTPOP node. The strategies are tried
	// in this order:
	//   1. With VPOPCNTDQ and at most 16 elements: zero-extend to i32 elements,
	//      count there, and truncate back.
	//   2. Without SSSE3: the shift/mask bitmath sequence (128-bit only).
	//   3. Split 256-bit (no AVX2) or 512-bit (no BWI) ops into halves.
	//   4. Otherwise: the in-register LUT lowering.
	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
	         "Unknown CTPOP type to handle");
	  SDLoc DL(Op.getNode());
	  SDValue Op0 = Op.getOperand(0);

	  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
	  if (Subtarget.hasVPOPCNTDQ()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    assert((VT.getVectorElementType() == MVT::i8 ||
	            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
	    // Wider element counts fall through to the generic lowerings below.
	    if (NumElems <= 16) {
	      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	    }
	  }

	  if (!Subtarget.hasSSSE3()) {
	    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
	    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
	    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
	  }

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return Lower512IntUnary(Op, DAG);

	  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
	}

	/// Custom lowering entry point for ISD::CTPOP. Only vector population counts
	/// are custom lowered, so this simply forwards to LowerVectorCTPOP.
	static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  const bool IsVectorOp = Op.getSimpleValueType().isVector();
	  assert(IsVectorOp &&
	         "We only do custom lowering for vector population count.");
	  (void)IsVectorOp; // Only read by the assert above.
	  return LowerVectorCTPOP(Op, Subtarget, DAG);
	}

	/// Lower BITREVERSE with the XOP VPPERM instruction.
	///
	/// Scalars are moved into a 128-bit vector, reversed there and extracted
	/// again; 256-bit vectors are split into 128-bit halves; 128-bit vectors are
	/// lowered to a single VPPERM whose mask both byte-swaps each element and
	/// bit-reverses every byte.
	static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
	  // perform the BITREVERSE.
	  if (!VT.isVector()) {
	    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
	    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
	    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  int NumElts = VT.getVectorNumElements();
	  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector())
	    return Lower256IntUnary(Op, DAG);

	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitreverse lowering supported.");

	  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
	  // perform the BSWAP in the shuffle.
	  // It's best to shuffle using the second operand as this will implicitly
	  // allow memory folding for multiple vectors.
	  SmallVector<SDValue, 16> MaskElts;
	  for (int i = 0; i != NumElts; ++i) {
	    // Walk each element's bytes from most to least significant so the
	    // selected bytes come out byte-swapped.
	    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
	      // Byte indices 16..31 select from the second source operand.
	      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
	      int PermuteByte = SourceByte | (2 << 5);
	      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
	    }
	  }

	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
	  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
	  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
	                    Res, Mask);
	  return DAG.getBitcast(VT, Res);
	}

	/// Lower ISD::BITREVERSE.
	///
	/// Uses the XOP lowering when XOP is available and the type is not a 512-bit
	/// vector. Otherwise requires SSSE3 and a byte-element vector: each byte is
	/// split into its two nibbles, each nibble is bit-reversed through a PSHUFB
	/// table lookup, and the two results are ORed together.
	static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasXOP() && !VT.is512BitVector())
	    return LowerBITREVERSE_XOP(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarType() == MVT::i8 &&
	         "Only byte vector BITREVERSE supported");

	  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
	  // two nibbles and a PSHUFB lookup to find the bitreverse of each
	  // 0-15 value (moved to the other nibble).
	  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

	  // LoLUT[n] is the 4-bit reversal of n placed in the high nibble.
	  const int LoLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
	      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
	      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
	      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
	  // HiLUT[n] is the 4-bit reversal of n placed in the low nibble.
	  const int HiLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
	      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
	      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
	      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

	  // Repeat the 16-entry tables across the whole vector width.
	  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
	    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
	  }

	  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
	  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
	  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	/// Build the LOCK-prefixed X86ISD memory node for an atomic
	/// add/sub/or/xor/and node \p N.
	///
	/// When \p AllowIncDec is set, the operand is the constant +1 or -1, and
	/// inc/dec is not slow on the subtarget (or the function is optimized for
	/// size), an add/sub is emitted as LINC/LDEC instead. The returned node
	/// produces (i32, chain).
	static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget,
	                                        bool AllowIncDec = true) {
	  unsigned NewOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD_ADD:
	    NewOpc = X86ISD::LADD;
	    break;
	  case ISD::ATOMIC_LOAD_SUB:
	    NewOpc = X86ISD::LSUB;
	    break;
	  case ISD::ATOMIC_LOAD_OR:
	    NewOpc = X86ISD::LOR;
	    break;
	  case ISD::ATOMIC_LOAD_XOR:
	    NewOpc = X86ISD::LXOR;
	    break;
	  case ISD::ATOMIC_LOAD_AND:
	    NewOpc = X86ISD::LAND;
	    break;
	  default:
	    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
	  }

	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

	  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
	    // Convert to inc/dec if they aren't slow or we are optimizing for size.
	    if (AllowIncDec && (!Subtarget.slowIncDec() ||
	                        DAG.getMachineFunction().getFunction().optForSize())) {
	      // add 1 and sub -1 are both an increment.
	      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
	          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
	        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
	                                       DAG.getVTList(MVT::i32, MVT::Other),
	                                       {N->getOperand(0), N->getOperand(1)},
	                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
	      // sub 1 and add -1 are both a decrement.
	      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
	          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
	        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
	                                       DAG.getVTList(MVT::i32, MVT::Other),
	                                       {N->getOperand(0), N->getOperand(1)},
	                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
	    }
	  }

	  return DAG.getMemIntrinsicNode(
	      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
	      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
	      /*MemVT=*/N->getSimpleValueType(0), MMO);
	}

	/// Lower atomic_load_ops into LOCK-prefixed operations.
	///
	/// When the loaded value is unused, the node is replaced by a LOCK-prefixed
	/// arithmetic node and an empty SDValue is returned. When it is used, an
	/// atomic_load_sub is rewritten as atomic_load_add of the negated operand and
	/// an atomic_load_add is returned unchanged.
	static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  if (!N->hasAnyUseOfValue(0)) {
	    SDValue Locked = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
	    // RAUW the chain, but don't worry about the result, as it's unused.
	    assert(!N->hasAnyUseOfValue(0));
	    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), Locked.getValue(1));
	    return SDValue();
	  }

	  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
	  // can only be lowered when the result is unused. They should have already
	  // been transformed into a cmpxchg loop in AtomicExpand.
	  if (N->getOpcode() != ISD::ATOMIC_LOAD_SUB) {
	    assert(N->getOpcode() == ISD::ATOMIC_LOAD_ADD &&
	           "Used AtomicRMW ops other than Add should have been expanded!");
	    return N;
	  }

	  // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
	  // select LXADD if LOCK_SUB can't be selected.
	  AtomicSDNode *AtomicN = cast<AtomicSDNode>(N.getNode());
	  SDValue Zero = DAG.getConstant(0, DL, VT);
	  SDValue NegatedRHS = DAG.getNode(ISD::SUB, DL, VT, Zero, N->getOperand(2));
	  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, N->getOperand(0),
	                       N->getOperand(1), NegatedRHS, AtomicN->getMemOperand());
	}

	/// Lower ISD::ATOMIC_STORE.
	///
	/// Sequentially consistent stores, and stores whose memory type is not legal,
	/// are turned into an ATOMIC_SWAP whose chain result is returned. All other
	/// atomic stores are returned unchanged.
	static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
	  AtomicSDNode *Store = cast<AtomicSDNode>(Op.getNode());
	  SDLoc dl(Store);
	  EVT VT = Store->getMemoryVT();

	  // Convert seq_cst store -> xchg
	  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
	  // FIXME: On 32-bit, store -> fist or movq would be more efficient
	  //        (The only way to get a 16-byte store is cmpxchg16b)
	  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
	  if (Store->getOrdering() == AtomicOrdering::SequentiallyConsistent ||
	      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
	    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, VT,
	                                 Store->getOperand(0), Store->getOperand(1),
	                                 Store->getOperand(2), Store->getMemOperand());
	    // Only the chain of the swap is needed; the loaded value is dropped.
	    return Swap.getValue(1);
	  }
	  // Other atomic stores have a simple pattern.
	  return Op;
	}

	/// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
	///
	/// Returns an empty SDValue when the result type is not yet legal so that
	/// the legalizer expands the node instead. Otherwise returns a MERGE_VALUES
	/// of the arithmetic result and the carry-out.
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  MVT VT = N->getSimpleValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDLoc DL(N);

	  // Set the carry flag by adding all-ones to the incoming carry value; only
	  // the flags result (value 1) of this add is consumed.
	  SDValue CarryIn = Op.getOperand(2);
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDValue AllOnesV = DAG.getConstant(AllOnes, DL, CarryVT);
	  SDValue FlagSetter = DAG.getNode(
	      X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn, AllOnesV);
	  SDValue CarryFlag = FlagSetter.getValue(1);

	  unsigned Opc = X86ISD::SBB;
	  if (Op.getOpcode() == ISD::ADDCARRY)
	    Opc = X86ISD::ADC;
	  SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32),
	                            Op.getOperand(0), Op.getOperand(1), CarryFlag);

	  // Read the carry-out back from the flags produced by ADC/SBB.
	  SDValue CarryOut = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
	  if (N->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, CarryOut);
	}

	/// Lower ISD::FSINCOS on 64-bit Darwin to a call to the __sincos_stret
	/// library entry point.
	///
	/// For f64 the call result is returned directly; for f32 the sine and cosine
	/// are extracted from elements 0 and 1 of the returned vector and merged.
	static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

	  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values as { float, float } (in XMM0) or
	  // { double, double } (which is returned in XMM0, XMM1).
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  bool isF64 = ArgVT == MVT::f64;
	  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
	  // the small struct {f32, f32} is returned in (eax, edx). For f64,
	  // the results are returned via SRet in memory.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = TLI.getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

	  // f64 results are modelled as a two-element struct, f32 results as a
	  // four-element vector.
	  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
	                      : (Type *)VectorType::get(ArgTy, 4);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

	  if (isF64)
	    // Returned in xmm0 and xmm1.
	    return CallResult.first;

	  // Returned in bits 0:31 and 32:63 of xmm0.
	  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(0, dl));
	  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(1, dl));
	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
	}

	/// Widen a vector input to a vector of NVT. The
	/// input vector must have the same element type as NVT.
	///
	/// The added elements are zero when \p FillWithZeroes is set and undef
	/// otherwise. Returns \p InOp itself when it already has type \p NVT, and
	/// undef of type \p NVT when \p InOp is undef.
	static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
	                            bool FillWithZeroes = false) {
	  // Check if InOp already has the right width.
	  MVT InVT = InOp.getSimpleValueType();
	  if (InVT == NVT)
	    return InOp;

	  if (InOp.isUndef())
	    return DAG.getUNDEF(NVT);

	  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
	         "input and widen element type must match");

	  unsigned InNumElts = InVT.getVectorNumElements();
	  unsigned WidenNumElts = NVT.getVectorNumElements();
	  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
	         "Unexpected request for vector widening");

	  SDLoc dl(InOp);
	  // If the input is a two-operand concat whose upper half is already the
	  // fill value (undef, or zero when filling with zeroes), widen only the
	  // lower half.
	  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
	      InOp.getNumOperands() == 2) {
	    SDValue N1 = InOp.getOperand(1);
	    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
	        N1.isUndef()) {
	      InOp = InOp.getOperand(0);
	      InVT = InOp.getSimpleValueType();
	      InNumElts = InVT.getVectorNumElements();
	    }
	  }
	  // Constant build vectors are widened by appending fill elements directly.
	  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
	    SmallVector<SDValue, 16> Ops;
	    for (unsigned i = 0; i < InNumElts; ++i)
	      Ops.push_back(InOp.getOperand(i));

	    EVT EltVT = InOp.getOperand(0).getValueType();

	    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
	      DAG.getUNDEF(EltVT);
	    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
	      Ops.push_back(FillVal);
	    return DAG.getBuildVector(NVT, dl, Ops);
	  }
	  // General case: insert the input at index 0 of a zero/undef wide vector.
	  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
	    DAG.getUNDEF(NVT);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
	                     InOp, DAG.getIntPtrConstant(0, dl));
	}

	/// Lower a masked scatter to an X86MaskedScatterSDNode.
	///
	/// The data, index and mask operands are widened where needed, the mask is
	/// truncated to an i1 vector, and all uses of \p Op are replaced by the chain
	/// result of the new node, which is also returned.
	static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "MGATHER/MSCATTER are supported on AVX-512 arch only");

	  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
	  SDValue Src = N->getValue();
	  MVT VT = Src.getSimpleValueType();
	  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
	  SDLoc dl(Op);

	  SDValue Index = N->getIndex();
	  SDValue Mask = N->getMask();
	  SDValue Chain = N->getChain();
	  SDValue BasePtr = N->getBasePtr();
	  MVT MemVT = N->getMemoryVT().getSimpleVT();
	  MVT IndexVT = Index.getSimpleValueType();
	  MVT MaskVT = Mask.getSimpleValueType();

	  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
	    // The v2i32 value was promoted to v2i64.
	    // Now we "redo" the type legalizer's work and widen the original
	    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
	    // with a shuffle.
	    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
	           "Unexpected memory type");
	    int ShuffleMask[] = {0, 2, -1, -1};
	    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
	                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
	    // Now we have 4 elements instead of 2.
	    // Expand the index.
	    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
	    Index = ExtendToType(Index, NewIndexVT, DAG);

	    // Expand the mask with zeroes.
	    // The mask may be <2 x i64> or <2 x i1> at this moment.
	    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
	           "Unexpected mask type");
	    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
	    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
	    VT = MVT::v4i32;
	  }

	  unsigned NumElts = VT.getVectorNumElements();
	  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	      !Index.getSimpleValueType().is512BitVector()) {
	    // AVX512F supports only 512-bit vectors: either the data or the index
	    // must be 512 bits wide. If the index is v8i32 we only need to
	    // sign-extend it to v8i64.
	    if (IndexVT == MVT::v8i32)
	      // Just extend index
	      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	    else {
	      // The minimal number of elts in scatter is 8
	      NumElts = 8;
	      // Index
	      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
	      // Use original index here, do not modify the index twice
	      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
	      if (IndexVT.getScalarType() == MVT::i32)
	        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	      // Mask
	      // At this point we have promoted mask operand
	      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
	      // Use the original mask here, do not modify the mask twice
	      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

	      // The value that should be stored
	      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
	      Src = ExtendToType(Src, NewVT, DAG);
	    }
	  }
	  // If the mask is "wide" at this point - truncate it to i1 vector
	  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

	  // The mask is killed by scatter, add it to the values
	  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
	  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
	  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
	  // Result 1 of the new node is its chain; it replaces the original node.
	  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
	  return SDValue(NewScatter.getNode(), 1);
	}

	/// Lower a masked load.
	///
	/// Non-expanding loads of at most 4 elements are returned unchanged. All
	/// other loads reaching this point are widened to 512 bits: the pass-through
	/// value and mask are extended, a wide masked load is emitted, and the
	/// original-width result is extracted from its low elements.
	static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {

	  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
	  MVT VT = Op.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
	         "Expanding masked load is supported on AVX-512 target only!");

	  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
	         "Expanding masked load is supported for 32 and 64-bit types only!");

	  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
	  // VLX. These types for exp-loads are handled here.
	  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked load op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked load op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
	  SDValue Src0 = N->getSrc0();
	  Src0 = ExtendToType(Src0, WideDataVT, DAG);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  // The added mask lanes are zero so the extra elements are never loaded.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
	                                      N->getBasePtr(), Mask, Src0,
	                                      N->getMemoryVT(), N->getMemOperand(),
	                                      N->getExtensionType(),
	                                      N->isExpandingLoad());

	  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	                                NewLoad.getValue(0),
	                                DAG.getIntPtrConstant(0, dl));
	  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Lower a masked store.
	///
	/// Non-compressing stores of at most 4 elements are returned unchanged. All
	/// other stores reaching this point are widened to 512 bits: the data and
	/// mask are extended (the mask with zeroes) and a wide masked store is
	/// emitted.
	static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
	  SDValue DataToStore = N->getValue();
	  MVT VT = DataToStore.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
	         "Compressing masked store is supported on AVX-512 target only!");

	  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
	         "Compressing masked store is supported for 32 and 64-bit types only!");

	  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
	  // VLX.
	  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked store op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked store op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
	  // The added mask lanes are zero so the extra elements are never stored.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
	                            Mask, N->getMemoryVT(), N->getMemOperand(),
	                            N->isTruncatingStore(), N->isCompressingStore());
	}

	/// Lower a masked gather to an X86MaskedGatherSDNode.
	///
	/// Returns an empty SDValue when the index is v2i32. On AVX-512 without VLX,
	/// when neither data nor index is 512 bits wide, the operands are widened to
	/// 8 elements first and the original-width result is extracted afterwards.
	/// The returned value merges the gathered data with the node's chain.
	static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

	  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue Index = N->getIndex();
	  SDValue Mask = N->getMask();
	  SDValue PassThru = N->getValue();
	  MVT IndexVT = Index.getSimpleValueType();
	  MVT MaskVT = Mask.getSimpleValueType();

	  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

	  // If the index is v2i32, we're being called by type legalization.
	  if (IndexVT == MVT::v2i32)
	    return SDValue();

	  // Build the target gather node; its results are (data, mask, chain).
	  auto EmitGather = [&](MVT DataVT, MVT ResMaskVT, SDValue Src, SDValue M,
	                        SDValue Idx) {
	    SDValue GatherOps[] = {N->getChain(), Src, M, N->getBasePtr(), Idx};
	    return DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	        DAG.getVTList(DataVT, ResMaskVT, MVT::Other), GatherOps, dl,
	        N->getMemoryVT(), N->getMemOperand());
	  };

	  // AVX512F supports only 512-bit vectors: either the data or the index must
	  // be 512 bits wide.
	  const bool NeedsWideOperand = Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
	                                !VT.is512BitVector() &&
	                                !IndexVT.is512BitVector();

	  if (!NeedsWideOperand || VT.getVectorNumElements() == 8) {
	    // With 8 elements already, sign-extending the index is all that is
	    // needed to reach 512 bits.
	    if (NeedsWideOperand)
	      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	    SDValue NewGather = EmitGather(VT, MaskVT, PassThru, Mask, Index);
	    return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
	  }

	  // Minimal number of elements in Gather
	  const unsigned WideNumElts = 8;

	  // Index
	  MVT WideIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), WideNumElts);
	  Index = ExtendToType(Index, WideIndexVT, DAG);
	  if (IndexVT.getScalarType() == MVT::i32)
	    Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	  // Mask: widen with zeroes, then narrow to an i1 vector.
	  MVT MaskBitVT = MVT::getVectorVT(MVT::i1, WideNumElts);
	  // At this point we have promoted mask operand
	  assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	  MVT WideMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), WideNumElts);
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

	  // The pass-through value
	  MVT WideVT = MVT::getVectorVT(VT.getScalarType(), WideNumElts);
	  PassThru = ExtendToType(PassThru, WideVT, DAG);

	  SDValue NewGather = EmitGather(WideVT, MaskBitVT, PassThru, Mask, Index);
	  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	                                NewGather.getValue(0),
	                                DAG.getIntPtrConstant(0, dl));
	  SDValue RetOps[] = {Extract, NewGather.getValue(2)};
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Lower a GC_TRANSITION_START node to an X86::NOOP machine node that
	/// carries the original chain and, if present, the incoming glue.
	SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  // Operand 0 is forwarded as the chain; a glued input, when present, is
	  // the last operand.
	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Lower a GC_TRANSITION_END node to an X86::NOOP machine node that
	/// carries the original chain and, if present, the incoming glue.
	SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  // Operand 0 is forwarded as the chain; a glued input, when present, is
	  // the last operand.
	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Provide custom lowering hooks for some operations.
	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	switch (Op.getOpcode()) {
	default: llvm_unreachable("Should not custom lower this!");
	case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	return LowerCMP_SWAP(Op, Subtarget, DAG);
	case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
	case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
	case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
	case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
	case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
	case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
	case ISD::VSELECT: return LowerVSELECT(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
	case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
	case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::SHL_PARTS:
	case ISD::SRA_PARTS:
	case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
	case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
	case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
	case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
	case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
	case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
	case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
	case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
	case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
	case ISD::FABS:
	case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
	case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
	case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
	case ISD::SETCC: return LowerSETCC(Op, DAG);
	case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
	case ISD::SELECT: return LowerSELECT(Op, DAG);
	case ISD::BRCOND: return LowerBRCOND(Op, DAG);
	case ISD::JumpTable: return LowerJumpTable(Op, DAG);
	case ISD::VASTART: return LowerVASTART(Op, DAG);
	case ISD::VAARG: return LowerVAARG(Op, DAG);
	case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
	case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
	case ISD::FRAME_TO_ARGS_OFFSET:
	return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
	case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
	case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
	case ISD::EH_SJLJ_SETUP_DISPATCH:
	return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
	case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
	case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
	case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
	case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
	case ISD::MULHS:
	case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
	case ISD::ROTL:
	case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO: return LowerXALUO(Op, DAG);
	case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
	case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
	case ISD::ADDCARRY:
	case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
	case ISD::ADD:
	case ISD::SUB: return LowerADD_SUB(Op, DAG);
	case ISD::SMAX:
	case ISD::SMIN:
	case ISD::UMAX:
	case ISD::UMIN: return LowerMINMAX(Op, DAG);
	case ISD::ABS: return LowerABS(Op, DAG);
	case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
	case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
	case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
	case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
	case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
	case ISD::GC_TRANSITION_START:
	return LowerGC_TRANSITION_START(Op, DAG);
	case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
	case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
	}
	}

	/// Custom-lowers \p N via LowerOperation and hands the replacement values back
	/// through \p Results.
	///
	/// On return Results either holds one value per result of \p N (their number
	/// and types must exactly match those of the node's original return values),
	/// or is left empty, which indicates that the node is not to be custom
	/// lowered after all.
	void X86TargetLowering::LowerOperationWrapper(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);

	  // A null node from LowerOperation: report nothing and leave Results empty.
	  if (Lowered.getNode() == nullptr)
	    return;

	  const unsigned NumResults = N->getNumValues();
	  assert((NumResults <= Lowered->getNumValues()) &&
	         "Lowering returned the wrong number of results!");

	  // Hand back exactly as many values as N produces. The lowered node may
	  // carry more values than the original (LowerSINT_TO_FP, for example, adds
	  // a trailing chain); anything past N's result count is dropped.
	  unsigned Idx = 0;
	  while (Idx != NumResults) {
	    Results.push_back(Lowered.getValue(Idx));
	    ++Idx;
	  }
	}

	/// Replace a node with an illegal result type with a new node built out of
	/// custom code.
	///
	/// Dispatches on the opcode of \p N. A case that knows how to handle the node
	/// appends its replacement value(s) to \p Results; note that several cases
	/// push a value that is wider than the node's original type (e.g. v4f32 for a
	/// v2f32 node). A case that cannot handle the particular type combination
	/// returns (or breaks) with \p Results untouched.
	/// NOTE(review): an empty Results presumably hands the node back to the
	/// generic type legalizer (the atomic cases below say so explicitly) --
	/// confirm against the caller.
	///
	/// \param N       The node whose results need replacing.
	/// \param Results Output list that receives the replacement values.
	/// \param DAG     The SelectionDAG used to build the replacement nodes.
	void X86TargetLowering::ReplaceNodeResults(SDNode *N,
	                                           SmallVectorImpl<SDValue>&Results,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  switch (N->getOpcode()) {
	  default:
	    llvm_unreachable("Do not know how to custom type legalize this operation!");
	  case X86ISD::AVG: {
	    // Legalize types for X86ISD::AVG by expanding vectors.
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	    auto InVT = N->getValueType(0);
	    auto InVTSize = InVT.getSizeInBits();
	    // Smallest of 128/256/512 bits that is at least as large as the input.
	    const unsigned RegSize =
	        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
	    assert((Subtarget.hasBWI() || RegSize < 512) &&
	           "512-bit vector requires AVX512BW");
	    assert((Subtarget.hasAVX2() || RegSize < 256) &&
	           "256-bit vector requires AVX2");

	    auto ElemVT = InVT.getVectorElementType();
	    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
	                                  RegSize / ElemVT.getSizeInBits());
	    assert(RegSize % InVT.getSizeInBits() == 0);
	    unsigned NumConcat = RegSize / InVT.getSizeInBits();

	    // Pad each operand with undef pieces up to RegVT; only slot 0 of Ops
	    // changes between the two concatenations.
	    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
	    Ops[0] = N->getOperand(0);
	    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
	    Ops[0] = N->getOperand(1);
	    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

	    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
	    if (!ExperimentalVectorWideningLegalization)
	      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
	                        DAG.getIntPtrConstant(0, dl));
	    Results.push_back(Res);
	    return;
	  }
	  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
	  case X86ISD::FMINC:
	  case X86ISD::FMIN:
	  case X86ISD::FMAXC:
	  case X86ISD::FMAX: {
	    EVT VT = N->getValueType(0);
	    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
	    SDValue UNDEF = DAG.getUNDEF(VT);
	    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                              N->getOperand(0), UNDEF);
	    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                              N->getOperand(1), UNDEF);
	    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
	    return;
	  }
	  case ISD::SDIV:
	  case ISD::UDIV:
	  case ISD::SREM:
	  case ISD::UREM:
	  case ISD::SDIVREM:
	  case ISD::UDIVREM: {
	    // All division/remainder flavours are forwarded to LowerWin64_i128OP.
	    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
	    Results.push_back(V);
	    return;
	  }
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT: {
	    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

	    if (N->getValueType(0) == MVT::v2i32) {
	      assert((IsSigned || Subtarget.hasAVX512()) &&
	             "Can only handle signed conversion without AVX512");
	      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	      SDValue Src = N->getOperand(0);
	      if (Src.getValueType() == MVT::v2f64) {
	        MVT ResVT = MVT::v4i32;
	        unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	        if (!IsSigned && !Subtarget.hasVLX()) {
	          // Widen to 512-bits.
	          ResVT = MVT::v8i32;
	          Opc = ISD::FP_TO_UINT;
	          Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	                            DAG.getUNDEF(MVT::v8f64),
	                            Src, DAG.getIntPtrConstant(0, dl));
	        }
	        SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
	        // Take the low subvector of the conversion result; its width depends
	        // on whether widening legalization is enabled.
	        ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
	                                                       : MVT::v2i32;
	        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
	                          DAG.getIntPtrConstant(0, dl));
	        Results.push_back(Res);
	        return;
	      }
	      if (Src.getValueType() == MVT::v2f32) {
	        // Pad the source to v4f32 with undef and convert at v4i32.
	        SDValue Idx = DAG.getIntPtrConstant(0, dl);
	        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                  DAG.getUNDEF(MVT::v2f32));
	        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
	                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
	        if (!ExperimentalVectorWideningLegalization)
	          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
	        Results.push_back(Res);
	        return;
	      }

	      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
	      // so early out here.
	      return;
	    }

	    std::pair<SDValue,SDValue> Vals =
	        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
	    SDValue FIST = Vals.first, StackSlot = Vals.second;
	    // If the helper produced nothing, Results stays empty.
	    if (FIST.getNode()) {
	      EVT VT = N->getValueType(0);
	      // Return a load from the stack slot.
	      if (StackSlot.getNode())
	        Results.push_back(
	            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
	      else
	        Results.push_back(FIST);
	    }
	    return;
	  }
	  case ISD::SINT_TO_FP: {
	    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	    SDValue Src = N->getOperand(0);
	    // Only v2i64 -> v2f32 is handled here; the result is produced as v4f32.
	    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
	      return;
	    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
	    return;
	  }
	  case ISD::UINT_TO_FP: {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    EVT VT = N->getValueType(0);
	    if (VT != MVT::v2f32)
	      return;
	    SDValue Src = N->getOperand(0);
	    EVT SrcVT = Src.getValueType();
	    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
	      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
	      return;
	    }
	    if (SrcVT != MVT::v2i32)
	      return;
	    // v2i32 source: 0x4330000000000000 is the bit pattern of the double 2^52.
	    // OR-ing the zero-extended 32-bit value into the low mantissa bits of
	    // that pattern and then subtracting 2^52 leaves the integer's value as a
	    // double, which is finally rounded to single precision.
	    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
	    SDValue VBias =
	        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
	    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
	                             DAG.getBitcast(MVT::v2i64, VBias));
	    Or = DAG.getBitcast(MVT::v2f64, Or);
	    // TODO: Are there any fast-math-flags to propagate here?
	    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
	    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
	    return;
	  }
	  case ISD::FP_ROUND: {
	    // Nothing is emitted unless the operand type is already legal.
	    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
	      return;
	    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
	    Results.push_back(V);
	    return;
	  }
	  case ISD::FP_EXTEND: {
	    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
	    // No other ValueType for FP_EXTEND should reach this point.
	    assert(N->getValueType(0) == MVT::v2f32 &&
	           "Do not know how to legalize this Node");
	    // No replacement is produced; Results is left empty.
	    return;
	  }
	  case ISD::INTRINSIC_W_CHAIN: {
	    // Operand 1 holds the intrinsic ID. Every path of the inner switch
	    // returns, so control never falls through to the next case.
	    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    switch (IntNo) {
	    default : llvm_unreachable("Do not know how to custom type "
	                               "legalize this intrinsic operation!");
	    case Intrinsic::x86_rdtsc:
	      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                                     Results);
	    case Intrinsic::x86_rdtscp:
	      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
	                                     Results);
	    case Intrinsic::x86_rdpmc:
	      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

	    case Intrinsic::x86_xgetbv:
	      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
	    }
	  }
	  case ISD::INTRINSIC_WO_CHAIN: {
	    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
	      Results.push_back(V);
	    return;
	  }
	  case ISD::READCYCLECOUNTER: {
	    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                                   Results);
	  }
	  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	    EVT T = N->getValueType(0);
	    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
	    bool Regs64bit = T == MVT::i128;
	    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
	    // Split operand 2 into halves and copy them into (R|E)AX / (R|E)DX,
	    // threading chain and glue from one copy to the next.
	    SDValue cpInL, cpInH;
	    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	                        DAG.getConstant(0, dl, HalfT));
	    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	                        DAG.getConstant(1, dl, HalfT));
	    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
	                             Regs64bit ? X86::RAX : X86::EAX,
	                             cpInL, SDValue());
	    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
	                             Regs64bit ? X86::RDX : X86::EDX,
	                             cpInH, cpInL.getValue(1));
	    // Split operand 3 the same way; the high half goes to (R|E)CX here, the
	    // low half is placed in (R|E)BX further down.
	    SDValue swapInL, swapInH;
	    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	                          DAG.getConstant(0, dl, HalfT));
	    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	                          DAG.getConstant(1, dl, HalfT));
	    swapInH =
	        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
	                         swapInH, cpInH.getValue(1));
	    // If the current function needs the base pointer, RBX,
	    // we shouldn't use cmpxchg directly.
	    // Indeed the lowering of that instruction will clobber
	    // that register and since RBX will be a reserved register
	    // the register allocator will not make sure its value will
	    // be properly saved and restored around this live-range.
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    SDValue Result;
	    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	    unsigned BasePtr = TRI->getBaseRegister();
	    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
	        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
	      // ISel prefers the LCMPXCHG64 variant.
	      // If that assert breaks, that means it is not the case anymore,
	      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
	      // not just EBX. This is a matter of accepting i64 input for that
	      // pseudo, and restoring into the register of the right width
	      // in expand pseudo. Everything else should just work.
	      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
	             "Saving only half of the RBX");
	      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
	                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
	      // Read the current (R|E)BX value so it can be passed to the pseudo
	      // as an explicit operand alongside swapInL.
	      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
	                                           Regs64bit ? X86::RBX : X86::EBX,
	                                           HalfT, swapInH.getValue(1));
	      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
	                       RBXSave,
	                       /*Glue*/ RBXSave.getValue(2)};
	      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	    } else {
	      unsigned Opcode =
	          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
	      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
	                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
	                                 swapInH.getValue(1));
	      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
	                       swapInL.getValue(1)};
	      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	    }
	    // Read the two result halves back from (R|E)AX / (R|E)DX.
	    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
	                                        Regs64bit ? X86::RAX : X86::EAX,
	                                        HalfT, Result.getValue(1));
	    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
	                                        Regs64bit ? X86::RDX : X86::EDX,
	                                        HalfT, cpOutL.getValue(2));
	    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

	    // The success flag is derived from EFLAGS with condition COND_E.
	    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
	                                        MVT::i32, cpOutH.getValue(2));
	    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
	    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

	    // Results: the rebuilt pair, the success flag, then the chain.
	    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
	    Results.push_back(Success);
	    Results.push_back(EFLAGS.getValue(1));
	    return;
	  }
	  case ISD::ATOMIC_SWAP:
	  case ISD::ATOMIC_LOAD_ADD:
	  case ISD::ATOMIC_LOAD_SUB:
	  case ISD::ATOMIC_LOAD_AND:
	  case ISD::ATOMIC_LOAD_OR:
	  case ISD::ATOMIC_LOAD_XOR:
	  case ISD::ATOMIC_LOAD_NAND:
	  case ISD::ATOMIC_LOAD_MIN:
	  case ISD::ATOMIC_LOAD_MAX:
	  case ISD::ATOMIC_LOAD_UMIN:
	  case ISD::ATOMIC_LOAD_UMAX:
	  case ISD::ATOMIC_LOAD: {
	    // Delegate to generic TypeLegalization. Situations we can really handle
	    // should have already been dealt with by AtomicExpandPass.cpp.
	    break;
	  }
	  case ISD::BITCAST: {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    EVT DstVT = N->getValueType(0);
	    EVT SrcVT = N->getOperand(0).getValueType();

	    // Only f64 -> v2i32 / v4i16 / v8i8 bitcasts are handled here.
	    if (SrcVT != MVT::f64 ||
	        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
	      return;

	    unsigned NumElts = DstVT.getVectorNumElements();
	    EVT SVT = DstVT.getVectorElementType();
	    // Same element type as the destination, twice as many elements.
	    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
	    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	                                   MVT::v2f64, N->getOperand(0));
	    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

	    if (ExperimentalVectorWideningLegalization) {
	      // If we are legalizing vectors by widening, we already have the desired
	      // legal vector type, just return it.
	      Results.push_back(ToVecInt);
	      return;
	    }

	    // Otherwise rebuild DstVT from the first NumElts elements of the wide
	    // vector.
	    SmallVector<SDValue, 8> Elts;
	    for (unsigned i = 0, e = NumElts; i != e; ++i)
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
	                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

	    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
	    return;
	  }
	  case ISD::MGATHER: {
	    EVT VT = N->getValueType(0);
	    if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
	      auto *Gather = cast<MaskedGatherSDNode>(N);
	      SDValue Index = Gather->getIndex();
	      if (Index.getValueType() != MVT::v2i64)
	        return;
	      SDValue Mask = Gather->getMask();
	      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	      // Widen the pass-through value to v4f32 with undef upper elements.
	      SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                                 Gather->getValue(),
	                                 DAG.getUNDEF(MVT::v2f32));
	      if (!Subtarget.hasVLX()) {
	        // We need to widen the mask, but the instruction will only use 2
	        // of its elements. So we can use undef.
	        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	                           DAG.getUNDEF(MVT::v2i1));
	        Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	      }
	      SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	                        Index };
	      SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	          DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
	          Gather->getMemoryVT(), Gather->getMemOperand());
	      // Value 0 is the gathered data, value 2 (MVT::Other) is the chain.
	      Results.push_back(Res);
	      Results.push_back(Res.getValue(2));
	      return;
	    }
	    if (VT == MVT::v2i32) {
	      auto *Gather = cast<MaskedGatherSDNode>(N);
	      SDValue Index = Gather->getIndex();
	      SDValue Mask = Gather->getMask();
	      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	      SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
	                                 Gather->getValue(),
	                                 DAG.getUNDEF(MVT::v2i32));
	      // If the index is v2i64 we can use it directly.
	      if (Index.getValueType() == MVT::v2i64 &&
	          (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
	        if (!Subtarget.hasVLX()) {
	          // We need to widen the mask, but the instruction will only use 2
	          // of its elements. So we can use undef.
	          Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	                             DAG.getUNDEF(MVT::v2i1));
	          Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	        }
	        SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	                          Index };
	        SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	            DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
	            Gather->getMemoryVT(), Gather->getMemOperand());
	        // Grab the chain before Res is possibly replaced by the extract.
	        SDValue Chain = Res.getValue(2);
	        if (!ExperimentalVectorWideningLegalization)
	          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	                            DAG.getIntPtrConstant(0, dl));
	        Results.push_back(Res);
	        Results.push_back(Chain);
	        return;
	      }
	      EVT IndexVT = Index.getValueType();
	      EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
	                                        IndexVT.getScalarType(), 4);
	      // Otherwise we need to custom widen everything to avoid promotion.
	      Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
	                          DAG.getUNDEF(IndexVT));
	      // Here the added mask elements are constant zero rather than undef.
	      Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	                         DAG.getConstant(0, dl, MVT::v2i1));
	      SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	                        Index };
	      SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
	                                        Gather->getMemoryVT(), dl, Ops,
	                                        Gather->getMemOperand());
	      // Grab the chain before Res is possibly replaced by the extract.
	      SDValue Chain = Res.getValue(1);
	      if (!ExperimentalVectorWideningLegalization)
	        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	                          DAG.getIntPtrConstant(0, dl));
	      Results.push_back(Res);
	      Results.push_back(Chain);
	      return;
	    }
	    // Any other gather result type: leave Results empty.
	    break;
	  }
	  }
	}

	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((X86ISD::NodeType)Opcode) {
	case X86ISD::FIRST_NUMBER: break;
	case X86ISD::BSF: return "X86ISD::BSF";
	case X86ISD::BSR: return "X86ISD::BSR";
	case X86ISD::SHLD: return "X86ISD::SHLD";
	case X86ISD::SHRD: return "X86ISD::SHRD";
	case X86ISD::FAND: return "X86ISD::FAND";
	case X86ISD::FANDN: return "X86ISD::FANDN";
	case X86ISD::FOR: return "X86ISD::FOR";
	case X86ISD::FXOR: return "X86ISD::FXOR";
	case X86ISD::FILD: return "X86ISD::FILD";
	case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
	case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
	case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
	case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
	case X86ISD::FLD: return "X86ISD::FLD";
	case X86ISD::FST: return "X86ISD::FST";
	case X86ISD::CALL: return "X86ISD::CALL";
	case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
	case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
	case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
	case X86ISD::BT: return "X86ISD::BT";
	case X86ISD::CMP: return "X86ISD::CMP";
	case X86ISD::COMI: return "X86ISD::COMI";
	case X86ISD::UCOMI: return "X86ISD::UCOMI";
	case X86ISD::CMPM: return "X86ISD::CMPM";
	case X86ISD::CMPMU: return "X86ISD::CMPMU";
	case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
	case X86ISD::SETCC: return "X86ISD::SETCC";
	case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
	case X86ISD::FSETCC: return "X86ISD::FSETCC";
	case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
	case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
	case X86ISD::CMOV: return "X86ISD::CMOV";
	case X86ISD::BRCOND: return "X86ISD::BRCOND";
	case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
	case X86ISD::IRET: return "X86ISD::IRET";
	case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
	case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
	case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
	case X86ISD::Wrapper: return "X86ISD::Wrapper";
	case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
	case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
	case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
	case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
	case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
	case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
	case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
	case X86ISD::PINSRB: return "X86ISD::PINSRB";
	case X86ISD::PINSRW: return "X86ISD::PINSRW";
	case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
	case X86ISD::ANDNP: return "X86ISD::ANDNP";
	case X86ISD::BLENDI: return "X86ISD::BLENDI";
	case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
	case X86ISD::ADDUS: return "X86ISD::ADDUS";
	case X86ISD::SUBUS: return "X86ISD::SUBUS";
	case X86ISD::HADD: return "X86ISD::HADD";
	case X86ISD::HSUB: return "X86ISD::HSUB";
	case X86ISD::FHADD: return "X86ISD::FHADD";
	case X86ISD::FHSUB: return "X86ISD::FHSUB";
	case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
	case X86ISD::FMAX: return "X86ISD::FMAX";
	case X86ISD::FMAXS: return "X86ISD::FMAXS";
	case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMIN: return "X86ISD::FMIN";
	case X86ISD::FMINS: return "X86ISD::FMINS";
	case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
	case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
	case X86ISD::FMAXC: return "X86ISD::FMAXC";
	case X86ISD::FMINC: return "X86ISD::FMINC";
	case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
	case X86ISD::FRCP: return "X86ISD::FRCP";
	case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
	case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
	case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
	case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
	case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
	case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
	case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
	case X86ISD::EH_SJLJ_SETUP_DISPATCH:
	return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
	case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
	case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
	case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
	case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
	case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
	case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
	case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
	case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
	return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
	case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
	return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
	case X86ISD::LADD: return "X86ISD::LADD";
	case X86ISD::LSUB: return "X86ISD::LSUB";
	case X86ISD::LOR: return "X86ISD::LOR";
	case X86ISD::LXOR: return "X86ISD::LXOR";
	case X86ISD::LAND: return "X86ISD::LAND";
	case X86ISD::LINC: return "X86ISD::LINC";
	case X86ISD::LDEC: return "X86ISD::LDEC";
	case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
	case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
	case X86ISD::VZEXT: return "X86ISD::VZEXT";
	case X86ISD::VSEXT: return "X86ISD::VSEXT";
	case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
	case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
	case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
	case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
	case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
	case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
	case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
	case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
	case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
	case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
	case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
	case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
	case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
	case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
	case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
	case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
	case X86ISD::VSHL: return "X86ISD::VSHL";
	case X86ISD::VSRL: return "X86ISD::VSRL";
	case X86ISD::VSRA: return "X86ISD::VSRA";
	case X86ISD::VSHLI: return "X86ISD::VSHLI";
	case X86ISD::VSRLI: return "X86ISD::VSRLI";
	case X86ISD::VSRAI: return "X86ISD::VSRAI";
	case X86ISD::VSRAV: return "X86ISD::VSRAV";
	case X86ISD::VROTLI: return "X86ISD::VROTLI";
	case X86ISD::VROTRI: return "X86ISD::VROTRI";
	case X86ISD::VPPERM: return "X86ISD::VPPERM";
	case X86ISD::CMPP: return "X86ISD::CMPP";
	case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
	case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
	case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
	case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
	case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
	case X86ISD::ADD: return "X86ISD::ADD";
	case X86ISD::SUB: return "X86ISD::SUB";
	case X86ISD::ADC: return "X86ISD::ADC";
	case X86ISD::SBB: return "X86ISD::SBB";
	case X86ISD::SMUL: return "X86ISD::SMUL";
	case X86ISD::UMUL: return "X86ISD::UMUL";
	case X86ISD::SMUL8: return "X86ISD::SMUL8";
	case X86ISD::UMUL8: return "X86ISD::UMUL8";
	case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
	case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
	case X86ISD::INC: return "X86ISD::INC";
	case X86ISD::DEC: return "X86ISD::DEC";
	case X86ISD::OR: return "X86ISD::OR";
	case X86ISD::XOR: return "X86ISD::XOR";
	case X86ISD::AND: return "X86ISD::AND";
	case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
	case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
	case X86ISD::PTEST: return "X86ISD::PTEST";
	case X86ISD::TESTP: return "X86ISD::TESTP";
	case X86ISD::TESTM: return "X86ISD::TESTM";
	case X86ISD::TESTNM: return "X86ISD::TESTNM";
	case X86ISD::KORTEST: return "X86ISD::KORTEST";
	case X86ISD::KTEST: return "X86ISD::KTEST";
	case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
	case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
	case X86ISD::PACKSS: return "X86ISD::PACKSS";
	case X86ISD::PACKUS: return "X86ISD::PACKUS";
	case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
	case X86ISD::VALIGN: return "X86ISD::VALIGN";
	case X86ISD::VSHLD: return "X86ISD::VSHLD";
	case X86ISD::VSHRD: return "X86ISD::VSHRD";
	case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
	case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
	case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
	case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
	case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
	case X86ISD::SHUFP: return "X86ISD::SHUFP";
	case X86ISD::SHUF128: return "X86ISD::SHUF128";
	case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
	case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
	case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
	case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
	case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
	case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
	case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
	case X86ISD::MOVSD: return "X86ISD::MOVSD";
	case X86ISD::MOVSS: return "X86ISD::MOVSS";
	case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
	case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
	case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
	case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
	case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
	case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
	case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
	case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
	case X86ISD::VPERMV: return "X86ISD::VPERMV";
	case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
	case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
	case X86ISD::VPERMI: return "X86ISD::VPERMI";
	case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
	case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
	case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
	case X86ISD::VRANGE: return "X86ISD::VRANGE";
	case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
	case X86ISD::VRANGES: return "X86ISD::VRANGES";
	case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
	case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
	case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
	case X86ISD::PSADBW: return "X86ISD::PSADBW";
	case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
	case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
	case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
	case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
	case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
	case X86ISD::MFENCE: return "X86ISD::MFENCE";
	case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
	case X86ISD::SAHF: return "X86ISD::SAHF";
	case X86ISD::RDRAND: return "X86ISD::RDRAND";
	case X86ISD::RDSEED: return "X86ISD::RDSEED";
	case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
	case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
	case X86ISD::VPSHA: return "X86ISD::VPSHA";
	case X86ISD::VPSHL: return "X86ISD::VPSHL";
	case X86ISD::VPCOM: return "X86ISD::VPCOM";
	case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
	case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
	case X86ISD::FMSUB: return "X86ISD::FMSUB";
	case X86ISD::FNMADD: return "X86ISD::FNMADD";
	case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
	case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
	case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
	case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
	case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
	case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
	case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
	case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
	case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
	case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
	case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
	case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
	case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
	case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
	case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
	case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
	case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
	case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
	case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
	case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
	case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
	case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
	case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
	case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
	case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
	case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
	case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
	case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
	case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
	case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
	case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
	case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
	case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
	case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
	case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
	case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
	case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
	case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
	case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
	case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
	case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
	case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
	case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
	case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
	case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
	case X86ISD::XTEST: return "X86ISD::XTEST";
	case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
	case X86ISD::EXPAND: return "X86ISD::EXPAND";
	case X86ISD::SELECT: return "X86ISD::SELECT";
	case X86ISD::SELECTS: return "X86ISD::SELECTS";
	case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
	case X86ISD::RCP14: return "X86ISD::RCP14";
	case X86ISD::RCP14S: return "X86ISD::RCP14S";
	case X86ISD::RCP28: return "X86ISD::RCP28";
	case X86ISD::RCP28S: return "X86ISD::RCP28S";
	case X86ISD::EXP2: return "X86ISD::EXP2";
	case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
	case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
	case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
	case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
	case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
	case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
	case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
	case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
	case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
	case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
	case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
	case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
	case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
	case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
	case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
	case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
	case X86ISD::SCALEF: return "X86ISD::SCALEF";
	case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
	case X86ISD::ADDS: return "X86ISD::ADDS";
	case X86ISD::SUBS: return "X86ISD::SUBS";
	case X86ISD::AVG: return "X86ISD::AVG";
	case X86ISD::MULHRS: return "X86ISD::MULHRS";
	case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
	case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
	case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
	case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
	case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
	case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
	case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
	case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
	case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
	case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
	case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
	case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
	case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
	case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
	case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
	case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
	case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
	case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
	case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
	case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
	case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
	case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
	case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
	case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
	case X86ISD::LWPINS: return "X86ISD::LWPINS";
	case X86ISD::MGATHER: return "X86ISD::MGATHER";
	case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
	case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
	case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
	case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
	case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
	case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
	case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
	case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
	case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
	}
	return nullptr;
	}

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	///
	/// Three things are checked in turn: the displacement must suit the current
	/// code model, a global base (if present) must be foldable into the address,
	/// and the scale must be one of the values accepted by the switch below.
	bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                              const AddrMode &AM, Type *Ty,
	                                              unsigned AS,
	                                              Instruction *I) const {
	  // X86 supports extremely general addressing modes.
	  CodeModel::Model M = getTargetMachine().getCodeModel();

	  // X86 allows a sign-extended 32-bit immediate field as a displacement.
	  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
	    return false;

	  if (AM.BaseGV) {
	    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

	    // If a reference to this global requires an extra load, we can't fold it.
	    if (isGlobalStubReference(GVFlags))
	      return false;

	    // If BaseGV requires a register for the PIC base, we cannot also have a
	    // BaseReg specified.
	    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
	      return false;

	    // If lower 4G is not available, then we must use rip-relative addressing.
	    if ((M != CodeModel::Small || isPositionIndependent()) &&
	        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
	      return false;
	  }

	  switch (AM.Scale) {
	  case 0:
	  case 1:
	  case 2:
	  case 4:
	  case 8:
	    // These scales always work.
	    break;
	  case 3:
	  case 5:
	  case 9:
	    // These scales are formed with basereg+scalereg. Only accept if there is
	    // no basereg yet.
	    if (AM.HasBaseReg)
	      return false;
	    break;
	  default: // Other stuff never works.
	    return false;
	  }

	  return true;
	}

	/// Return true if shifting a vector of type Ty by a scalar amount is
	/// considered cheaper than shifting it by a fully general vector amount.
	bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
	  unsigned Bits = Ty->getScalarSizeInBits();

	  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
	  // particularly cheaper than those without.
	  if (Bits == 8)
	    return false;

	  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
	  // shifts just as cheap as scalar ones.
	  if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
	    return false;

	  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
	  // fully general vector.
	  return true;
	}

	bool X86TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// Report whether Imm is acceptable as a compare immediate, which here means
	/// it is representable as a signed 32-bit integer.
	bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// Report whether Imm is acceptable as an add immediate, which here means it
	/// is representable as a signed 32-bit integer.
	bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  // Negated immediates are covered as well, since a sub can be used for them.
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// Return true if truncating a value of type VT1 to VT2 is free: both must
	/// be integer value types and VT1 must be strictly wider than VT2.
	bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
	}

	/// Value-type flavour of the zero-extension query: the extension is free
	/// only for i32 -> i64 on a 64-bit subtarget, because x86-64 implicitly
	/// zero-extends 32-bit results in 64-bit registers.
	bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  if (!(VT1 == MVT::i32))
	    return false;
	  if (!(VT2 == MVT::i64))
	    return false;
	  return Subtarget.is64Bit();
	}

	/// Return true if zero-extending Val to VT2 is free. Besides the type-only
	/// rule (delegated to the EVT overload), a load of a simple i8, i16 or i32
	/// value zero-extended to a simple integer type is also treated as free.
	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  EVT VT1 = Val.getValueType();
	  if (isZExtFree(VT1, VT2))
	    return true;

	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  if (!VT1.isSimple() || !VT1.isInteger() ||
	      !VT2.isSimple() || !VT2.isInteger())
	    return false;

	  switch (VT1.getSimpleVT().SimpleTy) {
	  default: break;
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	    // X86 has 8, 16, and 32-bit zero-extending loads.
	    return true;
	  }

	  return false;
	}

	/// Vector load extension is reported as desirable unconditionally; the
	/// argument is ignored.
	bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const {
	  return true;
	}

	/// Return true if a fused multiply-add on VT is faster than a separate
	/// multiply and add. This requires a subtarget with some form of FMA and a
	/// scalar element type of f32 or f64.
	bool
	X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  // Without any FMA support there is nothing to prefer.
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // Vectors are judged by their element type.
	  const EVT ScalarVT = VT.getScalarType();
	  if (!ScalarVT.isSimple())
	    return false;

	  const MVT::SimpleValueType ElemTy = ScalarVT.getSimpleVT().SimpleTy;
	  return ElemTy == MVT::f32 || ElemTy == MVT::f64;
	}

	/// Return true if narrowing an operation from VT1 to VT2 is worthwhile.
	/// Every combination is accepted except i32 -> i16.
	bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
	  // i16 instructions are longer (0x66 prefix) and potentially slower.
	  if (VT1 == MVT::i32 && VT2 == MVT::i16)
	    return false;
	  return true;
	}

	/// Targets can use this to indicate that they only support some
	/// VECTOR_SHUFFLE operations, those with specific masks.
	/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
	/// are assumed to be legal.
	///
	/// Here the mask itself is never inspected: the answer depends only on VT.
	bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  // Extended (non-simple) types are rejected outright.
	  if (!VT.isSimple())
	    return false;

	  const MVT SimpleVT = VT.getSimpleVT();

	  // Not for i1 vectors
	  if (SimpleVT.getScalarType() == MVT::i1)
	    return false;

	  // Very little shuffling can be done for 64-bit vectors right now.
	  if (SimpleVT.getSizeInBits() == 64)
	    return false;

	  // We only care that the types being shuffled are legal. The lowering can
	  // handle any possible shuffle mask that results.
	  return isTypeLegal(SimpleVT);
	}

	/// Legality query for clear masks. They get no special treatment and are
	/// answered by the generic shuffle-mask legality check.
	bool
	X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
	                                          EVT VT) const {
	  const bool Legal = isShuffleMaskLegal(Mask, VT);
	  return Legal;
	}

	//===----------------------------------------------------------------------===//
	// X86 Scheduler Hooks
	//===----------------------------------------------------------------------===//

	/// Utility function to emit xbegin specifying the start of an RTM region.
	///
	/// MI is the pseudo being expanded and is erased before returning. MBB is
	/// the block containing it; everything after MI, together with MBB's
	/// successor edges, is moved into a new sink block. Returns that sink block.
	static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
	                                     const TargetInstrInfo *TII) {
	  DebugLoc DL = MI.getDebugLoc();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // For the v = xbegin(), we generate
	  //
	  // thisMBB:
	  //  xbegin fallMBB
	  //
	  // mainMBB:
	  //  s0 = -1
	  //
	  // fallMBB:
	  //  eax = # XABORT_DEF
	  //  s1 = eax
	  //
	  // sinkMBB:
	  //  v = phi(s0/mainMBB, s1/fallMBB)

	  MachineBasicBlock *thisMBB = MBB;
	  MachineFunction *MF = MBB->getParent();
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, fallMBB);
	  MF->insert(I, sinkMBB);

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  unsigned DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned fallDstReg = MRI.createVirtualRegister(RC);

	  // thisMBB:
	  //  xbegin fallMBB
	  //  # fallthrough to mainMBB
	  //  # abortion to fallMBB
	  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(fallMBB);

	  // mainMBB:
	  //  mainDstReg := -1
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
	  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  mainMBB->addSuccessor(sinkMBB);

	  // fallMBB:
	  //  ; pseudo instruction to model hardware's definition from XABORT
	  //  EAX := XABORT_DEF
	  //  fallDstReg := EAX
	  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
	  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
	      .addReg(X86::EAX);
	  fallMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(fallDstReg).addMBB(fallMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
	// or XMM0_V32I8 in AVX all of this code can be replaced with that
	// in the .td file.
	//
	// Expands a PCMP[IE]STRM128{REG,MEM} pseudo (or its V-prefixed form) into the
	// matching rr/rm instruction. Every operand except operand 0 and implicit
	// register operands is forwarded, and XMM0 is then copied into the pseudo's
	// operand-0 register. MI is erased; BB is returned unchanged.
	static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
	  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
	  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
	  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
	  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
	  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
	  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
	  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  // Start at 1 to skip the result operand of the pseudo.
	  unsigned NumArgs = MI.getNumOperands();
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::XMM0);

	  MI.eraseFromParent();
	  return BB;
	}

	// FIXME: Custom handling because TableGen doesn't support multiple implicit
	// defs in an instruction pattern
	//
	// Expands a PCMP[IE]STRI{REG,MEM} pseudo (or its V-prefixed form) into the
	// matching rr/rm instruction. Every operand except operand 0 and implicit
	// register operands is forwarded, and ECX is then copied into the pseudo's
	// operand-0 register. MI is erased; BB is returned unchanged.
	static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
	  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
	  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
	  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
	  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
	  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
	  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
	  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  unsigned NumArgs = MI.getNumOperands(); // remove the results
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::ECX);

	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand the WRPKRU pseudo: copy the value operand into EAX, zero ECX and
	/// EDX, then emit the real WRPKRUr instruction. MI is erased and BB is
	/// returned unchanged.
	static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert input VAL into EAX
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
	      .addReg(MI.getOperand(0).getReg());
	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert zero to EDX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);

	  // insert WRPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand the RDPKRU pseudo: zero ECX, emit the real RDPKRUr instruction and
	/// copy EAX into the pseudo's operand-0 register. MI is erased and BB is
	/// returned unchanged.
	static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert RDPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::EAX);

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand a monitor-style pseudo. The first X86::AddrNumOperands operands of
	/// MI form an address, which is materialised with LEA into RAX (64-bit) or
	/// EAX (32-bit); the next two register operands are copied into ECX and EDX.
	/// The operand-less instruction Opc is then emitted. MI is erased and BB is
	/// returned unchanged.
	static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
	                                      const X86Subtarget &Subtarget,
	                                      unsigned Opc) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX, other two args into ECX, EDX.
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));

	  unsigned ValOps = X86::AddrNumOperands;
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
	      .addReg(MI.getOperand(ValOps).getReg());
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
	      .addReg(MI.getOperand(ValOps + 1).getReg());

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(Opc));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand the CLZERO pseudo. The first X86::AddrNumOperands operands of MI
	/// form an address, which is materialised with LEA into RAX (64-bit) or EAX
	/// (32-bit) before the operand-less CLZEROr instruction is emitted. MI is
	/// erased and BB is returned unchanged.
	static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI->getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI->getOperand(i));

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

	  MI->eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}



	/// Custom inserter for the VAARG_64 pseudo. It expands MI into code that
	/// computes the address of the next variadic argument (written to operand 0)
	/// and advances the corresponding field of the va_list in memory.
	///
	/// When ArgMode selects gp_offset or fp_offset, MBB is split into
	/// offsetMBB / overflowMBB / endMBB and the two candidate addresses are
	/// merged by a PHI at the front of endMBB. Otherwise the argument is taken
	/// from the overflow area and control flow is left untouched.
	///
	/// MI is erased. Returns the block in which the code following MI now lives
	/// (endMBB, which is MBB itself in the overflow-only case).
	MachineBasicBlock *
	X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// Emit va_arg instruction on X86-64.

	// Operands to this pseudo-instruction:
	// 0 ) Output : destination address (reg)
	// 1-5) Input : va_list address (addr, i64mem)
	// 6 ) ArgSize : Size (in bytes) of vararg type
	// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
	// 8 ) Align : Alignment of type
	// 9 ) EFLAGS (implicit-def)

	assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
	static_assert(X86::AddrNumOperands == 5,
	"VAARG_64 assumes 5 address operands");

	unsigned DestReg = MI.getOperand(0).getReg();
	MachineOperand &Base = MI.getOperand(1);
	MachineOperand &Scale = MI.getOperand(2);
	MachineOperand &Index = MI.getOperand(3);
	MachineOperand &Disp = MI.getOperand(4);
	MachineOperand &Segment = MI.getOperand(5);
	unsigned ArgSize = MI.getOperand(6).getImm();
	unsigned ArgMode = MI.getOperand(7).getImm();
	unsigned Align = MI.getOperand(8).getImm();

	// Memory Reference
	// The same memory operand range is attached to every va_list load/store
	// emitted below.
	assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
	MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	// Machine Information
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
	const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
	DebugLoc DL = MI.getDebugLoc();

	// struct va_list {
	// i32 gp_offset
	// i32 fp_offset
	// i64 overflow_area (address)
	// i64 reg_save_area (address)
	// }
	// sizeof(va_list) = 24
	// alignment(va_list) = 8

	unsigned TotalNumIntRegs = 6;
	unsigned TotalNumXMMRegs = 8;
	bool UseGPOffset = (ArgMode == 1);
	bool UseFPOffset = (ArgMode == 2);
	// Upper bound for the offset being checked: the integer register slots,
	// plus the XMM slots when fp_offset is in use.
	unsigned MaxOffset = TotalNumIntRegs * 8 +
	(UseFPOffset ? TotalNumXMMRegs * 16 : 0);

	/* Align ArgSize to a multiple of 8 */
	unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
	bool NeedsAlign = (Align > 8);

	MachineBasicBlock *thisMBB = MBB;
	MachineBasicBlock *overflowMBB;
	MachineBasicBlock *offsetMBB;
	MachineBasicBlock *endMBB;

	unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
	unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
	unsigned OffsetReg = 0;

	if (!UseGPOffset && !UseFPOffset) {
	// If we only pull from the overflow region, we don't create a branch.
	// We don't need to alter control flow.
	OffsetDestReg = 0; // unused
	OverflowDestReg = DestReg;

	offsetMBB = nullptr;
	overflowMBB = thisMBB;
	endMBB = thisMBB;
	} else {
	// First emit code to check if gp_offset (or fp_offset) is below the bound.
	// If so, pull the argument from reg_save_area. (branch to offsetMBB)
	// If not, pull from overflow_area. (branch to overflowMBB)
	//
	//          thisMBB
	//          |     .
	//          |        .
	//  offsetMBB   overflowMBB
	//          |        .
	//          |     .
	//          endMBB

	// Registers for the PHI in endMBB
	OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
	OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	MachineFunction *MF = MBB->getParent();
	overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator MBBIter = ++MBB->getIterator();

	// Insert the new basic blocks
	MF->insert(MBBIter, offsetMBB);
	MF->insert(MBBIter, overflowMBB);
	MF->insert(MBBIter, endMBB);

	// Transfer the remainder of MBB and its successor edges to endMBB.
	endMBB->splice(endMBB->begin(), thisMBB,
	std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
	endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

	// Make offsetMBB and overflowMBB successors of thisMBB
	thisMBB->addSuccessor(offsetMBB);
	thisMBB->addSuccessor(overflowMBB);

	// endMBB is a successor of both offsetMBB and overflowMBB
	offsetMBB->addSuccessor(endMBB);
	overflowMBB->addSuccessor(endMBB);

	// Load the offset value into a register
	// (displacement 0 selects gp_offset, displacement 4 selects fp_offset)
	OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Check if there is enough room left to pull this argument.
	BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
	.addReg(OffsetReg)
	.addImm(MaxOffset + 8 - ArgSizeA8);

	// Branch to "overflowMBB" if offset >= max
	// Fall through to "offsetMBB" otherwise
	BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
	.addMBB(overflowMBB);
	}

	// In offsetMBB, emit code to use the reg_save_area.
	if (offsetMBB) {
	assert(OffsetReg != 0);

	// Read the reg_save_area address.
	// (displacement 16 within the va_list)
	unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 16)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Zero-extend the offset
	unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
	.addImm(0)
	.addReg(OffsetReg)
	.addImm(X86::sub_32bit);

	// Add the offset to the reg_save_area to get the final address.
	BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
	.addReg(OffsetReg64)
	.addReg(RegSaveReg);

	// Compute the offset for the next argument
	// (advance by 16 when using fp_offset, by 8 when using gp_offset)
	unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
	.addReg(OffsetReg)
	.addImm(UseFPOffset ? 16 : 8);

	// Store it back into the va_list.
	BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.addReg(NextOffsetReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// Jump to endMBB
	BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
	.addMBB(endMBB);
	}

	//
	// Emit code to use overflow area
	//

	// Load the overflow_area address into a register.
	// (displacement 8 within the va_list)
	unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we need to align it, do so. Otherwise, just copy the address
	// to OverflowDestReg.
	if (NeedsAlign) {
	// Align the overflow address
	assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
	unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

	// aligned_addr = (addr + (align-1)) & ~(align-1)
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
	.addReg(OverflowAddrReg)
	.addImm(Align-1);

	BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
	.addReg(TmpReg)
	.addImm(~(uint64_t)(Align-1));
	} else {
	BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
	.addReg(OverflowAddrReg);
	}

	// Compute the next overflow address after this argument.
	// (the overflow address should be kept 8-byte aligned)
	unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
	.addReg(OverflowDestReg)
	.addImm(ArgSizeA8);

	// Store the new overflow address.
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.addReg(NextAddrReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we branched, emit the PHI to the front of endMBB.
	if (offsetMBB) {
	BuildMI(*endMBB, endMBB->begin(), DL,
	TII->get(X86::PHI), DestReg)
	.addReg(OffsetDestReg).addMBB(offsetMBB)
	.addReg(OverflowDestReg).addMBB(overflowMBB);
	}

	// Erase the pseudo instruction
	MI.eraseFromParent();

	return endMBB;
	}

	/// Custom inserter for the pseudo that saves the XMM argument registers of a
	/// varargs function.
	///
	/// Operands read from MI: 0 is the count register, 1 is the frame index of
	/// the register save area, 2 is the byte offset of the first XMM slot, and
	/// operands 3 up to (but excluding) the last one are the registers to store,
	/// 16 bytes apart. MBB is split so that the stores live in their own block;
	/// MI is erased and the block holding the code that followed MI is returned.
	MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *MBB) const {
	  // Emit code to save XMM registers to the stack. The ABI says that the
	  // number of registers to save is given in %al, so it's theoretically
	  // possible to do an indirect jump trick to avoid saving all of them,
	  // however this code takes a simpler approach and just executes all
	  // of the stores if %al is non-zero. It's less code, and it's probably
	  // easier on the hardware branch predictor, and stores aren't all that
	  // expensive anyway.

	  // Create the new basic blocks. One block contains all the XMM stores,
	  // and one block is the final destination regardless of whether any
	  // stores were performed.
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	  MachineFunction *F = MBB->getParent();
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(MBBIter, XMMSaveMBB);
	  F->insert(MBBIter, EndMBB);

	  // Transfer the remainder of MBB and its successor edges to EndMBB.
	  EndMBB->splice(EndMBB->begin(), MBB,
	                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // The original block will now fall through to the XMM save block.
	  MBB->addSuccessor(XMMSaveMBB);
	  // The XMMSaveMBB will fall through to the end block.
	  XMMSaveMBB->addSuccessor(EndMBB);

	  // Now add the instructions.
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  unsigned CountReg = MI.getOperand(0).getReg();
	  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
	  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

	  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
	    // If %al is 0, branch around the XMM save block.
	    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
	    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
	    MBB->addSuccessor(EndMBB);
	  }

	  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
	  // that was just emitted, but clearly shouldn't be "saved".
	  assert((MI.getNumOperands() <= 3 ||
	          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
	          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
	         "Expected last argument to be EFLAGS");
	  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
	  // In the XMM save block, save all the XMM argument registers.
	  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
	    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
	    MachineMemOperand *MMO = F->getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
	        MachineMemOperand::MOStore,
	        /*Size=*/16, /*Align=*/16);
	    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
	        .addFrameIndex(RegSaveFrameIndex)
	        .addImm(/*Scale=*/1)
	        .addReg(/*IndexReg=*/0)
	        .addImm(/*Disp=*/Offset)
	        .addReg(/*Segment=*/0)
	        .addReg(MI.getOperand(i).getReg())
	        .addMemOperand(MMO);
	  }

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return EndMBB;
	}

	// ISel may leave the EFLAGS use on SelectItr without a kill marker when
	// EFLAGS had several uses and it could not tell which one to mark. Decide
	// whether SelectItr should carry the kill marker and, if so, set it. The
	// return value is the resulting kill state.
	static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
	                                     MachineBasicBlock *BB,
	                                     const TargetRegisterInfo *TRI) {
	  // Walk the instructions after SelectItr until EFLAGS is either read (no
	  // kill marker wanted) or redefined (kill marker wanted).
	  bool Redefined = false;
	  MachineBasicBlock::iterator Cursor = std::next(SelectItr);
	  for (const MachineBasicBlock::iterator Last = BB->end();
	       Cursor != Last && !Redefined; ++Cursor) {
	    if (Cursor->readsRegister(X86::EFLAGS))
	      return false;
	    Redefined = Cursor->definesRegister(X86::EFLAGS);
	  }

	  // The block ended without a redefinition: a successor that lists EFLAGS
	  // as live-in rules out the kill marker.
	  if (!Redefined)
	    for (MachineBasicBlock *Succ : BB->successors())
	      if (Succ->isLiveIn(X86::EFLAGS))
	        return false;

	  // Either a def was found, or the end of the block was reached and EFLAGS
	  // is not live into any successor: mark the kill on SelectItr.
	  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
	  return true;
	}

	// Tell whether MI is one of the CMOV pseudo-opcodes that may be cascaded
	// with other CMOV pseudo-opcodes into a single basic block guarded by one
	// conditional jump.
	static bool isCMOVPseudo(MachineInstr &MI) {
	  switch (MI.getOpcode()) {
	  default:
	    return false;

	  // GR* and FR* register classes.
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  // RFP* register classes.
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  // Vector types with floating-point or i64 elements.
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V16F32:
	  // i1 vector types.
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    return true;
	  }
	}

	// Helper function, which inserts PHI functions into SinkMBB:
	//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
	// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
	// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
	// the last PHI function inserted.
	//
	// The range must not be empty: MIItBegin is dereferenced unconditionally to
	// obtain the debug location and the reference condition code.
	static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
	    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
	    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
	    MachineBasicBlock *SinkMBB) {
	  MachineFunction *MF = TrueMBB->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
	  DebugLoc DL = MIItBegin->getDebugLoc();

	  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

	  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one. Later CMOVs may reference the results of earlier CMOVs, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
	  MachineInstrBuilder MIB;

	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
	    unsigned DestReg = MIIt->getOperand(0).getReg();
	    unsigned Op1Reg = MIIt->getOperand(1).getReg();
	    unsigned Op2Reg = MIIt->getOperand(2).getReg();

	    // If this CMOV we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (MIIt->getOperand(3).getImm() == OppCC)
	      std::swap(Op1Reg, Op2Reg);

	    // If an operand is the result of an earlier PHI in this range, use the
	    // value that fed that PHI on the matching edge instead. A single find()
	    // per operand avoids a second lookup through operator[].
	    auto Op1It = RegRewriteTable.find(Op1Reg);
	    if (Op1It != RegRewriteTable.end())
	      Op1Reg = Op1It->second.first;

	    auto Op2It = RegRewriteTable.find(Op2Reg);
	    if (Op2It != RegRewriteTable.end())
	      Op2Reg = Op2It->second.second;

	    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
	              .addReg(Op1Reg)
	              .addMBB(FalseMBB)
	              .addReg(Op2Reg)
	              .addMBB(TrueMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
	  }

	  return MIB;
	}

	// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
	//
	// Both pseudo CMOVs are replaced by two conditional branches that target the
	// same new sink block, plus a single three-input PHI in that block. The
	// instructions following FirstCMOV and the successor edges of ThisMBB are
	// moved to the sink block, which is returned.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
	MachineInstr &SecondCascadedCMOV,
	MachineBasicBlock *ThisMBB) const {
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = FirstCMOV.getDebugLoc();

	// We lower cascaded CMOVs such as
	//
	//   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
	//
	// to two successive branches.
	//
	// Without this, we would add a PHI between the two jumps, which ends up
	// creating a few copies all around. For instance, for
	//
	//   (sitofp (zext (fcmp une)))
	//
	// we would generate:
	//
	//     ucomiss %xmm1, %xmm0
	//     movss   <1.0f>, %xmm0
	//     movaps  %xmm0, %xmm1
	//     jne     .LBB5_2
	//     xorps   %xmm1, %xmm1
	//   .LBB5_2:
	//     jp      .LBB5_4
	//     movaps  %xmm1, %xmm0
	//   .LBB5_4:
	//     retq
	//
	// because this custom-inserter would have generated:
	//
	//   A
	//   | \
	//   |  B
	//   | /
	//   C
	//   | \
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// B: empty
	// C: Z = PHI [X, A], [Y, B]
	// D: empty
	// E: PHI [X, C], [Z, D]
	//
	// If we lower both CMOVs in a single step, we can instead generate:
	//
	//   A
	//   | \
	//   |  C
	//   | /|
	//   |/ |
	//   |  |
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// D: empty
	// E: PHI [X, A], [X, C], [Y, D]
	//
	// Which, in our sitofp/fcmp example, gives us something like:
	//
	//     ucomiss %xmm1, %xmm0
	//     movss   <1.0f>, %xmm0
	//     jne     .LBB5_4
	//     jp      .LBB5_4
	//     xorps   %xmm0, %xmm0
	//   .LBB5_4:
	//     retq
	//

	// We lower cascaded CMOV into two successive branches to the same block.
	// EFLAGS is used by both, so mark it as live in the second.
	const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	MachineFunction *F = ThisMBB->getParent();
	MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	// Place the three new blocks, in order, directly after ThisMBB.
	MachineFunction::iterator It = ++ThisMBB->getIterator();
	F->insert(It, FirstInsertedMBB);
	F->insert(It, SecondInsertedMBB);
	F->insert(It, SinkMBB);

	// For a cascaded CMOV, we lower it to two successive branches to
	// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
	// the FirstInsertedMBB.
	FirstInsertedMBB->addLiveIn(X86::EFLAGS);

	// If the EFLAGS register isn't dead in the terminator, then claim that it's
	// live into the sink and copy blocks.
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
	!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
	SecondInsertedMBB->addLiveIn(X86::EFLAGS);
	SinkMBB->addLiveIn(X86::EFLAGS);
	}

	// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	std::next(MachineBasicBlock::iterator(FirstCMOV)),
	ThisMBB->end());
	SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	// Fallthrough block for ThisMBB.
	ThisMBB->addSuccessor(FirstInsertedMBB);
	// The true block target of the first branch is always SinkMBB.
	ThisMBB->addSuccessor(SinkMBB);
	// Fallthrough block for FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
	// The true block for the branch of FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SinkMBB);
	// This is fallthrough.
	SecondInsertedMBB->addSuccessor(SinkMBB);

	// Create the conditional branch instructions.
	X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
	unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
	BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

	X86::CondCode SecondCC =
	X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
	unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
	BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);

	// SinkMBB:
	//   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ],
	//                 [ %TrueValue, FirstInsertedMBB ]
	// The first two inputs are added here; the third is appended just below.
	unsigned DestReg = FirstCMOV.getOperand(0).getReg();
	unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
	unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
	MachineInstrBuilder MIB =
	BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
	.addReg(Op1Reg)
	.addMBB(SecondInsertedMBB)
	.addReg(Op2Reg)
	.addMBB(ThisMBB);

	// The edge from FirstInsertedMBB provides the same incoming value as the
	// edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
	MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
	// Copy the PHI result to the register defined by the second CMOV.
	BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
	TII->get(TargetOpcode::COPY),
	SecondCascadedCMOV.getOperand(0).getReg())
	.addReg(FirstCMOV.getOperand(0).getReg());

	// Now remove the CMOVs.
	FirstCMOV.eraseFromParent();
	SecondCascadedCMOV.eraseFromParent();

	return SinkMBB;
	}

	/// Expand a pseudo CMOV / SELECT_CC instruction into explicit control flow.
	///
	/// \param MI       the (first) pseudo select instruction to lower.
	/// \param ThisMBB  the block currently containing \p MI.
	/// \returns the sink block that receives the instructions following the
	///          lowered select(s) together with the resulting PHI node(s). When
	///          the cascaded pattern is matched, the result of
	///          EmitLoweredCascadedSelect is returned instead.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
	                                     MachineBasicBlock *ThisMBB) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT_CC instruction, we actually have to insert the
	  // diamond control-flow pattern. The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between and a branch opcode to use.

	  //  ThisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   cmpTY ccX, r1, r2
	  //   bCC copy1MBB
	  //   fallthrough --> FalseMBB

	  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
	  // as described above, by inserting a BB, and then making a PHI at the join
	  // point to select the true and false operands of the CMOV in the PHI.
	  //
	  // The code also handles two different cases of multiple CMOV opcodes
	  // in a row.
	  //
	  // Case 1:
	  // In this case, there are multiple CMOVs in a row, all which are based on
	  // the same condition setting (or the exact opposite condition setting).
	  // In this case we can lower all the CMOVs using a single inserted BB, and
	  // then make a number of PHIs at the join point to model the CMOVs. The only
	  // trickiness here, is that in a case like:
	  //
	  // t2 = CMOV cond1 t1, f1
	  // t3 = CMOV cond1 t2, f2
	  //
	  // when rewriting this into PHIs, we have to perform some renaming on the
	  // temps since you cannot have a PHI operand refer to a PHI result earlier
	  // in the same block. The "simple" but wrong lowering would be:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t2(BB1), f2(BB2)
	  //
	  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
	  // renaming is to note that on the path through BB1, t2 is really just a
	  // copy of t1, and do that renaming, properly generating:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t1(BB1), f2(BB2)
	  //
	  // Case 2:
	  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
	  // function - EmitLoweredCascadedSelect.

	  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
	  MachineInstr *LastCMOV = &MI;
	  MachineBasicBlock::iterator NextMIIt =
	      std::next(MachineBasicBlock::iterator(MI));

	  // Check for case 1, where there are multiple CMOVs with the same condition
	  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
	  // number of jumps the most.

	  if (isCMOVPseudo(MI)) {
	    // See if we have a string of CMOVS with the same condition.
	    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
	           (NextMIIt->getOperand(3).getImm() == CC ||
	            NextMIIt->getOperand(3).getImm() == OppCC)) {
	      LastCMOV = &*NextMIIt;
	      ++NextMIIt;
	    }
	  }

	  // This checks for case 2, but only do this if we didn't already find
	  // case 1, as indicated by LastCMOV == MI.
	  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
	      NextMIIt->getOpcode() == MI.getOpcode() &&
	      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
	      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
	      NextMIIt->getOperand(1).isKill()) {
	    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
	  }

	  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	  MachineFunction *F = ThisMBB->getParent();
	  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	  // Place the two new blocks, in order, directly after ThisMBB.
	  MachineFunction::iterator It = ++ThisMBB->getIterator();
	  F->insert(It, FalseMBB);
	  F->insert(It, SinkMBB);

	  // If the EFLAGS register isn't dead in the terminator, then claim that it's
	  // live into the sink and copy blocks.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
	      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
	    FalseMBB->addLiveIn(X86::EFLAGS);
	    SinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	                  std::next(MachineBasicBlock::iterator(LastCMOV)),
	                  ThisMBB->end());
	  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	  // Fallthrough block for ThisMBB.
	  ThisMBB->addSuccessor(FalseMBB);
	  // The true block target of the first (or only) branch is always a SinkMBB.
	  ThisMBB->addSuccessor(SinkMBB);
	  // Fallthrough block for FalseMBB.
	  FalseMBB->addSuccessor(SinkMBB);

	  // Create the conditional branch instruction.
	  unsigned Opc = X86::GetCondBranchFromCond(CC);
	  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

	  //  SinkMBB:
	  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
	  //  ...
	  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	  MachineBasicBlock::iterator MIItEnd =
	      std::next(MachineBasicBlock::iterator(LastCMOV));
	  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);

	  // Now remove the CMOV(s).
	  ThisMBB->erase(MIItBegin, MIItEnd);

	  return SinkMBB;
	}

	/// Expand a RELEASE_FADD{32,64}mr pseudo into an SSE add-from-memory followed
	/// by a store of the sum back to the same address. The pseudo is erased and
	/// \p BB is returned unchanged as the continuation block.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  // Combine the following atomic floating-point modification pattern:
	  //   a.store(reg OP a.load(acquire), release)
	  // Transform them into:
	  //   OPss (%gpr), %xmm
	  //   movss %xmm, (%gpr)
	  // Or sd equivalent for 64-bit operations.
	  unsigned StoreOpc, ArithOpc;
	  switch (MI.getOpcode()) {
	  case X86::RELEASE_FADD32mr:
	    ArithOpc = X86::ADDSSrm;
	    StoreOpc = X86::MOVSSmr;
	    break;
	  case X86::RELEASE_FADD64mr:
	    ArithOpc = X86::ADDSDrm;
	    StoreOpc = X86::MOVSDmr;
	    break;
	  default:
	    llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
	  }

	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

	  // The value operand directly follows the address operands of the pseudo.
	  unsigned SrcReg = MI.getOperand(X86::AddrNumOperands).getReg();
	  unsigned SumReg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));

	  // SumReg = SrcReg OP [address]
	  MachineInstrBuilder ArithMIB =
	      BuildMI(*BB, MI, DL, TII->get(ArithOpc), SumReg).addReg(SrcReg);
	  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx) {
	    MachineOperand &AddrOp = MI.getOperand(Idx);
	    // The address operands are reused by the store below, so no register
	    // among them may remain marked as killed here.
	    if (AddrOp.isReg())
	      AddrOp.setIsKill(false);
	    ArithMIB.add(AddrOp);
	  }

	  // [address] = SumReg
	  MachineInstrBuilder StoreMIB = BuildMI(*BB, MI, DL, TII->get(StoreOpc));
	  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
	    StoreMIB.add(MI.getOperand(Idx));
	  StoreMIB.addReg(SumReg, RegState::Kill);

	  // The pseudo instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand a segmented-stack alloca pseudo into explicit control flow.
	///
	/// The stack limit is compared against the prospective new stack pointer;
	/// the allocation is then satisfied either by bumping the stack pointer or by
	/// calling __morestack_allocate_stack_space, and both results are merged with
	/// a PHI into the pseudo's destination register.
	///
	/// \param MI  the pseudo instruction; operand 0 is the result register and
	///            operand 1 the size register.
	/// \param BB  the block containing \p MI.
	/// \returns the continuation block holding the rest of the original block.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();

	  assert(MF->shouldSplitStack());

	  const bool Is64Bit = Subtarget.is64Bit();
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // Segment register and offset used below as the memory operand of the
	  // stack-limit comparison.
	  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
	  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

	  // BB:
	  //  ... [Till the alloca]
	  // If stacklet is not large enough, jump to mallocMBB
	  //
	  // bumpMBB:
	  //  Allocate by subtracting from RSP
	  //  Jump to continueMBB
	  //
	  // mallocMBB:
	  //  Allocate by call to runtime
	  //
	  // continueMBB:
	  //  ...
	  //  [rest of original BB]
	  //

	  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const TargetRegisterClass *AddrRegClass =
	      getRegClassFor(getPointerTy(MF->getDataLayout()));

	  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
	           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
	           sizeVReg = MI.getOperand(1).getReg(),
	           physSPReg =
	               (IsLP64 || Subtarget.isTargetNaCl64()) ? X86::RSP : X86::ESP;

	  MachineFunction::iterator MBBIter = ++BB->getIterator();

	  MF->insert(MBBIter, bumpMBB);
	  MF->insert(MBBIter, mallocMBB);
	  MF->insert(MBBIter, continueMBB);

	  // Everything after the pseudo, and BB's successor edges, move to
	  // continueMBB.
	  continueMBB->splice(continueMBB->begin(), BB,
	                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add code to the main basic block to check if the stack limit has been hit,
	  // and if so, jump to mallocMBB otherwise to bumpMBB.
	  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
	      .addReg(tmpSPVReg).addReg(sizeVReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
	      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
	      .addReg(SPLimitVReg);
	  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

	  // bumpMBB simply decreases the stack pointer, since we know the current
	  // stacklet has enough space.
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Calls into a routine in libgcc to allocate more space from the heap.
	  const uint32_t *RegMask =
	      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
	  if (IsLP64) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::RDI, RegState::Implicit)
	        .addReg(X86::RAX, RegState::ImplicitDefine);
	  } else if (Is64Bit) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EDI, RegState::Implicit)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  } else {
	    // 32-bit: the size is passed on the stack. 12 bytes are subtracted
	    // before the 4-byte push; the matching 16-byte add follows the call.
	    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
	        .addImm(12);
	    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  }

	  if (!Is64Bit)
	    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
	        .addImm(16);

	  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
	      .addReg(IsLP64 ? X86::RAX : X86::EAX);
	  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Set up the CFG correctly.
	  BB->addSuccessor(bumpMBB);
	  BB->addSuccessor(mallocMBB);
	  mallocMBB->addSuccessor(continueMBB);
	  bumpMBB->addSuccessor(continueMBB);

	  // Take care of the PHI nodes.
	  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
	          MI.getOperand(0).getReg())
	      .addReg(mallocPtrVReg)
	      .addMBB(mallocMBB)
	      .addReg(bumpSPPtrVReg)
	      .addMBB(bumpMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return continueMBB;
	}

	/// Custom-insert a catchret pseudo. On non-32-bit subtargets nothing is
	/// changed. On 32-bit subtargets a new block containing EH_RESTORE and a jump
	/// to the original destination is inserted after \p BB, and the catchret is
	/// retargeted to that block. \p BB is returned in both cases.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  MachineFunction *ParentFn = BB->getParent();
	  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
	  MachineBasicBlock *OrigTargetMBB = MI.getOperand(0).getMBB();
	  DebugLoc Loc = MI.getDebugLoc();

	  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
	             ParentFn->getFunction().getPersonalityFn())) &&
	         "SEH does not use catchret!");

	  // Only 32-bit EH needs to worry about manually restoring stack pointers.
	  if (!Subtarget.is32Bit())
	    return BB;

	  // C++ EH creates a new target block to hold the restore code, and wires up
	  // the new block to the return destination with a normal JMP_4.
	  MachineBasicBlock *RestoreMBB =
	      ParentFn->CreateMachineBasicBlock(BB->getBasicBlock());
	  assert(BB->succ_size() == 1);
	  ParentFn->insert(std::next(BB->getIterator()), RestoreMBB);

	  // RestoreMBB takes over BB's successor edges and becomes BB's only
	  // successor and the catchret's destination.
	  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
	  BB->addSuccessor(RestoreMBB);
	  MI.getOperand(0).setMBB(RestoreMBB);

	  // RestoreMBB is still empty, so appending yields EH_RESTORE followed by
	  // the jump.
	  BuildMI(RestoreMBB, Loc, InstrInfo->get(X86::EH_RESTORE));
	  BuildMI(RestoreMBB, Loc, InstrInfo->get(X86::JMP_4)).addMBB(OrigTargetMBB);
	  return BB;
	}

	/// Custom-insert a catchpad pseudo: emit EH_RESTORE in front of it when the
	/// function uses an asynchronous EH personality on a 32-bit subtarget, then
	/// erase the pseudo. Always returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  const Constant *Personality =
	      BB->getParent()->getFunction().getPersonalityFn();
	  const bool UsesAsyncEH =
	      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

	  // Only 32-bit SEH requires special handling for catchpad.
	  if (UsesAsyncEH && Subtarget.is32Bit())
	    BuildMI(*BB, MI, MI.getDebugLoc(),
	            Subtarget.getInstrInfo()->get(X86::EH_RESTORE));

	  MI.eraseFromParent();
	  return BB;
	}

	/// Bracket a TLSADDR pseudo with call-frame setup/destroy markers. The
	/// pseudo itself is left in place; \p BB is returned.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // TLSADDR is replaced with the sequence:
	  //   adjust_stackdown -> TLSADDR -> adjust_stackup.
	  // TLSADDR is lowered into calls inside MC, so without the two markers
	  // shrink-wrapping may push the prologue/epilogue past them.
	  MachineFunction &MF = *BB->getParent();
	  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  MachineBasicBlock::iterator TLSAddrIt(MI);

	  // CALLSEQ_START goes right before the instruction.
	  unsigned FrameSetupOpc = TII.getCallFrameSetupOpcode();
	  MachineInstr *FrameSetup =
	      BuildMI(MF, DL, TII.get(FrameSetupOpc)).addImm(0).addImm(0).addImm(0);
	  BB->insert(TLSAddrIt, FrameSetup);

	  // CALLSEQ_END goes right after the instruction. The original instruction
	  // is deliberately kept, so it is not erased here.
	  unsigned FrameDestroyOpc = TII.getCallFrameDestroyOpcode();
	  MachineInstr *FrameDestroy =
	      BuildMI(MF, DL, TII.get(FrameDestroyOpc)).addImm(0).addImm(0);
	  BB->insertAfter(TLSAddrIt, FrameDestroy);

	  return BB;
	}

	/// Expand a Darwin TLS call pseudo: load the pointer referenced by the global
	/// in operand 3 into RDI (64-bit) or EAX (32-bit), then emit an indirect call
	/// through that register. The pseudo is erased and \p BB is returned.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // This is pretty easy. We're taking the value that we received from
	  // our load from the relocation, sticking it in either RDI (x86-64)
	  // or EAX and doing an indirect call. The return value will then
	  // be in the normal return register.
	  MachineFunction *F = BB->getParent();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
	  assert(MI.getOperand(3).isGlobal() && "This should be a global");

	  const bool Is64Bit = Subtarget.is64Bit();

	  // Get a register mask for the lowered call.
	  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
	  // proper register mask.
	  const uint32_t *RegMask;
	  if (Is64Bit)
	    RegMask = Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask();
	  else
	    RegMask =
	        Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);

	  // The three variants differ only in opcodes, the pointer/return registers
	  // and the base register of the load's address.
	  unsigned LoadOpc, CallOpc, PtrReg, RetReg, BaseReg;
	  if (Is64Bit) {
	    LoadOpc = X86::MOV64rm;
	    CallOpc = X86::CALL64m;
	    PtrReg = X86::RDI;
	    RetReg = X86::RAX;
	    BaseReg = X86::RIP;
	  } else {
	    LoadOpc = X86::MOV32rm;
	    CallOpc = X86::CALL32m;
	    PtrReg = X86::EAX;
	    RetReg = X86::EAX;
	    // The global base register is only requested for position-independent
	    // code; otherwise there is no base register.
	    BaseReg = !isPositionIndependent() ? 0 : TII->getGlobalBaseReg(F);
	  }

	  const MachineOperand &GlobalOp = MI.getOperand(3);
	  BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
	      .addReg(BaseReg)
	      .addImm(0)
	      .addReg(0)
	      .addGlobalAddress(GlobalOp.getGlobal(), 0, GlobalOp.getTargetFlags())
	      .addReg(0);

	  MachineInstrBuilder CallMIB = BuildMI(*BB, MI, DL, TII->get(CallOpc));
	  addDirectMem(CallMIB, PtrReg);
	  CallMIB.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);

	  // The pseudo instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand the SjLj setjmp pseudo into explicit control flow.
	///
	/// The address of a new restore block is stored into the jump buffer at
	/// LabelOffset, an EH_SjLj_Setup is emitted, and the result register is
	/// defined by a PHI that yields 0 on the direct path and 1 when control
	/// arrives through the restore block.
	///
	/// \param MI   the pseudo; operand 0 is the result register and the following
	///             operands form the jump-buffer address.
	/// \param MBB  the block containing \p MI.
	/// \returns the sink block holding the PHI and the rest of the original block.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  unsigned DstReg;
	  unsigned MemOpndSlot = 0;

	  unsigned CurOp = 0;

	  DstReg = MI.getOperand(CurOp++).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  (void)TRI;
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

	  // The address operands start right after the destination register.
	  MemOpndSlot = CurOp;

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
	  //  SjLjSetup restoreMBB
	  //
	  // mainMBB:
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //
	  // restoreMBB:
	  //  if base pointer being used, load it from frame
	  //  v_restore = 1

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);
	  MF->push_back(restoreMBB);
	  restoreMBB->setHasAddressTaken();

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // thisMBB:
	  unsigned PtrStoreOpc = 0;
	  unsigned LabelReg = 0;
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  // Prepare IP either in reg or imm.
	  if (!UseImmLabel) {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
	    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	    LabelReg = MRI.createVirtualRegister(PtrRC);
	    if (Subtarget.is64Bit()) {
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
	                .addReg(X86::RIP)
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB)
	                .addReg(0);
	    } else {
	      // getGlobalBaseReg is called through the X86-specific instruction
	      // info, hence the downcast.
	      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
	                .addReg(XII->getGlobalBaseReg(MF))
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
	                .addReg(0);
	    }
	  } else {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  }
	  // Store IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(MemOpndSlot + i));
	  }
	  if (!UseImmLabel)
	    MIB.addReg(LabelReg);
	  else
	    MIB.addMBB(restoreMBB);
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
	            .addMBB(restoreMBB);

	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  MIB.addRegMask(RegInfo->getNoPreservedMask());
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(restoreMBB);

	  // mainMBB:
	  //  EAX = 0
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(restoreDstReg).addMBB(restoreMBB);

	  // restoreMBB:
	  if (RegInfo->hasBasePointer(*MF)) {
	    const bool Uses64BitFramePtr =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);
	    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
	    unsigned BasePtr = RegInfo->getBaseRegister();
	    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
	                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
	        .setMIFlag(MachineInstr::FrameSetup);
	  }
	  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
	  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  restoreMBB->addSuccessor(sinkMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	/// Expand the SjLj longjmp pseudo in place.
	///
	/// Three pointer-sized loads from the buffer addressed by the pseudo's
	/// operands reload the frame pointer (offset 0), the resume address (slot 1)
	/// and the stack pointer (slot 2), followed by an indirect jump to the resume
	/// address. The pseudo is erased and \p MBB is returned.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const TargetRegisterClass *RC =
	      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	  unsigned Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
	  unsigned SP = RegInfo->getStackRegister();

	  MachineInstrBuilder MIB;

	  // Displacements, relative to the buffer address, of the resume address and
	  // of the saved stack pointer.
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset = 2 * PVT.getStoreSize();

	  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

	  // Reload FP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Reload IP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(i));
	  }
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Reload SP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), SPOffset);
	    else
	      MIB.add(MI.getOperand(i));
	  }
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Jump
	  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

	  MI.eraseFromParent();
	  return MBB;
	}

	/// Emit, before \p MI in \p MBB, a store of the address of \p DispatchBB into
	/// the frame object \p FI at offset 56 (64-bit subtarget) or 36 (otherwise).
	///
	/// With the small code model and non-position-independent code the block
	/// address is stored as an immediate; otherwise it is first materialized into
	/// a virtual register with an LEA.
	void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

	  unsigned Op = 0;
	  unsigned VR = 0;

	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  if (UseImmLabel) {
	    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  } else {
	    const TargetRegisterClass *TRC =
	        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	    VR = MRI->createVirtualRegister(TRC);
	    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

	    if (Subtarget.is64Bit())
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
	          .addReg(X86::RIP)
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB)
	          .addReg(0);
	    else
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
	          .addReg(0) /* TII->getGlobalBaseReg(MF) */
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
	          .addReg(0);
	  }

	  // Store the dispatch block address (immediate or register) into the frame
	  // slot.
	  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
	  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
	  if (UseImmLabel)
	    MIB.addMBB(DispatchBB);
	  else
	    MIB.addReg(VR);
	}

	MachineBasicBlock *
	X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	MachineBasicBlock *BB) const {
	DebugLoc DL = MI.getDebugLoc();
	MachineFunction *MF = BB->getParent();
	MachineFrameInfo &MFI = MF->getFrameInfo();
	MachineRegisterInfo *MRI = &MF->getRegInfo();
	const X86InstrInfo *TII = Subtarget.getInstrInfo();
	int FI = MFI.getFunctionContextIndex();

	// Get a mapping of the call site numbers to all of the landing pads they're
	// associated with.
	DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
	unsigned MaxCSNum = 0;
	for (auto &MBB : *MF) {
	if (!MBB.isEHPad())
	continue;

	MCSymbol *Sym = nullptr;
	for (const auto &MI : MBB) {
	if (MI.isDebugValue())
	continue;

	assert(MI.isEHLabel() && "expected EH_LABEL");
	Sym = MI.getOperand(0).getMCSymbol();
	break;
	}

	if (!MF->hasCallSiteLandingPad(Sym))
	continue;

	for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
	CallSiteNumToLPad[CSI].push_back(&MBB);
	MaxCSNum = std::max(MaxCSNum, CSI);
	}
	}

	// Get an ordered list of the machine basic blocks for the jump table.
	std::vector<MachineBasicBlock *> LPadList;
	SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
	LPadList.reserve(CallSiteNumToLPad.size());

	for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
	for (auto &LP : CallSiteNumToLPad[CSI]) {
	LPadList.push_back(LP);
	InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
	}
	}

	assert(!LPadList.empty() &&
	"No landing pad destinations for the dispatch jump table!");

	// Create the MBBs for the dispatch code.

	// Shove the dispatch's address into the return slot in the function context.
	MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	DispatchBB->setIsEHPad(true);

	MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	BuildMI(TrapBB, DL, TII->get(X86::TRAP));
	DispatchBB->addSuccessor(TrapBB);

	MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	DispatchBB->addSuccessor(DispContBB);

	// Insert MBBs.
	MF->push_back(DispatchBB);
	MF->push_back(DispContBB);
	MF->push_back(TrapBB);

	// Insert code into the entry block that creates and registers the function
	// context.
	SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

	// Create the jump table and associated information
	unsigned JTE = getJumpTableEncoding();
	MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
	unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	const X86RegisterInfo &RI = TII->getRegisterInfo();
	// Add a register mask with no preserved registers. This results in all
	// registers being marked as clobbered.
	if (RI.hasBasePointer(*MF)) {
	const bool FPIs64Bit =
	Subtarget.isTarget64BitLP64() \|\| Subtarget.isTargetNaCl64();
	X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
	MFI->setRestoreBasePointer(MF);

	unsigned FP = RI.getFrameRegister(*MF);
	unsigned BP = RI.getBaseRegister();
	unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
	addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
	MFI->getRestoreBasePointerOffset())
	.addRegMask(RI.getNoPreservedMask());
	} else {
	BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
	.addRegMask(RI.getNoPreservedMask());
	}

	// IReg is used as an index in a memory operand and therefore can't be SP
	unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
	addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
	Subtarget.is64Bit() ? 8 : 4);
	BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
	.addReg(IReg)
	.addImm(LPadList.size());
	BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);

	if (Subtarget.is64Bit()) {
	unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
	unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

	// leaq .LJTI0_0(%rip), BReg
	BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
	.addReg(X86::RIP)
	.addImm(1)
	.addReg(0)
	.addJumpTableIndex(MJTI)
	.addReg(0);
	// movzx IReg64, IReg
	BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
	.addImm(0)
	.addReg(IReg)
	.addImm(X86::sub_32bit);

	switch (JTE) {
	case MachineJumpTableInfo::EK_BlockAddress:
	// jmpq *(BReg,IReg64,8)
	BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
	.addReg(BReg)
	.addImm(8)
	.addReg(IReg64)
	.addImm(0)
	.addReg(0);
	break;
	case MachineJumpTableInfo::EK_LabelDifference32: {
	unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
	unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

	// movl (BReg,IReg64,4), OReg
	BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
	.addReg(BReg)
	.addImm(4)
	.addReg(IReg64)
	.addImm(0)
	.addReg(0);
	// movsx OReg64, OReg
	BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
	// addq BReg, OReg64, TReg
	BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
	.addReg(OReg64)
	.addReg(BReg);
	// jmpq *TReg
	BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
	break;
	}
	default:
	llvm_unreachable("Unexpected jump table encoding");
	}
	} else {
	// jmpl *.LJTI0_0(,IReg,4)
	BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
	.addReg(0)
	.addImm(4)
	.addReg(IReg)
	.addJumpTableIndex(MJTI)
	.addReg(0);
	}

	// Add the jump table entries as successors to the MBB.
	SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
	for (auto &LP : LPadList)
	if (SeenMBBs.insert(LP).second)
	DispContBB->addSuccessor(LP);

	// N.B. the order the invoke BBs are processed in doesn't matter here.
	SmallVector<MachineBasicBlock *, 64> MBBLPads;
	const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
	for (MachineBasicBlock *MBB : InvokeBBs) {
	// Remove the landing pad successor from the invoke block and replace it
	// with the new dispatch block.
	// Keep a copy of Successors since it's modified inside the loop.
	SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
	MBB->succ_rend());
	// FIXME: Avoid quadratic complexity.
	for (auto MBBS : Successors) {
	if (MBBS->isEHPad()) {
	MBB->removeSuccessor(MBBS);
	MBBLPads.push_back(MBBS);
	}
	}

	MBB->addSuccessor(DispatchBB);

	// Find the invoke call and mark all of the callee-saved registers as
	// 'implicit defined' so that they're spilled. This prevents code from
	// moving instructions to before the EH block, where they will never be
	// executed.
	for (auto &II : reverse(*MBB)) {
	if (!II.isCall())
	continue;

	DenseMap<unsigned, bool> DefRegs;
	for (auto &MOp : II.operands())
	if (MOp.isReg())
	DefRegs[MOp.getReg()] = true;

	MachineInstrBuilder MIB(*MF, &II);
	for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
	unsigned Reg = SavedRegs[RI];
	if (!DefRegs[Reg])
	MIB.addReg(Reg, RegState::ImplicitDefine \| RegState::Dead);
	}

	break;
	}
	}

	// Mark all former landing pads as non-landing pads. The dispatch is the only
	// landing pad now.
	for (auto &LP : MBBLPads)
	LP->setIsEHPad(false);

	// The instruction is gone now.
	MI.eraseFromParent();
	return BB;
	}

	/// Expand the custom-inserted pseudo instruction \p MI that lives in \p BB.
	///
	/// Dispatches on MI.getOpcode(): most opcodes are forwarded to a dedicated
	/// Emit*/emit* helper, a few (RDFLAGS/WRFLAGS, FP*_TO_INT*_IN_MEM,
	/// LCMPXCHG8B, LCMPXCHG*_SAVE_*BX) are handled inline, and some are
	/// deliberately left untouched (TCRETURN*, PATCHABLE_EVENT_CALL,
	/// LCMPXCHG16B). Any opcode that is not listed hits llvm_unreachable.
	///
	/// \param MI the pseudo instruction to expand; erased by the inline
	///           RDFLAGS/WRFLAGS and FP-to-int expansions.
	/// \param BB the basic block containing \p MI.
	/// \returns \p BB for the cases handled inline, otherwise whatever the
	///          selected helper returns.
	MachineBasicBlock *
	X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("Unexpected instr type to insert");
	  case X86::TAILJMPd64:
	  case X86::TAILJMPr64:
	  case X86::TAILJMPm64:
	  case X86::TAILJMPr64_REX:
	  case X86::TAILJMPm64_REX:
	    llvm_unreachable("TAILJMP64 would not be touched here.");
	  case X86::TCRETURNdi64:
	  case X86::TCRETURNri64:
	  case X86::TCRETURNmi64:
	    return BB;
	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return EmitLoweredTLSAddr(MI, BB);
	  case X86::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);
	  case X86::CATCHPAD:
	    return EmitLoweredCatchPad(MI, BB);
	  case X86::SEG_ALLOCA_32:
	  case X86::SEG_ALLOCA_64:
	    return EmitLoweredSegAlloca(MI, BB);
	  case X86::TLSCall_32:
	  case X86::TLSCall_64:
	    return EmitLoweredTLSCall(MI, BB);
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR128:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V16F32:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    return EmitLoweredSelect(MI, BB);

	  case X86::RDFLAGS32:
	  case X86::RDFLAGS64: {
	    unsigned PushF =
	        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
	    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
	    // Keep a pointer to the freshly built PUSHF so that one of its operands
	    // can be adjusted below; the builder inserts it into *BB before MI.
	    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
	    // Permit reads of the FLAGS register without it being defined.
	    // This intrinsic exists to read external processor state in flags, such as
	    // the trap flag, interrupt flag, and direction flag, none of which are
	    // modeled by the backend.
	    Push->getOperand(2).setIsUndef();
	    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::WRFLAGS32:
	  case X86::WRFLAGS64: {
	    unsigned Push =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
	    unsigned PopF =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
	    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
	    BuildMI(*BB, MI, DL, TII->get(PopF));

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::RELEASE_FADD32mr:
	  case X86::RELEASE_FADD64mr:
	    return EmitLoweredAtomicFP(MI, BB);

	  case X86::FP32_TO_INT16_IN_MEM:
	  case X86::FP32_TO_INT32_IN_MEM:
	  case X86::FP32_TO_INT64_IN_MEM:
	  case X86::FP64_TO_INT16_IN_MEM:
	  case X86::FP64_TO_INT32_IN_MEM:
	  case X86::FP64_TO_INT64_IN_MEM:
	  case X86::FP80_TO_INT16_IN_MEM:
	  case X86::FP80_TO_INT32_IN_MEM:
	  case X86::FP80_TO_INT64_IN_MEM: {
	    // Change the floating point control register to use "round towards zero"
	    // mode when truncating to an integer value.
	    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

	    // Load the old value of the high byte of the control word...
	    unsigned OldCW =
	        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
	                      CWFrameIdx);

	    // Set the high part to be round to zero...
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
	        .addImm(0xC7F);

	    // Reload the modified control word now...
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    // Restore the memory image of control word to original value
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
	        .addReg(OldCW);

	    // Get the X86 opcode to use.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("illegal opcode!");
	    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
	    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
	    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
	    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
	    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
	    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
	    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
	    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
	    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
	    }

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
	        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

	    // Reload the original control word now.
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }
	  // String/text processing lowering.
	  case X86::PCMPISTRM128REG:
	  case X86::VPCMPISTRM128REG:
	  case X86::PCMPISTRM128MEM:
	  case X86::VPCMPISTRM128MEM:
	  case X86::PCMPESTRM128REG:
	  case X86::VPCMPESTRM128REG:
	  case X86::PCMPESTRM128MEM:
	  case X86::VPCMPESTRM128MEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

	  // String/text processing lowering.
	  case X86::PCMPISTRIREG:
	  case X86::VPCMPISTRIREG:
	  case X86::PCMPISTRIMEM:
	  case X86::VPCMPISTRIMEM:
	  case X86::PCMPESTRIREG:
	  case X86::VPCMPESTRIREG:
	  case X86::PCMPESTRIMEM:
	  case X86::VPCMPESTRIMEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

	  // Thread synchronization.
	  case X86::MONITOR:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
	  case X86::MONITORX:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

	  // Cache line zero
	  case X86::CLZERO:
	    return emitClzero(&MI, BB, Subtarget);

	  // PKU feature
	  case X86::WRPKRU:
	    return emitWRPKRU(MI, BB, Subtarget);
	  case X86::RDPKRU:
	    return emitRDPKRU(MI, BB, Subtarget);
	  // xbegin
	  case X86::XBEGIN:
	    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

	  case X86::VASTART_SAVE_XMM_REGS:
	    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

	  case X86::VAARG_64:
	    return EmitVAARG64WithCustomInserter(MI, BB);

	  case X86::EH_SjLj_SetJmp32:
	  case X86::EH_SjLj_SetJmp64:
	    return emitEHSjLjSetJmp(MI, BB);

	  case X86::EH_SjLj_LongJmp32:
	  case X86::EH_SjLj_LongJmp64:
	    return emitEHSjLjLongJmp(MI, BB);

	  case X86::Int_eh_sjlj_setup_dispatch:
	    return EmitSjLjDispatchBlock(MI, BB);

	  case TargetOpcode::STATEPOINT:
	    // As an implementation detail, STATEPOINT shares the STACKMAP format at
	    // this point in the process.  We diverge later.
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    // Do nothing here, handle in xray instrumentation pass.
	    return BB;

	  case X86::LCMPXCHG8B: {
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
	    // requires a memory operand. If it happens that current architecture is
	    // i686 and for current function we need a base pointer
	    // - which is ESI for i686 - register allocator would not be able to
	    // allocate registers for an address in form of X(%reg, %reg, Y)
	    // - there never would be enough unreserved registers during regalloc
	    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
	    // We are giving a hand to register allocator by precomputing the address in
	    // a new vreg using LEA.

	    // If it is not i686 or there is no base pointer - nothing to do here.
	    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
	      return BB;

	    // Even though this code does not necessarily needs the base pointer to
	    // be ESI, we check for that. The reason: if this assert fails, there are
	    // some changes happened in the compiler base pointer handling, which most
	    // probably have to be addressed somehow here.
	    assert(TRI->getBaseRegister() == X86::ESI &&
	           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
	           "base pointer in mind");

	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    MVT SPTy = getPointerTy(MF->getDataLayout());
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    // Regalloc does not need any help when the memory operand of CMPXCHG8B
	    // does not use index register.
	    if (AM.IndexReg == X86::NoRegister)
	      return BB;

	    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
	    // four operand definitions that are E[ABCD] registers. We skip them and
	    // then insert the LEA.
	    MachineBasicBlock::iterator MBBI(MI);
	    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
	           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
	      --MBBI;
	    // Insert the LEA before the instruction MBBI stopped on, so that it
	    // precedes the E[ABCD] definitions skipped above.
	    addFullAddress(
	        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

	    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

	    return BB;
	  }
	  case X86::LCMPXCHG16B:
	    return BB;
	  case X86::LCMPXCHG8B_SAVE_EBX:
	  case X86::LCMPXCHG16B_SAVE_RBX: {
	    unsigned BasePtr =
	        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	    if (!BB->isLiveIn(BasePtr))
	      BB->addLiveIn(BasePtr);
	    return BB;
	  }
	  }
	}

	//===----------------------------------------------------------------------===//
	// X86 Optimization Hooks
	//===----------------------------------------------------------------------===//

	void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	unsigned BitWidth = Known.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();
	assert((Opc >= ISD::BUILTIN_OP_END \|\|
	Opc == ISD::INTRINSIC_WO_CHAIN \|\|
	Opc == ISD::INTRINSIC_W_CHAIN \|\|
	Opc == ISD::INTRINSIC_VOID) &&
	"Should use MaskedValueIsZero if you don't know whether Op"
	" is a target node!");

	Known.resetAll();
	switch (Opc) {
	default: break;
	case X86ISD::SETCC:
	Known.Zero.setBitsFrom(1);
	break;
	case X86ISD::MOVMSK: {
	unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
	Known.Zero.setBitsFrom(NumLoBits);
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
	Op.getConstantOperandVal(1));
	DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
	Known = Known.zextOrTrunc(BitWidth);
	Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
	break;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
	Known.setAllZero();
	break;
	}

	DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	unsigned ShAmt = ShiftImm->getZExtValue();
	if (Opc == X86ISD::VSHLI) {
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;
	// Low bits are known zero.
	Known.Zero.setLowBits(ShAmt);
	} else {
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);
	// High bits are known zero.
	Known.Zero.setHighBits(ShAmt);
	}
	}
	break;
	}
	case X86ISD::VZEXT: {
	// TODO: Add DemandedElts support.
	SDValue N0 = Op.getOperand(0);
	unsigned NumElts = VT.getVectorNumElements();

	EVT SrcVT = N0.getValueType();
	unsigned InNumElts = SrcVT.getVectorNumElements();
	unsigned InBitWidth = SrcVT.getScalarSizeInBits();
	assert(InNumElts >= NumElts && "Illegal VZEXT input");

	Known = KnownBits(InBitWidth);
	APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
	DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
	Known = Known.zext(BitWidth);
	Known.Zero.setBitsFrom(InBitWidth);
	break;
	}
	case X86ISD::CMOV: {
	DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	KnownBits Known2;
	DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	}
	case X86ISD::UDIVREM8_ZEXT_HREG:
	// TODO: Support more than just the zero extended bits?
	if (Op.getResNo() != 1)
	break;
	// The remainder is zero extended.
	Known.Zero.setBitsFrom(8);
	break;
	}
	}

	/// Return how many of the top bits of each element of the target node
	/// \p Op are known to be copies of the sign bit.
	///
	/// Opcodes without a case below (and SDIVREM8_SEXT_HREG results other than
	/// result 1) fall back to the answer 1.
	///
	/// \param Op           the target node to analyse.
	/// \param DemandedElts forwarded to the source query for VSHLI/VSRAI only.
	/// \param DAG          used to query sign bits of the operands.
	/// \param Depth        current recursion depth; operand queries use Depth+1.
	unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  const unsigned EltBits = Op.getScalarValueSizeInBits();

	  switch (Op.getOpcode()) {
	  // SETCC_CARRY sets the dest to ~0 for true or 0 for false, and vector
	  // compares return zero/all-bits result values: every bit is a sign bit.
	  case X86ISD::SETCC_CARRY:
	  case X86ISD::PCMPGT:
	  case X86ISD::PCMPEQ:
	  case X86ISD::CMPP:
	  case X86ISD::VPCOM:
	  case X86ISD::VPCOMU:
	    return EltBits;

	  case X86ISD::VSEXT: {
	    // TODO: Add DemandedElts support.
	    SDValue In = Op.getOperand(0);
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, Depth + 1);
	    // Each bit added by the extension is one more sign bit.
	    return InSignBits + (EltBits - In.getScalarValueSizeInBits());
	  }

	  case X86ISD::VTRUNC: {
	    // TODO: Add DemandedElts support.
	    SDValue In = Op.getOperand(0);
	    unsigned InBits = In.getScalarValueSizeInBits();
	    assert(EltBits < InBits && "Illegal truncation input type");
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, Depth + 1);
	    unsigned DroppedBits = InBits - EltBits;
	    return InSignBits > DroppedBits ? InSignBits - DroppedBits : 1;
	  }

	  case X86ISD::PACKSS: {
	    // PACKSS is just a truncation if the sign bits extend to the packed size.
	    // TODO: Add DemandedElts support.
	    unsigned InBits = Op.getOperand(0).getScalarValueSizeInBits();
	    unsigned LHSSignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    unsigned RHSSignBits = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	    unsigned CommonSignBits = std::min(LHSSignBits, RHSSignBits);
	    unsigned DroppedBits = InBits - EltBits;
	    return CommonSignBits > DroppedBits ? CommonSignBits - DroppedBits : 1;
	  }

	  case X86ISD::VSHLI: {
	    SDValue In = Op.getOperand(0);
	    APInt Amount = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    // Shifted all bits out --> zero.
	    if (Amount.uge(EltBits))
	      return EltBits;
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, DemandedElts, Depth + 1);
	    // Shifted all sign bits out --> unknown.
	    if (Amount.uge(InSignBits))
	      return 1;
	    return InSignBits - Amount.getZExtValue();
	  }

	  case X86ISD::VSRAI: {
	    SDValue In = Op.getOperand(0);
	    APInt Amount = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    // Sign splat.
	    if (Amount.uge(EltBits - 1))
	      return EltBits;
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, DemandedElts, Depth + 1);
	    // The shift adds to the existing sign bits, capped at the element width.
	    Amount += InSignBits;
	    if (Amount.uge(EltBits))
	      return EltBits;
	    return Amount.getZExtValue();
	  }

	  case X86ISD::CMOV: {
	    unsigned FirstSignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
	    // Early out: the second operand cannot improve on a result of 1.
	    if (FirstSignBits == 1)
	      return 1;
	    unsigned SecondSignBits = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
	    return std::min(FirstSignBits, SecondSignBits);
	  }

	  case X86ISD::SDIVREM8_SEXT_HREG:
	    // TODO: Support more than just the sign extended bits?
	    // The remainder (result 1) is sign extended.
	    if (Op.getResNo() == 1)
	      return EltBits - 7;
	    break;
	  }

	  // Fallback case.
	  return 1;
	}

	/// Strip an X86ISD::Wrapper or X86ISD::WrapperRIP node from \p N.
	///
	/// \returns operand 0 of \p N when it is one of the two wrapper opcodes,
	///          otherwise \p N itself.
	SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
	  const unsigned Opc = N->getOpcode();
	  if (Opc == X86ISD::Wrapper || Opc == X86ISD::WrapperRIP)
	    return N->getOperand(0);
	  return N;
	}

	/// Returns true (and the GlobalValue and the offset) if the node is a
	/// GlobalAddress + offset.
	///
	/// An X86ISD::Wrapper whose operand 0 is a GlobalAddressSDNode is decoded
	/// here; every other node is delegated to TargetLowering::isGAPlusOffset.
	bool X86TargetLowering::isGAPlusOffset(SDNode *N,
	                                       const GlobalValue* &GA,
	                                       int64_t &Offset) const {
	  if (N->getOpcode() == X86ISD::Wrapper)
	    if (auto *WrappedGA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
	      GA = WrappedGA->getGlobal();
	      Offset = WrappedGA->getOffset();
	      return true;
	    }

	  return TargetLowering::isGAPlusOffset(N, GA, Offset);
	}

	// Attempt to match a combined shuffle mask against supported unary shuffle
	// instructions.
	//
	// On success returns true and sets Shuffle to the matched opcode and
	// SrcVT/DstVT to the operand/result types it should be built with. In the
	// VZEXT case V1 is additionally replaced by its low SrcSize-bit subvector.
	// Returns false when no pattern matches.
	//
	// Patterns are tried in this order: ZERO_EXTEND_VECTOR_INREG/VZEXT,
	// VZEXT_MOVL, MOVDDUP/MOVSLDUP/MOVSHDUP (128/256/512-bit), VBROADCAST.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                    bool AllowFloatDomain, bool AllowIntDomain,
	                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

	  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
	  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
	    unsigned MaxScale = 64 / MaskEltSize;
	    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
	      // Each group of Scale mask elements must be source element i followed
	      // by Scale-1 zero/undef elements.
	      bool Match = true;
	      unsigned NumDstElts = NumMaskElts / Scale;
	      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
	        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
	        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
	      }
	      if (Match) {
	        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
	        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
	                                            MVT::getIntegerVT(MaskEltSize);
	        SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

	        // A source narrower than the mask type needs VZEXT on an extracted
	        // subvector; otherwise the in-register extension is used.
	        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
	          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
	          Shuffle = unsigned(X86ISD::VZEXT);
	        } else
	          Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);

	        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
	        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
	        return true;
	      }
	    }
	  }

	  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
	  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
	      isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
	    Shuffle = X86ISD::VZEXT_MOVL;
	    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
	    return true;
	  }

	  // Check if we have SSE3 which will let us use MOVDDUP etc. The
	  // instructions are no slower than UNPCKLPD but has the option to
	  // fold the input operand into even an unaligned memory load.
	  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
	    if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v2f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	  }

	  if (MaskVT.is256BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v4f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v8f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v8f32;
	      return true;
	    }
	  }

	  if (MaskVT.is512BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX512() &&
	           "AVX512 required for 512-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v8f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(
	            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v16f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(
	            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v16f32;
	      return true;
	    }
	  }

	  // Attempt to match against broadcast-from-vector.
	  if (Subtarget.hasAVX2()) {
	    // An all-zero index mask selects element 0 for every lane.
	    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
	    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
	      SrcVT = DstVT = MaskVT;
	      Shuffle = X86ISD::VBROADCAST;
	      return true;
	    }
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported unary immediate
	// permute instructions.
	//
	// On success returns true and sets Shuffle to the matched opcode, ShuffleVT
	// to the type it should be built with and PermuteImm to its immediate
	// operand. Returns false when no pattern matches.
	//
	// Patterns are tried in this order: VPERMI/VPERMILPI for 64-bit elements,
	// PSHUFD/VPERMILPI for lane-repeated 32/64-bit elements, PSHUFLW/PSHUFHW for
	// lane-repeated 16-bit elements, then byte/bit shifts.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           bool AllowFloatDomain,
	                                           bool AllowIntDomain,
	                                           const X86Subtarget &Subtarget,
	                                           unsigned &Shuffle, MVT &ShuffleVT,
	                                           unsigned &PermuteImm) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned InputSizeInBits = MaskVT.getSizeInBits();
	  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
	  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

	  bool ContainsZeros =
	      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
	  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
	    // Check for lane crossing permutes.
	    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
	      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
	      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
	        Shuffle = X86ISD::VPERMI;
	        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
	        PermuteImm = getV4X86ShuffleImm(Mask);
	        return true;
	      }
	      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
	        SmallVector<int, 4> RepeatedMask;
	        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
	          Shuffle = X86ISD::VPERMI;
	          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
	          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
	          return true;
	        }
	      }
	    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
	      // VPERMILPD can permute with a non-repeating shuffle.
	      Shuffle = X86ISD::VPERMILPI;
	      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
	      PermuteImm = 0;
	      // One immediate bit per element: the low bit of its mask index.
	      for (int i = 0, e = Mask.size(); i != e; ++i) {
	        int M = Mask[i];
	        if (M == SM_SentinelUndef)
	          continue;
	        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
	        PermuteImm |= (M & 1) << i;
	      }
	      return true;
	    }
	  }

	  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
	  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
	  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
	  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
	      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Narrow the repeated mask to create 32-bit element permutes.
	      SmallVector<int, 4> WordMask = RepeatedMask;
	      if (MaskScalarSizeInBits == 64)
	        scaleShuffleMask<int>(2, RepeatedMask, WordMask);

	      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
	      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
	      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
	      PermuteImm = getV4X86ShuffleImm(WordMask);
	      return true;
	    }
	  }

	  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
	  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Build the immediate from the per-lane repeated mask rather than from
	      // the first lane of Mask: the immediate applies to every 128-bit lane,
	      // and an element left undef in the first lane may still be defined
	      // (and therefore constrained) in a later lane.
	      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
	      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

	      // PSHUFLW: permute lower 4 elements only.
	      if (isUndefOrInRange(LoMask, 0, 4) &&
	          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	        Shuffle = X86ISD::PSHUFLW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(LoMask);
	        return true;
	      }

	      // PSHUFHW: permute upper 4 elements only.
	      if (isUndefOrInRange(HiMask, 4, 8) &&
	          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	        // Offset the HiMask so that we can create the shuffle immediate.
	        int OffsetHiMask[4];
	        for (int i = 0; i != 4; ++i)
	          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

	        Shuffle = X86ISD::PSHUFHW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
	        return true;
	      }
	    }
	  }

	  // Attempt to match against byte/bit shifts.
	  // FIXME: Add 512-bit support.
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
	                                             MaskScalarSizeInBits, Mask,
	                                             0, Zeroable, Subtarget);
	    if (0 < ShiftAmt) {
	      PermuteImm = (unsigned)ShiftAmt;
	      return true;
	    }
	  }

	  return false;
	}

	// Attempt to match a combined unary shuffle mask against supported binary
	// shuffle instructions.
	//
	// On success returns true and sets Shuffle to the matched opcode and
	// SrcVT/DstVT to the operand/result types it should be built with. V1/V2
	// may be rewritten: MOVLHPS/MOVHLPS use V1 for both inputs, MOVSD swaps
	// them, and the PACK/UNPCK matchers receive them by reference. Returns
	// false when no pattern matches.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                     bool AllowFloatDomain, bool AllowIntDomain,
	                                     SDValue &V1, SDValue &V2, SDLoc &DL,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget,
	                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
	                                     bool IsUnary) {
	  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

	  if (MaskVT.is128BitVector()) {
	    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
	      V2 = V1;
	      Shuffle = X86ISD::MOVLHPS;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
	      V2 = V1;
	      Shuffle = X86ISD::MOVHLPS;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
	        (AllowFloatDomain || !Subtarget.hasSSE41())) {
	      std::swap(V1, V2);
	      Shuffle = X86ISD::MOVSD;
	      SrcVT = DstVT = MaskVT;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
	        (AllowFloatDomain || !Subtarget.hasSSE41())) {
	      Shuffle = X86ISD::MOVSS;
	      SrcVT = DstVT = MaskVT;
	      return true;
	    }
	  }

	  // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
	  // TODO add support for 256/512-bit types.
	  if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
	    if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
	                                   Subtarget)) {
	      DstVT = MaskVT;
	      return true;
	    }
	  }

	  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
	  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
	      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
	      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
	    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
	                                    DAG, Subtarget)) {
	      SrcVT = DstVT = MaskVT;
	      // Without AVX2 a 256-bit match is emitted with a floating-point type
	      // of the same element width.
	      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
	        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
	      return true;
	    }
	  }

	  return false;
	}

	// Attempt to match a two-input shuffle mask against a single shuffle
	// instruction that takes an immediate control operand.
	//
	// The candidates are tried in this order: PALIGNR, BLENDI, INSERTPS, SHUFPD
	// and SHUFPS (the last two are both emitted as X86ISD::SHUFP).
	// AllowFloatDomain / AllowIntDomain gate which candidates may be considered.
	//
	// On success returns true with Shuffle set to the matched opcode, ShuffleVT
	// set to the type the node should be built with and PermuteImm set to its
	// immediate. V1 and V2 are taken by reference and may be replaced on a match
	// (e.g. by zero or undef vectors). Returns false if nothing matched.
	// NOTE(review): the match helpers that receive V1/V2 may also update them;
	// confirm against their definitions before relying on V1/V2 after a failure.
	static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	const APInt &Zeroable,
	bool AllowFloatDomain,
	bool AllowIntDomain,
	SDValue &V1, SDValue &V2, SDLoc &DL,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	unsigned &Shuffle, MVT &ShuffleVT,
	unsigned &PermuteImm) {
	unsigned NumMaskElts = Mask.size();
	unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

	// Attempt to match against PALIGNR byte rotate.
	if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
	if (0 < ByteRotation) {
	Shuffle = X86ISD::PALIGNR;
	// PALIGNR is built on a byte vector of the same total width as MaskVT.
	ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
	PermuteImm = ByteRotation;
	return true;
	}
	}

	// Attempt to combine to X86ISD::BLENDI.
	if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) \|\|
	(Subtarget.hasAVX() && MaskVT.is256BitVector()))) \|\|
	(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
	uint64_t BlendMask = 0;
	bool ForceV1Zero = false, ForceV2Zero = false;
	// Work on a private copy of the mask so the helpers below never touch the
	// caller's (read-only) Mask.
	SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
	if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
	BlendMask)) {
	if (MaskVT == MVT::v16i16) {
	// We can only use v16i16 PBLENDW if the lanes are repeated.
	SmallVector<int, 8> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
	RepeatedMask)) {
	assert(RepeatedMask.size() == 8 &&
	"Repeated mask size doesn't match!");
	// Build the 8-bit immediate from the repeated lane mask: bit i is set
	// when element i has an index >= 8 (presumably meaning it is taken
	// from the second input - confirm against the mask convention).
	PermuteImm = 0;
	for (int i = 0; i < 8; ++i)
	if (RepeatedMask[i] >= 8)
	PermuteImm \|= 1 << i;
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	Shuffle = X86ISD::BLENDI;
	ShuffleVT = MaskVT;
	return true;
	}
	} else {
	// Determine a type compatible with X86ISD::BLENDI.
	ShuffleVT = MaskVT;
	if (Subtarget.hasAVX2()) {
	if (ShuffleVT == MVT::v4i64)
	ShuffleVT = MVT::v8i32;
	else if (ShuffleVT == MVT::v2i64)
	ShuffleVT = MVT::v4i32;
	} else {
	if (ShuffleVT == MVT::v2i64 \|\| ShuffleVT == MVT::v4i32)
	ShuffleVT = MVT::v8i16;
	else if (ShuffleVT == MVT::v4i64)
	ShuffleVT = MVT::v4f64;
	else if (ShuffleVT == MVT::v8i32)
	ShuffleVT = MVT::v8f32;
	}

	// For integer blend types the element size may have been narrowed above,
	// so rescale the blend mask by the same factor and rebuild ShuffleVT as
	// an integer vector with NumMaskElts * Scale elements.
	if (!ShuffleVT.isFloatingPoint()) {
	int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
	BlendMask =
	scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
	ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
	ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
	}

	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	PermuteImm = (unsigned)BlendMask;
	Shuffle = X86ISD::BLENDI;
	return true;
	}
	}
	}

	// Attempt to combine to INSERTPS.
	if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
	MaskVT.is128BitVector()) {
	// Only tried when at least one element is zeroable.
	if (Zeroable.getBoolValue() &&
	matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	Shuffle = X86ISD::INSERTPS;
	ShuffleVT = MVT::v4f32;
	return true;
	}
	}

	// Attempt to combine to SHUFPD.
	if (AllowFloatDomain && EltSizeInBits == 64 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
	return true;
	}
	}

	// Attempt to combine to SHUFPS.
	if (AllowFloatDomain && EltSizeInBits == 32 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE1()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	SmallVector<int, 4> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
	// Match each half of the repeated mask, to determine if it's just
	// referencing one of the vectors, is zeroable or entirely undef.
	// Returns the operand to use for that half (an empty SDValue if the half
	// mixes both inputs) and writes the two per-half selectors to S0/S1.
	auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
	int M0 = RepeatedMask[Offset];
	int M1 = RepeatedMask[Offset + 1];

	if (isUndefInRange(RepeatedMask, Offset, 2)) {
	// Fully undef half: S0/S1 keep the values they were passed in with.
	return DAG.getUNDEF(MaskVT);
	} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : 0);
	S1 = (SM_SentinelUndef == M1 ? -1 : 1);
	return getZeroVector(MaskVT, Subtarget, DAG, DL);
	} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V1;
	} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V2;
	}

	return SDValue();
	};

	// ShufMask[0..1] select within Lo, ShufMask[2..3] select within Hi.
	int ShufMask[4] = {-1, -1, -1, -1};
	SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
	SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

	if (Lo && Hi) {
	V1 = Lo;
	V2 = Hi;
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
	PermuteImm = getV4X86ShuffleImm(ShufMask);
	return true;
	}
	}
	}

	return false;
	}

	/// \brief Combine an arbitrary chain of shuffles into a single instruction if
	/// possible.
	///
	/// This is the leaf of the recursive combine below. When we have found some
	/// chain of single-use x86 shuffle instructions and accumulated the combined
	/// shuffle mask represented by them, this will try to pattern match that mask
	/// into either a single instruction if there is a special purpose instruction
	/// for this operation, or into a PSHUFB instruction which is a fully general
	/// instruction but should only be used to replace chains over a certain depth.
	///
	/// \p Inputs holds the one or two values entering the chain (asserted below),
	/// \p Root is the root shuffle whose type the result is bitcast to, and
	/// \p BaseMask is the accumulated (non-empty) shuffle mask. \p Depth and
	/// \p HasVariableMask control when domain crossing and variable-mask shuffles
	/// are permitted.
	///
	/// Returns the replacement value bitcast to the root type, or an empty SDValue
	/// when there is nothing to do or no combine was found. Every node created on
	/// the way is added to the \p DCI worklist.
	static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
	ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
	assert((Inputs.size() == 1 \|\| Inputs.size() == 2) &&
	"Unexpected number of shuffle inputs!");

	// Find the inputs that enter the chain. Note that multiple uses are OK
	// here, we're not going to remove the operands we find.
	bool UnaryShuffle = (Inputs.size() == 1);
	SDValue V1 = peekThroughBitcasts(Inputs[0]);
	SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
	: peekThroughBitcasts(Inputs[1]));

	MVT VT1 = V1.getSimpleValueType();
	MVT VT2 = V2.getSimpleValueType();
	MVT RootVT = Root.getSimpleValueType();
	assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
	VT2.getSizeInBits() == RootVT.getSizeInBits() &&
	"Vector size mismatch");

	SDLoc DL(Root);
	SDValue Res;

	// A single-element mask can only be the identity (asserted), so the result
	// is just the first input bitcast to the root type.
	unsigned NumBaseMaskElts = BaseMask.size();
	if (NumBaseMaskElts == 1) {
	assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
	return DAG.getBitcast(RootVT, V1);
	}

	unsigned RootSizeInBits = RootVT.getSizeInBits();
	unsigned NumRootElts = RootVT.getVectorNumElements();
	unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
	// Treat the shuffle as floating point if either input is, or if the root is
	// 256 bits wide and AVX2 is not available.
	bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
	(RootVT.is256BitVector() && !Subtarget.hasAVX2());

	// Don't combine if we are an AVX512/EVEX target and the mask element size
	// is different from the root element size - this would prevent writemasks
	// from being reused.
	// TODO - this currently prevents all lane shuffles from occurring.
	// TODO - check for writemasks usage instead of always preventing combining.
	// TODO - attempt to narrow Mask back to writemask size.
	bool IsEVEXShuffle =
	RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128);

	// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

	// Handle 128-bit lane shuffles of 256-bit vectors.
	// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
	// we need to use the zeroing feature.
	// TODO - this should support binary shuffles.
	if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
	!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
	!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
	return SDValue(); // Nothing to do!
	MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
	// One nibble of the immediate per 128-bit half: negative (zero/undef) mask
	// elements are encoded as 0x8, otherwise the low bit of the mask element
	// is used as the lane selector.
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
	PermMask \|= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
	DAG.getUNDEF(ShuffleVT),
	DAG.getConstant(PermMask, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// For masks that have been widened to 128-bit elements or more,
	// narrow back down to 64-bit elements.
	SmallVector<int, 64> Mask;
	if (BaseMaskEltSizeInBits > 64) {
	assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
	int MaskScale = BaseMaskEltSizeInBits / 64;
	scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
	} else {
	Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
	}

	unsigned NumMaskElts = Mask.size();
	unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

	// Determine the effective mask value type.
	// Elements narrower than 32 bits are always handled as integers.
	FloatDomain &= (32 <= MaskEltSizeInBits);
	MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
	: MVT::getIntegerVT(MaskEltSizeInBits);
	MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

	// Only allow legal mask types.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	return SDValue();

	// Attempt to match the mask against known shuffle patterns.
	MVT ShuffleSrcVT, ShuffleVT;
	unsigned Shuffle, PermuteImm;

	// Which shuffle domains are permitted?
	// Permit domain crossing at higher combine depths.
	bool AllowFloatDomain = FloatDomain \|\| (Depth > 3);
	bool AllowIntDomain = (!FloatDomain \|\| (Depth > 3)) && Subtarget.hasSSE2() &&
	(!MaskVT.is256BitVector() \|\| Subtarget.hasAVX2());

	// Determine zeroable mask elements.
	APInt Zeroable(NumMaskElts, 0);
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (isUndefOrZero(Mask[i]))
	Zeroable.setBit(i);

	if (UnaryShuffle) {
	// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
	// directly if we don't shuffle the lower element and we shuffle the upper
	// (zero) elements within themselves.
	if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
	(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
	unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
	ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
	if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
	isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
	return DAG.getBitcast(RootVT, V1);
	}
	}

	+ SDValue NewV1 = V1; // Save operand in case early exit happens.
	if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	- V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
	- ShuffleVT) &&
	+ NewV1, DL, DAG, Subtarget, Shuffle,
	+ ShuffleSrcVT, ShuffleVT) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	- Res = DAG.getBitcast(ShuffleSrcVT, V1);
	+ Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, Subtarget, Shuffle,
	ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	}

	+ SDValue NewV1 = V1; // Save operands in case early exit happens.
	+ SDValue NewV2 = V2;
	if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	- V1, V2, DL, DAG, Subtarget, Shuffle,
	+ NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
	ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	- V1 = DAG.getBitcast(ShuffleSrcVT, V1);
	- DCI.AddToWorklist(V1.getNode());
	- V2 = DAG.getBitcast(ShuffleSrcVT, V2);
	- DCI.AddToWorklist(V2.getNode());
	- Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
	+ NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
	+ DCI.AddToWorklist(NewV1.getNode());
	+ NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
	+ DCI.AddToWorklist(NewV2.getNode());
	+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	- AllowIntDomain, V1, V2, DL, DAG,
	- Subtarget, Shuffle, ShuffleVT,
	- PermuteImm) &&
	+ NewV1 = V1; // Save operands in case early exit happens.
	+ NewV2 = V2;
	+ if (matchBinaryPermuteVectorShuffle(
	+ MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
	+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	- V1 = DAG.getBitcast(ShuffleVT, V1);
	- DCI.AddToWorklist(V1.getNode());
	- V2 = DAG.getBitcast(ShuffleVT, V2);
	- DCI.AddToWorklist(V2.getNode());
	- Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
	+ NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
	+ DCI.AddToWorklist(NewV1.getNode());
	+ NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
	+ DCI.AddToWorklist(NewV2.getNode());
	+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Typically from here on, we need an integer version of MaskVT.
	MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
	IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

	// Annoyingly, SSE4A instructions don't map into the above match helpers.
	if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
	uint64_t BitLen, BitIdx;
	if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
	Zeroable)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(IntMaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Don't try to re-form single instruction chains under any circumstances now
	// that we've done encoding canonicalization for them.
	if (Depth < 2)
	return SDValue();

	// Depth threshold above which we can efficiently use variable mask shuffles.
	int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
	bool AllowVariableMask = (Depth >= VariableShuffleDepth) \|\| HasVariableMask;

	bool MaskContainsZeros =
	any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	// Lane-crossing masks are only handled by the VPERMV/VPERMV3 cases inside
	// this block; if none of them applies we give up.
	if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
	// If we have a single input lane-crossing shuffle then lower to VPERMV.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX2() &&
	(MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
	// vector as the second source.
	if (UnaryShuffle && AllowVariableMask &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	// Adjust shuffle mask - replace SM_SentinelZero with second source index.
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (Mask[i] == SM_SentinelZero)
	Mask[i] = NumMaskElts + i;

	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
	DCI.AddToWorklist(Zero.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
	if (AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	return SDValue();
	}

	// See if we can combine a single input shuffle with zeros to a bit-mask,
	// which is much simpler than any shuffle.
	if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
	isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
	DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
	APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
	APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
	APInt UndefElts(NumMaskElts, 0);
	// Every element starts as zero; kept elements become all-ones and undef
	// elements are recorded separately in UndefElts.
	SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	}
	if (M == SM_SentinelZero)
	continue;
	EltBits[i] = AllOnes;
	}
	SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
	DCI.AddToWorklist(BitMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	unsigned AndOpcode =
	FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
	Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a single input shuffle with different shuffle patterns in
	// the 128-bit lanes use the variable mask to VPERMILPS.
	// TODO Combine other mask types at higher depths.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) \|\|
	(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
	SmallVector<SDValue, 16> VPermIdx;
	// Indices are reduced modulo 4; negative (undef) entries stay undef.
	for (int M : Mask) {
	SDValue Idx =
	M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
	VPermIdx.push_back(Idx);
	}
	SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
	// to VPERMIL2PD/VPERMIL2PS.
	if (AllowVariableMask && Subtarget.hasXOP() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4f32 \|\|
	MaskVT == MVT::v8f32)) {
	// VPERMIL2 Operation.
	// Bits[3] - Match Bit.
	// Bits[2:1] - (Per Lane) PD Shuffle Mask.
	// Bits[2:0] - (Per Lane) PS Shuffle Mask.
	unsigned NumLanes = MaskVT.getSizeInBits() / 128;
	unsigned NumEltsPerLane = NumMaskElts / NumLanes;
	SmallVector<int, 8> VPerm2Idx;
	unsigned M2ZImm = 0;
	for (int M : Mask) {
	if (M == SM_SentinelUndef) {
	VPerm2Idx.push_back(-1);
	continue;
	}
	if (M == SM_SentinelZero) {
	// Zero elements set the match bit (8) in the selector and switch the
	// immediate operand to 2.
	M2ZImm = 2;
	VPerm2Idx.push_back(8);
	continue;
	}
	// In-lane element index, offset by NumEltsPerLane when M refers to the
	// second input; 64-bit selectors are shifted into Bits[2:1].
	int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
	Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
	VPerm2Idx.push_back(Index);
	}
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPerm2MaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
	DAG.getConstant(M2ZImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have 3 or more shuffle instructions or a chain involving a variable
	// mask, we can replace them with a single PSHUFB instruction profitably.
	// Intel's manuals suggest only using PSHUFB if doing so replaces 5
	// instructions, but in practice PSHUFB tends to be very fast so we're
	// more aggressive.
	if (UnaryShuffle && AllowVariableMask &&
	((RootVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(RootVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
	SmallVector<SDValue, 16> PSHUFBMask;
	int NumBytes = RootVT.getSizeInBits() / 8;
	// Number of bytes covered by each mask element.
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
	continue;
	}
	// Expand the element index to the index of the matching source byte.
	M = Ratio * M + i % Ratio;
	assert((M / 16) == (i / 16) && "Lane crossing detected");
	PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
	Res = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
	DCI.AddToWorklist(PSHUFBMaskOp.getNode());
	Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, if we have a 128-bit binary input shuffle we can always combine
	// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
	// slower than PSHUFB on targets that support both.
	if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
	// VPPERM Mask Operation
	// Bits[4:0] - Byte Index (0 - 31)
	// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
	SmallVector<SDValue, 16> VPPERMMask;
	int NumBytes = 16;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	// 128 == 4 << 5, i.e. the ZERO permute operation in Bits[7:5].
	VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::v16i8;
	V1 = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(ByteVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
	DCI.AddToWorklist(VPPERMMaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Failed to find any combines.
	return SDValue();
	}

	// Attempt to constant fold all of the constant source ops.
	//
	// Every op in Ops must yield constant bits at the mask element width,
	// otherwise nothing is folded. Mask indexes into the concatenation of the
	// ops: M / NumMaskElts picks the op and M % NumMaskElts the element within it.
	//
	// Returns the folded constant bitcast to Root's type if the entire shuffle
	// could be folded, or an empty SDValue otherwise.
	// TODO: Extend this to merge multiple constant Ops and update the mask.
	static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
	ArrayRef<int> Mask, SDValue Root,
	bool HasVariableMask,
	SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	MVT VT = Root.getSimpleValueType();

	unsigned SizeInBits = VT.getSizeInBits();
	unsigned NumMaskElts = Mask.size();
	unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
	unsigned NumOps = Ops.size();

	// Extract constant bits from each source op.
	// Bail out as soon as one op is not a constant; also note whether any op has
	// a single use.
	bool OneUseConstantOp = false;
	SmallVector<APInt, 16> UndefEltsOps(NumOps);
	SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
	for (unsigned i = 0; i != NumOps; ++i) {
	SDValue SrcOp = Ops[i];
	OneUseConstantOp \|= SrcOp.hasOneUse();
	if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
	RawBitsOps[i]))
	return SDValue();
	}

	// Only fold if at least one of the constants is only used once or
	// the combined shuffle has included a variable mask shuffle, this
	// is to avoid constant pool bloat.
	if (!OneUseConstantOp && !HasVariableMask)
	return SDValue();

	// Shuffle the constant bits according to the mask.
	// Each result element ends up in exactly one of UndefElts, ZeroElts or
	// ConstantElts (checked by the assert after the loop).
	APInt UndefElts(NumMaskElts, 0);
	APInt ZeroElts(NumMaskElts, 0);
	APInt ConstantElts(NumMaskElts, 0);
	SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
	APInt::getNullValue(MaskSizeInBits));
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	} else if (M == SM_SentinelZero) {
	ZeroElts.setBit(i);
	continue;
	}
	assert(0 <= M && M < (int)(NumMaskElts * NumOps));

	unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
	unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

	// An undef source element makes the result element undef.
	auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
	if (SrcUndefElts[SrcMaskIdx]) {
	UndefElts.setBit(i);
	continue;
	}

	// A source element whose bits are all zero is tracked as a zero element;
	// ConstantBitData[i] already holds zero for it.
	auto &SrcEltBits = RawBitsOps[SrcOpIdx];
	APInt &Bits = SrcEltBits[SrcMaskIdx];
	if (!Bits) {
	ZeroElts.setBit(i);
	continue;
	}

	ConstantElts.setBit(i);
	ConstantBitData[i] = Bits;
	}
	assert((UndefElts \| ZeroElts \| ConstantElts).isAllOnesValue());

	// Create the constant data.
	// Use a floating point element type only for a floating point root with
	// 32-bit or 64-bit mask elements; otherwise use integers.
	MVT MaskSVT;
	if (VT.isFloatingPoint() && (MaskSizeInBits == 32 \|\| MaskSizeInBits == 64))
	MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
	else
	MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

	MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

	SDLoc DL(Root);
	SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
	DCI.AddToWorklist(CstOp.getNode());
	return DAG.getBitcast(VT, CstOp);
	}

	/// \brief Fully generic combining of x86 shuffle instructions.
	///
	/// This should be the last combine run over the x86 shuffle instructions. Once
	/// they have been fully optimized, this will recursively consider all chains
	/// of single-use shuffle instructions, build a generic model of the cumulative
	/// shuffle operation, and check for simpler instructions which implement this
	/// operation. We use this primarily for two purposes:
	///
	/// 1) Collapse generic shuffles to specialized single instructions when
	/// equivalent. In most cases, this is just an encoding size win, but
	/// sometimes we will collapse multiple generic shuffles into a single
	/// special-purpose shuffle.
	/// 2) Look for sequences of shuffle instructions with 3 or more total
	/// instructions, and replace them with the slightly more expensive SSSE3
	/// PSHUFB instruction if available. We do this as the last combining step
	/// to ensure we avoid using PSHUFB if we can implement the shuffle with
	/// a suitable short sequence of other instructions. The PSHUFB will either
	/// use a register or have to read from memory and so is slightly (but only
	/// slightly) more expensive than the other shuffle instructions.
	///
	/// Because this is inherently a quadratic operation (for each shuffle in
	/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
	/// This should never be an issue in practice as the shuffle lowering doesn't
	/// produce sequences of more than 8 instructions.
	///
	/// FIXME: We will currently miss some cases where the redundant shuffling
	/// would simplify under the threshold for PSHUFB formation because of
	/// combine-ordering. To fix this, we should do the redundant instruction
	/// combining in this recursive walk.
	static SDValue combineX86ShufflesRecursively(
	ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
	ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
	// Bound the depth of our recursive combine because this is ultimately
	// quadratic in nature.
	if (Depth > 8)
	return SDValue();

	// Directly rip through bitcasts to find the underlying operand.
	SDValue Op = SrcOps[SrcOpIndex];
	Op = peekThroughOneUseBitcasts(Op);

	MVT VT = Op.getSimpleValueType();
	if (!VT.isVector())
	return SDValue(); // Bail if we hit a non-vector.

	assert(Root.getSimpleValueType().isVector() &&
	"Shuffles operate on vector types!");
	assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
	"Can only combine shuffles of the same vector register size.");

	// Extract target shuffle mask and resolve sentinels and inputs.
	SmallVector<int, 64> OpMask;
	SmallVector<SDValue, 2> OpInputs;
	if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
	return SDValue();

	assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
	SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
	SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

	// Add the inputs to the Ops list, avoiding duplicates.
	SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

	int InputIdx0 = -1, InputIdx1 = -1;
	for (int i = 0, e = Ops.size(); i < e; ++i) {
	SDValue BC = peekThroughBitcasts(Ops[i]);
	if (Input0 && BC == peekThroughBitcasts(Input0))
	InputIdx0 = i;
	if (Input1 && BC == peekThroughBitcasts(Input1))
	InputIdx1 = i;
	}

	if (Input0 && InputIdx0 < 0) {
	InputIdx0 = SrcOpIndex;
	Ops[SrcOpIndex] = Input0;
	}
	if (Input1 && InputIdx1 < 0) {
	InputIdx1 = Ops.size();
	Ops.push_back(Input1);
	}

	assert(((RootMask.size() > OpMask.size() &&
	RootMask.size() % OpMask.size() == 0) \|\|
	(OpMask.size() > RootMask.size() &&
	OpMask.size() % RootMask.size() == 0) \|\|
	OpMask.size() == RootMask.size()) &&
	"The smaller number of elements must divide the larger.");

	// This function can be performance-critical, so we rely on the power-of-2
	// knowledge that we have about the mask sizes to replace div/rem ops with
	// bit-masks and shifts.
	assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
	unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

	unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
	unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
	unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
	assert((RootRatio == 1 \|\| OpRatio == 1) &&
	"Must not have a ratio for both incoming and op masks!");

	assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
	unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

	SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

	// Merge this shuffle operation's mask into our accumulated mask. Note that
	// this shuffle's mask will be the first applied to the input, followed by the
	// root mask to get us all the way to the root value arrangement. The reason
	// for this order is that we are recursing up the operation chain.
	for (unsigned i = 0; i < MaskWidth; ++i) {
	unsigned RootIdx = i >> RootRatioLog2;
	if (RootMask[RootIdx] < 0) {
	// This is a zero or undef lane, we're done.
	Mask[i] = RootMask[RootIdx];
	continue;
	}

	unsigned RootMaskedIdx =
	RootRatio == 1
	? RootMask[RootIdx]
	: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

	// Just insert the scaled root mask value if it references an input other
	// than the SrcOp we're currently inserting.
	if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) \|\|
	(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
	Mask[i] = RootMaskedIdx;
	continue;
	}

	RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
	unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
	if (OpMask[OpIdx] < 0) {
	// The incoming lanes are zero or undef, it doesn't matter which ones we
	// are using.
	Mask[i] = OpMask[OpIdx];
	continue;
	}

	// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
	unsigned OpMaskedIdx =
	OpRatio == 1
	? OpMask[OpIdx]
	: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

	OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
	if (OpMask[OpIdx] < (int)OpMask.size()) {
	assert(0 <= InputIdx0 && "Unknown target shuffle input");
	OpMaskedIdx += InputIdx0 * MaskWidth;
	} else {
	assert(0 <= InputIdx1 && "Unknown target shuffle input");
	OpMaskedIdx += InputIdx1 * MaskWidth;
	}

	Mask[i] = OpMaskedIdx;
	}

	// Handle the all undef/zero cases early.
	if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
	return DAG.getUNDEF(Root.getValueType());

	// TODO - should we handle the mixed zero/undef case as well? Just returning
	// a zero mask will lose information on undef elements possibly reducing
	// future combine possibilities.
	if (all_of(Mask, [](int Idx) { return Idx < 0; }))
	return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
	SDLoc(Root));

	// Remove unused shuffle source ops.
	resolveTargetShuffleInputsAndMask(Ops, Mask);
	assert(!Ops.empty() && "Shuffle with no inputs detected");

	HasVariableMask \|= isTargetShuffleVariableMask(Op.getOpcode());

	// Update the list of shuffle nodes that have been combined so far.
	SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
	SrcNodes.end());
	CombinedNodes.push_back(Op.getNode());

	// See if we can recurse into each shuffle source op (if it's a target
	// shuffle). The source op should only be combined if it either has a
	// single use (i.e. current Op) or all its users have already been combined.
	for (int i = 0, e = Ops.size(); i < e; ++i)
	if (Ops[i].getNode()->hasOneUse() \|\|
	SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
	if (SDValue Res = combineX86ShufflesRecursively(
	Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
	DAG, DCI, Subtarget))
	return Res;

	// Attempt to constant fold all of the constant source ops.
	if (SDValue Cst = combineX86ShufflesConstants(
	Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
	return Cst;

	// We can only combine unary and binary shuffle mask cases.
	if (Ops.size() > 2)
	return SDValue();

	// Minor canonicalization of the accumulated shuffle mask to make it easier
	// to match below. All this does is detect masks with sequential pairs of
	// elements, and shrink them to the half-width mask. It does this in a loop
	// so it will reduce the size of the mask to the minimal width mask which
	// performs an equivalent shuffle.
	SmallVector<int, 64> WidenedMask;
	while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
	Mask = std::move(WidenedMask);
	}

	// Canonicalization of binary shuffle masks to improve pattern matching by
	// commuting the inputs.
	if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
	ShuffleVectorSDNode::commuteMask(Mask);
	std::swap(Ops[0], Ops[1]);
	}

	// Finally, try to combine into a single shuffle instruction.
	return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
	DCI, Subtarget);
	}

	/// \brief Decode a PSHUFD / PSHUFLW / PSHUFHW node into a 4-element mask.
	///
	/// Thin helper on top of getTargetShuffleMask that produces a v4 PSHUF-style
	/// mask which can be reused with the PSHUF family of instructions. Vectors
	/// wider than 128 bits are reduced to the mask of their lowest 128-bit lane.
	/// For PSHUFLW the first four entries are returned; for PSHUFHW the entries
	/// after the first four are returned, rebased so that they start at zero.
	static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
	  const MVT ShufVT = N.getSimpleValueType();
	  SmallVector<int, 4> Mask;
	  SmallVector<SDValue, 2> ShufOps;
	  bool IsUnary;
	  const bool Decoded = getTargetShuffleMask(N.getNode(), ShufVT, false,
	                                            ShufOps, Mask, IsUnary);
	  (void)Decoded;
	  assert(Decoded);

	  // Only the low 128 bits of the mask matter for wider vectors. In debug
	  // builds, verify that every upper lane repeats the low lane before the
	  // upper entries are dropped.
	  const unsigned TotalBits = ShufVT.getSizeInBits();
	  if (TotalBits > 128) {
	    const int EltsPerLane = 128 / ShufVT.getScalarSizeInBits();
	#ifndef NDEBUG
	    const int LaneCount = TotalBits / 128;
	    for (int Lane = 1; Lane < LaneCount; ++Lane) {
	      const int Base = Lane * EltsPerLane;
	      for (int Elt = 0; Elt < EltsPerLane; ++Elt)
	        assert(Mask[Elt] == Mask[Base + Elt] - Base &&
	               "Mask doesn't repeat in high 128-bit lanes!");
	    }
	#endif
	    Mask.resize(EltsPerLane);
	  }

	  const unsigned Opc = N.getOpcode();
	  if (Opc == X86ISD::PSHUFD)
	    return Mask;

	  if (Opc == X86ISD::PSHUFLW) {
	    // Keep only the entries describing the low four words.
	    Mask.resize(4);
	    return Mask;
	  }

	  if (Opc == X86ISD::PSHUFHW) {
	    // Drop the low four words and rebase the high four to start at zero.
	    Mask.erase(Mask.begin(), Mask.begin() + 4);
	    for (unsigned Idx = 0, End = Mask.size(); Idx != End; ++Idx)
	      Mask[Idx] -= 4;
	    return Mask;
	  }

	  llvm_unreachable("No valid shuffle instruction found!");
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshufd.
	///
	/// We walk up the chain and look for a combinable shuffle, skipping over
	/// shuffles that we could hoist this shuffle's transformation past without
	/// altering anything.
	///
	/// \param N    The PSHUFD node at the bottom of the chain (asserted below).
	/// \param Mask The dword mask of \p N; elements 0..3 are read, and the mask is
	///             rewritten in place once a combinable shuffle has been found.
	/// \param DAG  Used to create the merged shuffle and the rebuilt chain.
	/// \returns The value that replaces \p N, or an empty SDValue when nothing was
	///          combined.
	static SDValue
	combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
	SelectionDAG &DAG) {
	// NOTE(review): the condition checks for PSHUFD, although the message text
	// talks about a half shuffle.
	assert(N.getOpcode() == X86ISD::PSHUFD &&
	"Called with something other than an x86 128-bit half shuffle!");
	SDLoc DL(N);

	// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
	// of the shuffles in the chain so that we can form a fresh chain to replace
	// this one.
	SmallVector<SDValue, 8> Chain;
	SDValue V = N.getOperand(0);
	// Inside the switch, `continue` advances to the next operand up the chain,
	// while `break` leaves the switch and then the loop with V pointing at the
	// shuffle to merge with.
	for (; V.hasOneUse(); V = V.getOperand(0)) {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing combined!

	case ISD::BITCAST:
	// Skip bitcasts as we always know the type for the target specific
	// instructions.
	continue;

	case X86ISD::PSHUFD:
	// Found another dword shuffle.
	break;

	case X86ISD::PSHUFLW:
	// Check that the low words (being shuffled) are the identity in the
	// dword shuffle, and the high words are self-contained.
	if (Mask[0] != 0 \|\| Mask[1] != 1 \|\|
	!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
	return SDValue();

	// Remember the node so it can be re-created on top of the merged shuffle.
	Chain.push_back(V);
	continue;

	case X86ISD::PSHUFHW:
	// Check that the high words (being shuffled) are the identity in the
	// dword shuffle, and the low words are self-contained.
	if (Mask[2] != 2 \|\| Mask[3] != 3 \|\|
	!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
	return SDValue();

	// Remember the node so it can be re-created on top of the merged shuffle.
	Chain.push_back(V);
	continue;

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
	// shuffle into a preceding word shuffle.
	if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
	V.getSimpleValueType().getVectorElementType() != MVT::i16)
	return SDValue();

	// Search for a half-shuffle which we can combine with.
	// UNPCKL is paired with PSHUFLW and UNPCKH with PSHUFHW.
	unsigned CombineOp =
	V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	// Both unpack operands must be the same value, and the unpack must be that
	// value's only user.
	if (V.getOperand(0) != V.getOperand(1) \|\|
	!V->isOnlyUserOf(V.getOperand(0).getNode()))
	return SDValue();
	Chain.push_back(V);
	V = V.getOperand(0);
	// Inner walk: look through bitcasts and half shuffles of the other half
	// until a half shuffle with opcode CombineOp is reached.
	do {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing to combine.

	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	if (V.getOpcode() == CombineOp)
	break;

	// A half shuffle of the other half is kept on the chain and skipped.
	Chain.push_back(V);

	LLVM_FALLTHROUGH;
	case ISD::BITCAST:
	V = V.getOperand(0);
	continue;
	}
	break;
	} while (V.hasOneUse());
	break;
	}
	// Break out of the loop if we break out of the switch.
	break;
	}

	if (!V.hasOneUse())
	// We fell out of the loop without finding a viable combining instruction.
	return SDValue();

	// Merge this node's mask and our incoming mask.
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	for (int &M : Mask)
	M = VMask[M];
	// Re-create V with the composed mask, keeping V's opcode, type and input.
	V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Rebuild the chain around this new shuffle.
	// Nodes are popped in reverse order of discovery, so the node that was
	// closest to the merged shuffle is re-created first.
	while (!Chain.empty()) {
	SDValue W = Chain.pop_back_val();

	// Bitcasts were skipped during the walk; re-insert one whenever the types
	// disagree.
	if (V.getValueType() != W.getOperand(0).getValueType())
	V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

	switch (W.getOpcode()) {
	default:
	llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// Unpacks on the chain always had two identical operands.
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
	break;

	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	// Reuse the original immediate operand of the skipped shuffle.
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
	break;
	}
	}
	if (V.getValueType() != N.getValueType())
	V = DAG.getBitcast(N.getValueType(), V);

	// Return the new chain to replace N.
	return V;
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
	/// pshufhw.
	///
	/// We walk up the single-use chain above \p N, looking through bitcasts and
	/// through half shuffles of the other half, until we reach a half shuffle with
	/// the same opcode as \p N. When one is found, \p N is replaced by its own
	/// input and the found shuffle is replaced by a shuffle whose mask is the
	/// composition of both masks.
	///
	/// \param N    The PSHUFLW or PSHUFHW node to combine away (asserted).
	/// \param Mask The 4-element mask of \p N; rewritten in place with the
	///             composed mask when a combine happens.
	/// \param DAG  Used to create the merged shuffle node.
	/// \param DCI  Combiner state; the replacements are performed through
	///             DCI.CombineTo.
	/// \returns true if \p N was combined away, false if nothing was changed.
	static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
	                                        SelectionDAG &DAG,
	                                        TargetLowering::DAGCombinerInfo &DCI) {
	  assert(
	      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
	      "Called with something other than an x86 128-bit half shuffle!");
	  SDLoc DL(N);
	  unsigned CombineOpcode = N.getOpcode();

	  // Walk up a single-use chain looking for a combinable shuffle.
	  SDValue V = N.getOperand(0);
	  for (; V.hasOneUse(); V = V.getOperand(0)) {
	    switch (V.getOpcode()) {
	    default:
	      return false; // Nothing combined!

	    case ISD::BITCAST:
	      // Skip bitcasts as we always know the type for the target specific
	      // instructions.
	      continue;

	    case X86ISD::PSHUFLW:
	    case X86ISD::PSHUFHW:
	      if (V.getOpcode() == CombineOpcode)
	        break;

	      // Other-half shuffles are no-ops.
	      continue;
	    }
	    // Break out of the loop if we break out of the switch.
	    break;
	  }

	  if (!V.hasOneUse())
	    // We fell out of the loop without finding a viable combining instruction.
	    return false;

	  // Combine away the bottom node as its shuffle will be accumulated into
	  // a preceding shuffle.
	  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

	  // Record the old value.
	  SDValue Old = V;

	  // Merge this node's mask and our incoming mask.
	  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	  for (int &M : Mask)
	    M = VMask[M];

	  // Build the merged shuffle with the type of the shuffle it replaces rather
	  // than a hard-coded v8i16: a half shuffle wider than 128 bits must keep its
	  // own type, otherwise the replacement below would substitute a value of a
	  // different type. For 128-bit half shuffles this is still v8i16.
	  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	  // Check that the shuffles didn't cancel each other out. If not, we need to
	  // combine to the new one.
	  if (Old != V)
	    // Replace the combinable shuffle with the combined one, updating all users
	    // so that we re-evaluate the chain here.
	    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

	  return true;
	}

	/// \brief Try to combine x86 target specific shuffles.
	///
	/// The function works in three stages:
	///  1. A MOVSD/UNPCKH/UNPCKL of 64-bit elements whose two inputs are the same
	///     kind of horizontal add/sub or pack node is folded into a single node of
	///     that kind.
	///  2. A switch on the opcode handles UNPCKL, BLENDI, MOVSD/MOVSS and INSERTPS
	///     completely (each of those cases returns). For PSHUFD/PSHUFLW/PSHUFHW it
	///     only extracts the 4-element mask and falls through to stage 3.
	///  3. PSHUF nodes are removed when they are no-ops, merged with other PSHUF
	///     nodes up the chain, or turned into a PSHUFD or a single unpack.
	///
	/// \param N         The target shuffle node to simplify.
	/// \param DAG       Used to create replacement nodes.
	/// \param DCI       Combiner state (worklist updates and CombineTo).
	/// \param Subtarget Not referenced in the body of this function.
	/// \returns A replacement value for \p N, or an empty SDValue. An empty value
	///          is also returned after combineRedundantHalfShuffle reports that it
	///          already combined \p N away.
	static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();
	SmallVector<int, 4> Mask;
	unsigned Opcode = N.getOpcode();

	// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
	// single instruction.
	if (VT.getScalarSizeInBits() == 64 &&
	(Opcode == X86ISD::MOVSD \|\| Opcode == X86ISD::UNPCKH \|\|
	Opcode == X86ISD::UNPCKL)) {
	auto BC0 = peekThroughBitcasts(N.getOperand(0));
	auto BC1 = peekThroughBitcasts(N.getOperand(1));
	EVT VT0 = BC0.getValueType();
	EVT VT1 = BC1.getValueType();
	unsigned Opcode0 = BC0.getOpcode();
	unsigned Opcode1 = BC1.getOpcode();
	if (Opcode0 == Opcode1 && VT0 == VT1 &&
	(Opcode0 == X86ISD::FHADD \|\| Opcode0 == X86ISD::HADD \|\|
	Opcode0 == X86ISD::FHSUB \|\| Opcode0 == X86ISD::HSUB \|\|
	Opcode0 == X86ISD::PACKSS \|\| Opcode0 == X86ISD::PACKUS)) {
	// Pick the operand of each horizontal/pack node that feeds the low and the
	// high half of the shuffle result.
	SDValue Lo, Hi;
	if (Opcode == X86ISD::MOVSD) {
	Lo = BC1.getOperand(0);
	Hi = BC0.getOperand(1);
	} else {
	Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	}
	SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
	DCI.AddToWorklist(Horiz.getNode());
	return DAG.getBitcast(VT, Horiz);
	}
	}

	switch (Opcode) {
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	// Only the mask is extracted here; the PSHUF-specific combines follow the
	// switch.
	Mask = getPSHUFShuffleMask(N);
	assert(Mask.size() == 4);
	break;
	case X86ISD::UNPCKL: {
	// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
	// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
	// moves upper half elements into the lower half part. For example:
	//
	// t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
	// undef:v16i8
	// t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
	//
	// will be combined to:
	//
	// t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

	// This is only for 128-bit vectors. From SSE4.1 onward this combine may not
	// happen due to advanced instructions.
	if (!VT.is128BitVector())
	return SDValue();

	auto Op0 = N.getOperand(0);
	auto Op1 = N.getOperand(1);
	if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
	// Note: this local Mask shadows the function-level Mask.
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

	// Expected mask: the first half selects elements NumElts/2 .. NumElts-1 in
	// order, the second half is undef (-1).
	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 8> ExpectedMask(NumElts, -1);
	std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
	NumElts / 2);

	auto ShufOp = Op1.getOperand(0);
	if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
	return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
	}
	return SDValue();
	}
	case X86ISD::BLENDI: {
	SDValue V0 = N->getOperand(0);
	SDValue V1 = N->getOperand(1);
	assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
	"Unexpected input vector types");

	// Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
	// operands and changing the mask to 1. This saves us a bunch of
	// pattern-matching possibilities related to scalar math ops in SSE/AVX.
	// x86InstrInfo knows how to commute this back after instruction selection
	// if it would help register allocation.

	// TODO: If optimizing for size or a processor that doesn't suffer from
	// partial register update stalls, this should be transformed into a MOVSD
	// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

	// Note: this local Mask (the constant immediate operand) shadows the
	// function-level Mask.
	if (VT == MVT::v2f64)
	if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
	if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
	SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
	return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
	}

	return SDValue();
	}
	case X86ISD::MOVSD:
	case X86ISD::MOVSS: {
	SDValue V0 = peekThroughBitcasts(N->getOperand(0));
	SDValue V1 = peekThroughBitcasts(N->getOperand(1));
	bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
	bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
	// Nothing to do when both inputs are all-zero build vectors.
	if (isZero0 && isZero1)
	return SDValue();

	// We often lower to MOVSD/MOVSS from integer as well as native float
	// types; remove unnecessary domain-crossing bitcasts if we can to make it
	// easier to combine shuffles later on. We've already accounted for the
	// domain switching cost when we decided to lower with it.
	bool isFloat = VT.isFloatingPoint();
	bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
	bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
	// Each input must either be in the other domain than VT or be all zeros.
	if ((isFloat != isFloat0 \|\| isZero0) && (isFloat != isFloat1 \|\| isZero1)) {
	// Re-create the node in the opposite domain and bitcast the result back.
	MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
	: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
	V0 = DAG.getBitcast(NewVT, V0);
	V1 = DAG.getBitcast(NewVT, V1);
	return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
	}

	return SDValue();
	}
	case X86ISD::INSERTPS: {
	assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
	SDValue Op0 = N.getOperand(0);
	SDValue Op1 = N.getOperand(1);
	SDValue Op2 = N.getOperand(2);
	// Decode the immediate: bits 7:6 are the source index, bits 5:4 the
	// destination index and bits 3:0 the zero mask.
	unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
	unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
	unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
	unsigned ZeroMask = InsertPSMask & 0xF;

	// If we zero out all elements from Op0 then we don't need to reference it.
	if (((ZeroMask \| (1u << DstIdx)) == 0xF) && !Op0.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// If we zero out the element from Op1 then we don't need to reference it.
	if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// Attempt to merge insertps Op1 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask1;
	SmallVector<SDValue, 2> Ops1;
	if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
	int M = TargetMask1[SrcIdx];
	if (isUndefOrZero(M)) {
	// Zero/UNDEF insertion - zero out element and remove dependency.
	InsertPSMask \|= (1u << DstIdx);
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}
	// Update insertps mask srcidx and reference the source input directly.
	assert(0 <= M && M < 8 && "Shuffle index out of range");
	InsertPSMask = (InsertPSMask & 0x3f) \| ((M & 0x3) << 6);
	// Indices 0..3 select the inner shuffle's first input, 4..7 its second.
	Op1 = Ops1[M < 4 ? 0 : 1];
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}

	// Attempt to merge insertps Op0 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask0;
	SmallVector<SDValue, 2> Ops0;
	if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
	return SDValue();

	// Updated records whether the immediate or Op0 changed; UseInput00 and
	// UseInput01 record which inputs of the inner shuffle are still referenced.
	bool Updated = false;
	bool UseInput00 = false;
	bool UseInput01 = false;
	for (int i = 0; i != 4; ++i) {
	int M = TargetMask0[i];
	if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
	// No change if element is already zero or the inserted element.
	continue;
	} else if (isUndefOrZero(M)) {
	// If the target mask is undef/zero then we must zero the element.
	InsertPSMask \|= (1u << i);
	Updated = true;
	continue;
	}

	// The input vector element must be inline.
	if (M != i && M != (i + 4))
	return SDValue();

	// Determine which inputs of the target shuffle we're using.
	UseInput00 \|= (0 <= M && M < 4);
	UseInput01 \|= (4 <= M);
	}

	// If we're not using both inputs of the target shuffle then use the
	// referenced input directly.
	if (UseInput00 && !UseInput01) {
	Updated = true;
	Op0 = Ops0[0];
	} else if (!UseInput00 && UseInput01) {
	Updated = true;
	Op0 = Ops0[1];
	}

	if (Updated)
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	return SDValue();
	}
	default:
	return SDValue();
	}

	// Only PSHUFD/PSHUFLW/PSHUFHW reach this point; every other case returned
	// from inside the switch above.

	// Nuke no-op shuffles that show up after combining.
	if (isNoopShuffleMask(Mask))
	return DCI.CombineTo(N.getNode(), N.getOperand(0), /AddTo/ true);

	// Look for simplifications involving one or two shuffle instructions.
	SDValue V = N.getOperand(0);
	switch (N.getOpcode()) {
	default:
	break;
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

	if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
	return SDValue(); // We combined away this shuffle, so we're done.

	// See if this reduces to a PSHUFD which is no more expensive and can
	// combine with more operations. Note that it has to at least flip the
	// dwords as otherwise it would have been removed as a no-op.
	if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
	// Swap the two dwords of the affected half: dwords 0/1 for PSHUFLW,
	// dwords 2/3 for PSHUFHW.
	int DMask[] = {0, 1, 2, 3};
	int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
	DMask[DOffset + 0] = DOffset + 1;
	DMask[DOffset + 1] = DOffset + 0;
	MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
	V = DAG.getBitcast(DVT, V);
	DCI.AddToWorklist(V.getNode());
	V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
	getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
	DCI.AddToWorklist(V.getNode());
	return DAG.getBitcast(VT, V);
	}

	// Look for shuffle patterns which can be implemented as a single unpack.
	// FIXME: This doesn't handle the location of the PSHUFD generically, and
	// only works when we have a PSHUFD followed by two half-shuffles.
	if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
	(V.getOpcode() == X86ISD::PSHUFLW \|\|
	V.getOpcode() == X86ISD::PSHUFHW) &&
	V.getOpcode() != N.getOpcode() &&
	V.hasOneUse()) {
	SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
	if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
	// N and V shuffle different halves (checked above), so the two offsets
	// differ and together the writes below fill all eight WordMask entries.
	int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int WordMask[8];
	for (int i = 0; i < 4; ++i) {
	WordMask[i + NOffset] = Mask[i] + NOffset;
	WordMask[i + VOffset] = VMask[i] + VOffset;
	}
	// Map the word mask through the DWord mask.
	int MappedMask[8];
	for (int i = 0; i < 8; ++i)
	MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
	if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
	makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
	// We can replace all three shuffles with an unpack.
	V = DAG.getBitcast(VT, D.getOperand(0));
	DCI.AddToWorklist(V.getNode());
	return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
	: X86ISD::UNPCKH,
	DL, VT, V, V);
	}
	}
	}

	break;

	case X86ISD::PSHUFD:
	if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
	return NewN;

	break;
	}

	return SDValue();
	}

	/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
	/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
	/// are written to the parameters \p Opnd0 and \p Opnd1.
	///
	/// We match directly on the abstract vector shuffle nodes so that the pattern
	/// is easier to match generically: the shuffle must blend an FSUB and an FADD
	/// of the same two operands, taking alternating lanes from each.
	///
	/// \param N           The candidate node; only ISD::VECTOR_SHUFFLE is handled.
	/// \param Subtarget   Queried for SSE3 / AVX / AVX512 to validate the type.
	/// \param Opnd0       [out] Left operand, i.e. operand 0 of the FSUB node.
	/// \param Opnd1       [out] Right operand, i.e. operand 1 of the FSUB node.
	/// \param matchSubAdd When false (default) the FSUB must provide the even
	///                    lanes and the FADD the odd lanes (ADDSUB); when true the
	///                    roles are swapped (SUBADD).
	/// \returns true on a match; \p Opnd0 and \p Opnd1 are only written then.
	static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
	                             SDValue &Opnd0, SDValue &Opnd1,
	                             bool matchSubAdd = false) {

	  EVT VT = N->getValueType(0);
	  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
	      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
	      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
	    return false;

	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
	  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

	  SDValue V1 = N->getOperand(0);
	  SDValue V2 = N->getOperand(1);

	  unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
	  unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;

	  // We require the first shuffle operand to be the ExpectedOpcode node,
	  // and the second to be the NextExpectedOpcode node. If they come in the
	  // opposite order, commute the shuffle so that they do.
	  if (V1.getOpcode() == NextExpectedOpcode &&
	      V2.getOpcode() == ExpectedOpcode) {
	    ShuffleVectorSDNode::commuteMask(Mask);
	    std::swap(V1, V2);
	  } else if (V1.getOpcode() != ExpectedOpcode ||
	             V2.getOpcode() != NextExpectedOpcode)
	    return false;

	  // If there are other uses of these operations we can't fold them.
	  if (!V1->hasOneUse() || !V2->hasOneUse())
	    return false;

	  // Ensure that both operations have the same operands. Only the FADD
	  // operands may be commuted, so the operand order must always be taken from
	  // the FSUB node. After the canonicalization above the FSUB is V1 when
	  // matching ADDSUB and V2 when matching SUBADD; taking the order from V1
	  // unconditionally would report swapped operands for a SUBADD whose FSUB
	  // operands are in the opposite order to those of the FADD.
	  SDValue FSubNode = matchSubAdd ? V2 : V1;
	  SDValue FAddNode = matchSubAdd ? V1 : V2;
	  SDValue LHS = FSubNode->getOperand(0), RHS = FSubNode->getOperand(1);
	  if ((FAddNode->getOperand(0) != LHS || FAddNode->getOperand(1) != RHS) &&
	      (FAddNode->getOperand(0) != RHS || FAddNode->getOperand(1) != LHS))
	    return false;

	  // We're looking for blends between FADD and FSUB nodes. We insist on these
	  // nodes being lined up in a specific expected pattern: even lanes from V1,
	  // odd lanes from V2.
	  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
	        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
	        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
	        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
	                                           8, 25, 10, 27, 12, 29, 14, 31})))
	    return false;

	  Opnd0 = LHS;
	  Opnd1 = RHS;
	  return true;
	}

	/// \brief Try to turn a shuffle into a target-specific add-sub or
	/// mul-add-sub node.
	///
	/// The shuffle must first be recognized by isAddSubOrSubAdd (in its default
	/// ADDSUB mode). If isFMAddSubOrFMSubAdd then accepts the operands, an
	/// X86ISD::FMADDSUB node is produced; otherwise an X86ISD::ADDSUB node is
	/// produced, except for 512-bit vectors.
	///
	/// \returns The new node, or an empty SDValue if no combine applies.
	static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
	                                                const X86Subtarget &Subtarget,
	                                                SelectionDAG &DAG) {
	  SDValue LHS, RHS;
	  const bool IsAddSub = isAddSubOrSubAdd(N, Subtarget, LHS, RHS);
	  if (!IsAddSub)
	    return SDValue();

	  SDLoc DL(N);
	  const EVT ResVT = N->getValueType(0);

	  // Prefer the fused form: try to generate X86ISD::FMADDSUB first.
	  SDValue Addend;
	  const bool IsFused =
	      isFMAddSubOrFMSubAdd(Subtarget, DAG, LHS, RHS, Addend, 2);
	  if (IsFused)
	    return DAG.getNode(X86ISD::FMADDSUB, DL, ResVT, LHS, RHS, Addend);

	  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
	  // the ADDSUB idiom has been successfully recognized. There are no known
	  // X86 targets with 512-bit ADDSUB instructions!
	  return ResVT.is512BitVector()
	             ? SDValue()
	             : DAG.getNode(X86ISD::ADDSUB, DL, ResVT, LHS, RHS);
	}

	/// \brief Try to turn a shuffle into a target-specific mul-sub-add node.
	///
	/// The shuffle must be recognized by isAddSubOrSubAdd in SUBADD mode and the
	/// resulting operands must be accepted by isFMAddSubOrFMSubAdd; only then is
	/// an X86ISD::FMSUBADD node produced. No unfused fallback node is created.
	///
	/// \returns The new node, or an empty SDValue if no combine applies.
	static SDValue combineShuffleToFMSubAdd(SDNode *N,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  SDValue LHS, RHS;
	  const bool IsSubAdd =
	      isAddSubOrSubAdd(N, Subtarget, LHS, RHS, /*matchSubAdd=*/true);
	  if (!IsSubAdd)
	    return SDValue();

	  SDLoc DL(N);
	  const EVT ResVT = N->getValueType(0);

	  // Try to generate X86ISD::FMSUBADD node here.
	  SDValue Addend;
	  const bool IsFused =
	      isFMAddSubOrFMSubAdd(Subtarget, DAG, LHS, RHS, Addend, 2);
	  return IsFused
	             ? DAG.getNode(X86ISD::FMSUBADD, DL, ResVT, LHS, RHS, Addend)
	             : SDValue();
	}

	// Rewrite
	//   (vector_shuffle <mask> (concat_vectors t1, undef), (concat_vectors t2, undef))
	// as
	//   (vector_shuffle <mask'> (concat_vectors t1, t2), undef)
	// Both shuffle sources must be two-operand concatenations whose second operand
	// is undef. AVX2 has VPERMD/Q, so a single-source shuffle is preferable when
	// it can be expressed. Returns an empty SDValue when the pattern does not
	// match.
	static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX2())
	    return SDValue();
	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N);
	  if (!Shuffle)
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
	  const bool IsSupportedWidth = VT.is128BitVector() || VT.is256BitVector();
	  if (!IsSupportedWidth)
	    return SDValue();

	  EVT EltVT = VT.getVectorElementType();
	  const bool IsSupportedElt = EltVT == MVT::i32 || EltVT == MVT::i64 ||
	                              EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (!IsSupportedElt)
	    return SDValue();

	  // Check that both sources are concats with undef.
	  auto IsConcatWithUndef = [](SDValue Src) {
	    return Src.getOpcode() == ISD::CONCAT_VECTORS &&
	           Src.getNumOperands() == 2 && Src.getOperand(1).isUndef();
	  };
	  SDValue Src0 = N->getOperand(0);
	  SDValue Src1 = N->getOperand(1);
	  if (!IsConcatWithUndef(Src0) || !IsConcatWithUndef(Src1))
	    return SDValue();

	  // Construct the new shuffle mask. Elements from the first source retain their
	  // index, but elements from the second source no longer need to skip an undef.
	  const int NumElts = VT.getVectorNumElements();
	  const int HalfElts = NumElts / 2;
	  ArrayRef<int> OldMask = Shuffle->getMask();
	  SmallVector<int, 8> NewMask;
	  for (unsigned Idx = 0, End = OldMask.size(); Idx != End; ++Idx) {
	    int Elt = OldMask[Idx];
	    if (Elt >= NumElts)
	      Elt -= HalfElts;
	    NewMask.push_back(Elt);
	  }

	  SDLoc DL(N);
	  SDValue Merged = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
	                               Src0.getOperand(0), Src1.getOperand(0));
	  return DAG.getVectorShuffle(VT, DL, Merged, DAG.getUNDEF(VT), NewMask);
	}

	/// Eliminate a redundant shuffle of a horizontal math op.
	///
	/// Matches a unary ISD::VECTOR_SHUFFLE whose input is a 128-bit
	/// HADD/FHADD/HSUB/FHSUB with two identical operands, and whose mask copies
	/// the low half of the input into both halves of the result. In that case the
	/// horizontal op itself is returned; otherwise an empty SDValue.
	static SDValue foldShuffleOfHorizOp(SDNode *N) {
	  // Only a generic shuffle with an undef second operand is of interest.
	  const bool IsUnaryShuffle =
	      N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(1).isUndef();
	  if (!IsUnaryShuffle)
	    return SDValue();

	  SDValue HorizOp = N->getOperand(0);
	  switch (HorizOp.getOpcode()) {
	  case X86ISD::HADD:
	  case X86ISD::FHADD:
	  case X86ISD::HSUB:
	  case X86ISD::FHSUB:
	    break;
	  default:
	    return SDValue();
	  }

	  // 128-bit horizontal math instructions are defined to operate on adjacent
	  // lanes of each operand as:
	  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
	  // ...similarly for v2f64 and v8i16.
	  // TODO: 256-bit is not the same because...x86.
	  const bool SameInputs = HorizOp.getOperand(0) == HorizOp.getOperand(1);
	  if (!SameInputs || HorizOp.getValueSizeInBits() != 128)
	    return SDValue();

	  // With identical operands the low half of the horizontal op's result equals
	  // its high half. A shuffle that merely replicates the low half is therefore
	  // redundant:
	  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
	  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
	  // but this should be tied to whatever horizontal op matching and shuffle
	  // canonicalization are producing.
	  ArrayRef<int> ShufMask = cast<ShuffleVectorSDNode>(N)->getMask();
	  const bool ReplicatesLowHalf =
	      isTargetShuffleEquivalent(ShufMask, { 0, 0 }) ||
	      isTargetShuffleEquivalent(ShufMask, { 0, 1, 0, 1 }) ||
	      isTargetShuffleEquivalent(ShufMask, { 0, 1, 2, 3, 0, 1, 2, 3 });
	  if (ReplicatesLowHalf)
	    return HorizOp;

	  return SDValue();
	}

	static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// If we have legalized the vector types, look for blends of FADD and FSUB
	// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
	if (TLI.isTypeLegal(VT)) {
	if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
	return AddSub;

	if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
	return FMSubAdd;

	if (SDValue HAddSub = foldShuffleOfHorizOp(N))
	return HAddSub;
	}

	// During Type Legalization, when promoting illegal vector types,
	// the backend might introduce new shuffle dag nodes and bitcasts.
	//
	// This code performs the following transformation:
	// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
	// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
	//
	// We do this only if both the bitcast and the BINOP dag nodes have
	// one use. Also, perform this transformation only if the new binary
	// operation is legal. This is to avoid introducing dag nodes that
	// potentially need to be further expanded (or custom lowered) into a
	// less optimal sequence of dag nodes.
	if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
	N->getOpcode() == ISD::VECTOR_SHUFFLE &&
	N->getOperand(0).getOpcode() == ISD::BITCAST &&
	N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	SDValue BC0 = N0.getOperand(0);
	EVT SVT = BC0.getValueType();
	unsigned Opcode = BC0.getOpcode();
	unsigned NumElts = VT.getVectorNumElements();

	if (BC0.hasOneUse() && SVT.isVector() &&
	SVT.getVectorNumElements() * 2 == NumElts &&
	TLI.isOperationLegal(Opcode, VT)) {
	bool CanFold = false;
	switch (Opcode) {
	default : break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	// isOperationLegal lies for integer ops on floating point types.
	CanFold = VT.isInteger();
	break;
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	// isOperationLegal lies for floating point ops on integer types.
	CanFold = VT.isFloatingPoint();
	break;
	}

	unsigned SVTNumElts = SVT.getVectorNumElements();
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
	for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
	for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) < 0;

	if (CanFold) {
	SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
	SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
	SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
	return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
	}
	}
	}

	// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
	// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
	// consecutive, non-overlapping, and in the right order.
	SmallVector<SDValue, 16> Elts;
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
	Elts.push_back(Elt);
	continue;
	}
	Elts.clear();
	break;
	}

	if (Elts.size() == VT.getVectorNumElements())
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
	return LD;

	// For AVX2, we sometimes want to combine
	// (vector_shuffle <mask> (concat_vectors t1, undef)
	// (concat_vectors t2, undef))
	// Into:
	// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
	// Since the latter can be efficiently lowered with VPERMD/VPERMQ
	if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
	return ShufConcat;

	if (isTargetShuffle(N->getOpcode())) {
	SDValue Op(N, 0);
	if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
	return Shuffle;

	// Try recursively combining arbitrary sequences of x86 shuffle
	// instructions into higher-order shuffles. We do this after combining
	// specific PSHUF instruction sequences into their minimal form so that we
	// can evaluate how many specialized shuffle instructions are involved in
	// a particular chain.
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	return SDValue();
	}

	/// Check if a vector extract from a target-specific shuffle of a load can be
	/// folded into a single element load.
	/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
	/// shuffles have been custom lowered so we need to handle those here.
	///
	/// \param N   The extract node; operand 0 is the input vector and operand 1
	///            is the element index.
	/// \param DAG The DAG used to build any replacement nodes.
	/// \param DCI Combiner state; nothing is done before operation legalization.
	/// \returns A zero constant or undef when the shuffle mask selects one, an
	///          extract from a rebuilt generic vector shuffle when every check
	///          passes, and an empty SDValue otherwise.
	static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
	                                                TargetLowering::DAGCombinerInfo &DCI) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue InVec = N->getOperand(0);
	  SDValue EltNo = N->getOperand(1);
	  EVT EltVT = N->getValueType(0);

	  // Only constant extraction indices can be mapped through the shuffle mask.
	  if (!isa<ConstantSDNode>(EltNo))
	    return SDValue();

	  EVT OriginalVT = InVec.getValueType();

	  // Peek through bitcasts, don't duplicate a load with other uses.
	  InVec = peekThroughOneUseBitcasts(InVec);

	  // The bitcasts must not have changed the element count, otherwise the
	  // extract index no longer lines up with the shuffle mask.
	  EVT CurrentVT = InVec.getValueType();
	  if (!CurrentVT.isVector() ||
	      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
	    return SDValue();

	  if (!isTargetShuffle(InVec.getOpcode()))
	    return SDValue();

	  // Don't duplicate a load with other uses.
	  if (!InVec.hasOneUse())
	    return SDValue();

	  SmallVector<int, 16> ShuffleMask;
	  SmallVector<SDValue, 2> ShuffleOps;
	  bool UnaryShuffle;
	  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
	                            ShuffleOps, ShuffleMask, UnaryShuffle))
	    return SDValue();

	  // Select the input vector, guarding against out of range extract vector.
	  // The index is kept in its original unsigned 64-bit form and compared with
	  // >=: an index equal to the element count is already one past the end of
	  // the mask, and narrowing to int first could turn a huge index negative.
	  unsigned NumElems = CurrentVT.getVectorNumElements();
	  auto Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
	  int Idx = (Elt >= NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

	  if (Idx == SM_SentinelZero)
	    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
	                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
	  if (Idx == SM_SentinelUndef)
	    return DAG.getUNDEF(EltVT);

	  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
	  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
	                                         : ShuffleOps[1];

	  // If inputs to shuffle are the same for both ops, then allow 2 uses
	  unsigned AllowedUses =
	      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

	  if (LdNode.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
	      return SDValue();

	    AllowedUses = 1; // only allow 1 load use if we have a bitcast
	    LdNode = LdNode.getOperand(0);
	  }

	  if (!ISD::isNormalLoad(LdNode.getNode()))
	    return SDValue();

	  // cast<> never yields null, so only the use count and volatility need
	  // checking here.
	  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

	  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
	    return SDValue();

	  // If there's a bitcast before the shuffle, check if the load type and
	  // alignment is valid.
	  unsigned Align = LN0->getAlignment();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      EltVT.getTypeForEVT(*DAG.getContext()));

	  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
	    return SDValue();

	  // All checks match so transform back to vector_shuffle so that DAG combiner
	  // can finish the job
	  SDLoc dl(N);

	  // Create shuffle node taking into account the case that its a unary shuffle
	  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
	  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
	                                 ShuffleMask);
	  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
	                     EltNo);
	}

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (v16i8 sext (v16i1 x)))
	// before the illegal vector is scalarized on subtargets that don't have legal
	// vxi1 types.
	//
	// BitCast is the bitcast node being combined (operand 0 is the source value).
	// Returns the replacement value, or an empty SDValue when nothing matches.
	static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
	const X86Subtarget &Subtarget) {
	EVT VT = BitCast.getValueType();
	SDValue N0 = BitCast.getOperand(0);
	EVT VecVT = N0->getValueType(0);

	// Opposite direction first: a vector result produced from a scalar integer
	// OR on AVX512. The result is rebuilt as a CONCAT_VECTORS of two half-width
	// mask vectors, one per OR operand.
	if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
	N0->getOpcode() == ISD::OR) {
	SDValue Op0 = N0->getOperand(0);
	SDValue Op1 = N0->getOperand(1);
	// Scalar and mask types describing one half of the result; any other result
	// type is left alone.
	MVT TrunckVT;
	MVT BitcastVT;
	switch (VT.getSimpleVT().SimpleTy) {
	default:
	return SDValue();
	case MVT::v16i1:
	TrunckVT = MVT::i8;
	BitcastVT = MVT::v8i1;
	break;
	case MVT::v32i1:
	TrunckVT = MVT::i16;
	BitcastVT = MVT::v16i1;
	break;
	case MVT::v64i1:
	TrunckVT = MVT::i32;
	BitcastVT = MVT::v32i1;
	break;
	}
	// Classify each OR operand purely by opcode: a SHL on one side and a
	// ZERO_EXTEND or AND on the other.
	// NOTE(review): the shift amount and the AND mask are not inspected here;
	// confirm that matching on the opcode alone is sufficient.
	bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
	bool isArg0UndefLeft =
	Op0->getOpcode() == ISD::ZERO_EXTEND \|\| Op0->getOpcode() == ISD::AND;
	bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
	bool isArg1UndefLeft =
	Op1->getOpcode() == ISD::ZERO_EXTEND \|\| Op1->getOpcode() == ISD::AND;
	SDValue OpLeft;
	SDValue OpRight;
	if (isArg0UndefRight && isArg1UndefLeft) {
	OpLeft = Op0;
	OpRight = Op1;
	} else if (isArg1UndefRight && isArg0UndefLeft) {
	OpLeft = Op1;
	OpRight = Op0;
	} else
	return SDValue();
	SDLoc DL(BitCast);
	// Despite its name, Shr is simply operand 0 of the SHL node; its own opcode
	// is not checked.
	SDValue Shr = OpLeft->getOperand(0);
	SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
	SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
	SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
	SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
	// NOTE(review): the half taken from the SHL operand is passed as the first
	// CONCAT_VECTORS operand; confirm this matches the intended element order.
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
	}

	// From here on only a scalar integer result taken from a simple vector type
	// is handled.
	if (!VT.isScalarInteger() \|\| !VecVT.isSimple())
	return SDValue();

	// With AVX512 vxi1 types are legal and we prefer using k-regs.
	// MOVMSK is supported in SSE2 or later.
	if (Subtarget.hasAVX512() \|\| !Subtarget.hasSSE2())
	return SDValue();

	// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
	// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
	// v8i16 and v16i16.
	// For these two cases, we can shuffle the upper element bytes to a
	// consecutive sequence at the start of the vector and treat the results as
	// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
	// for v16i16 this is not the case, because the shuffle is expensive, so we
	// avoid sign-extending to this type entirely.
	// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
	// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
	//
	// SExtVT is the integer vector type the mask is extended to. FPCastVT stays
	// invalid unless the extended value must additionally be bitcast to a
	// floating-point vector before the MOVMSK.
	MVT SExtVT;
	MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	switch (VecVT.getSimpleVT().SimpleTy) {
	default:
	return SDValue();
	case MVT::v2i1:
	SExtVT = MVT::v2i64;
	FPCastVT = MVT::v2f64;
	break;
	case MVT::v4i1:
	SExtVT = MVT::v4i32;
	FPCastVT = MVT::v4f32;
	// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
	// sign-extend to a 256-bit operation to avoid truncation.
	if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	N0->getOperand(0).getValueType().is256BitVector()) {
	SExtVT = MVT::v4i64;
	FPCastVT = MVT::v4f64;
	}
	break;
	case MVT::v8i1:
	SExtVT = MVT::v8i16;
	// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
	// sign-extend to a 256-bit operation to match the compare.
	// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
	// 256-bit because the shuffle is cheaper than sign extending the result of
	// the compare.
	if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	(N0->getOperand(0).getValueType().is256BitVector() \|\|
	N0->getOperand(0).getValueType().is512BitVector())) {
	SExtVT = MVT::v8i32;
	FPCastVT = MVT::v8f32;
	}
	break;
	case MVT::v16i1:
	SExtVT = MVT::v16i8;
	// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
	// it is not profitable to sign-extend to 256-bit because this will
	// require an extra cross-lane shuffle which is more expensive than
	// truncating the result of the compare to 128-bits.
	break;
	case MVT::v32i1:
	SExtVT = MVT::v32i8;
	break;
	};

	SDLoc DL(BitCast);
	// Bring the source to SExtVT (sign-extending or truncating as required).
	SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);

	if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
	// Handle pre-AVX2 cases by splitting to two v16i1's.
	// Each 128-bit half gets its own MOVMSK; the high result is shifted left by
	// 16 and ORed with the low result.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
	SDValue Lo = extract128BitVector(V, 0, DAG, DL);
	SDValue Hi = extract128BitVector(V, 16, DAG, DL);
	Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
	Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
	Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
	DAG.getConstant(16, DL, ShiftTy));
	V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
	return DAG.getZExtOrTrunc(V, DL, VT);
	}

	// A v8i16 value is narrowed to v16i8 with PACKSS (second input undef) before
	// the MOVMSK; no other i16 vector type is expected to reach this point.
	if (SExtVT == MVT::v8i16) {
	assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
	V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
	DAG.getUNDEF(MVT::v8i16));
	} else
	assert(SExtVT.getScalarType() != MVT::i16 &&
	"Vectors of i16 must be packed");
	if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
	V = DAG.getBitcast(FPCastVT, V);
	// MOVMSK yields an i32; zero-extend or truncate it to the requested scalar
	// type.
	V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	return DAG.getZExtOrTrunc(V, DL, VT);
	}

	/// Target combine for bitcast nodes.
	///
	/// In order, this tries: the vXi1 <-> scalar folds that only run before type
	/// legalization, the x86mmx special cases, and finally turning a bitcast of
	/// an integer AND/OR/XOR into the matching X86 floating-point logic node.
	/// Returns an empty SDValue when none of them applies.
	static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI,
	                              const X86Subtarget &Subtarget) {
	  const SDValue Src = N->getOperand(0);
	  const EVT DstVT = N->getValueType(0);
	  const EVT SrcVT = Src.getValueType();

	  if (DCI.isBeforeLegalize()) {
	    // (i16 bitcast (v16i1 x)) -> (i16 movmsk (v16i8 sext (v16i1 x))) and
	    // friends, done before the setcc result is scalarized on subtargets
	    // without legal vXi1 types.
	    if (SDValue MoveMask = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
	      return MoveMask;

	    auto IsNarrowMaskVT = [](EVT Ty) {
	      return Ty == MVT::v4i1 || Ty == MVT::v2i1;
	    };

	    if (Subtarget.hasVLX()) {
	      // Scalar integer -> v4i1/v2i1: go through i8/v8i1 and take the low
	      // subvector, avoiding a trip through memory.
	      if (IsNarrowMaskVT(DstVT) && SrcVT.isScalarInteger()) {
	        SDLoc dl(N);
	        SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, Src);
	        Wide = DAG.getBitcast(MVT::v8i1, Wide);
	        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Wide,
	                           DAG.getIntPtrConstant(0, dl));
	      }

	      // v4i1/v2i1 -> scalar integer: pad with undef up to v8i1, bitcast to
	      // i8 and truncate, again avoiding a trip through memory.
	      if (IsNarrowMaskVT(SrcVT) && DstVT.isScalarInteger()) {
	        SDLoc dl(N);
	        const unsigned NumPieces = 8 / SrcVT.getVectorNumElements();
	        SmallVector<SDValue, 4> Pieces(NumPieces, DAG.getUNDEF(SrcVT));
	        Pieces[0] = Src;
	        SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Pieces);
	        Wide = DAG.getBitcast(MVT::i8, Wide);
	        return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Wide);
	      }
	    }
	  }

	  // MMX types rarely mix with other vector types, so handle them early and
	  // avoid store/load round trips.
	  if (DstVT == MVT::x86mmx) {
	    switch (Src.getOpcode()) {
	    case ISD::BUILD_VECTOR:
	      // An i32 placed in the low word of a v2i32 whose high word is zero.
	      if (SrcVT == MVT::v2i32 && isNullConstant(Src.getOperand(1))) {
	        SDValue Low = Src->getOperand(0);
	        if (Low.getValueType() == MVT::i32)
	          return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(Low), DstVT, Low);
	      }
	      break;
	    case ISD::EXTRACT_VECTOR_ELT:
	    case ISD::EXTRACT_SUBVECTOR:
	      // Element or subvector 0 extracted from a 128-bit vector.
	      if (isNullConstant(Src.getOperand(1))) {
	        SDValue Vec = Src->getOperand(0);
	        if (Vec.getValueType().is128BitVector())
	          return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(Vec), DstVT,
	                             DAG.getBitcast(MVT::v2i64, Vec));
	      }
	      break;
	    case ISD::FP_TO_SINT:
	      // A v2i32 conversion result: widen with undef, then move to MMX.
	      if (SrcVT == MVT::v2i32) {
	        SDLoc DL(Src);
	        SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                                      DAG.getUNDEF(MVT::v2i32));
	        return DAG.getNode(X86ISD::MOVDQ2Q, DL, DstVT,
	                           DAG.getBitcast(MVT::v2i64, Widened));
	      }
	      break;
	    default:
	      break;
	    }
	  }

	  // Convert a bitcasted integer logic operation that has one bitcasted
	  // floating-point operand into a floating-point logic operation. This may
	  // create a load of a constant, but that is cheaper than materializing the
	  // constant in an integer register and transferring it to an SSE register,
	  // or transferring the SSE operand to an integer register and back.
	  unsigned FPOpcode;
	  switch (Src.getOpcode()) {
	  case ISD::AND:
	    FPOpcode = X86ISD::FAND;
	    break;
	  case ISD::OR:
	    FPOpcode = X86ISD::FOR;
	    break;
	  case ISD::XOR:
	    FPOpcode = X86ISD::FXOR;
	    break;
	  default:
	    return SDValue();
	  }

	  // Only f32 with SSE1 and f64 with SSE2 are handled.
	  const bool IsF32 = Subtarget.hasSSE1() && DstVT == MVT::f32;
	  const bool IsF64 = Subtarget.hasSSE2() && DstVT == MVT::f64;
	  if (!IsF32 && !IsF64)
	    return SDValue();

	  const SDValue LHS = Src.getOperand(0);
	  const SDValue RHS = Src.getOperand(1);
	  SDLoc LogicDL(Src);

	  // Both rewrites below need the integer logic op to have a single use.
	  if (!Src.hasOneUse())
	    return SDValue();

	  // If Cast is a one-use bitcast of a non-constant value of the destination
	  // type, build logic'(that value, bitcast(Other)).
	  auto FoldThroughCast = [&](SDValue Cast, SDValue Other) -> SDValue {
	    if (Cast.getOpcode() != ISD::BITCAST || !Cast.hasOneUse())
	      return SDValue();
	    SDValue FPVal = Cast.getOperand(0);
	    if (FPVal.getValueType() != DstVT || isa<ConstantSDNode>(FPVal))
	      return SDValue();
	    SDValue OtherAsFP = DAG.getBitcast(DstVT, Other);
	    return DAG.getNode(FPOpcode, LogicDL, DstVT, FPVal, OtherAsFP);
	  };

	  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
	  if (SDValue Folded = FoldThroughCast(LHS, RHS))
	    return Folded;
	  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
	  if (SDValue Folded = FoldThroughCast(RHS, LHS))
	    return Folded;

	  return SDValue();
	}

	// Recognise a horizontal reduction written as a pyramid of binops whose
	// second input is a shuffle of the first.
	//
	// Extract must be an extract of element 0. On success BinOp receives the
	// opcode used throughout the pyramid (one of CandidateBinOps) and the vector
	// being reduced is returned; otherwise an empty SDValue is returned and BinOp
	// is left untouched.
	static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
	                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
	  // Only an extract from lane 0 can terminate the pyramid.
	  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();
	  if (!isNullConstant(Extract->getOperand(1)))
	    return SDValue();

	  SDValue Cur = Extract->getOperand(0);
	  const unsigned NumStages =
	      Log2_32(Cur.getValueType().getVectorNumElements());

	  // The outermost node decides which candidate opcode the pyramid must use.
	  const unsigned RootOpc = Cur.getOpcode();
	  bool IsCandidate = false;
	  for (ISD::NodeType Candidate : CandidateBinOps) {
	    if (RootOpc == unsigned(Candidate)) {
	      IsCandidate = true;
	      break;
	    }
	  }
	  if (!IsCandidate)
	    return SDValue();

	  // Walking away from the extract, each stage must look like:
	  //   %s = shufflevector <8 x i32> %op, <8 x i32> undef,
	  //                      <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
	  //                                 i32 undef, i32 undef, i32 undef, i32 undef>
	  //   %a = binop <8 x i32> %op, %s
	  // with the mask depending on the stage. For a 3-stage pyramid the masks,
	  // from the source towards the extract, are:
	  //   <4,5,6,7,u,u,u,u>
	  //   <2,3,u,u,u,u,u,u>
	  //   <1,u,u,u,u,u,u,u>
	  for (unsigned Stage = 0; Stage != NumStages; ++Stage) {
	    if (Cur.getOpcode() != RootOpc)
	      return SDValue();

	    const SDValue LHS = Cur.getOperand(0);
	    const SDValue RHS = Cur.getOperand(1);

	    // Prefer a shuffle on the left; otherwise expect it on the right. The
	    // remaining operand is the value carried to the next stage.
	    auto *Shuf = dyn_cast<ShuffleVectorSDNode>(LHS.getNode());
	    if (Shuf) {
	      Cur = RHS;
	    } else {
	      Shuf = dyn_cast<ShuffleVectorSDNode>(RHS.getNode());
	      Cur = LHS;
	    }

	    // The shuffle has to read from the very value it is combined with.
	    if (!Shuf)
	      return SDValue();
	    if (Shuf->getOperand(0) != Cur)
	      return SDValue();

	    // The low HalfWidth lanes must pull in the next HalfWidth lanes.
	    const int HalfWidth = 1 << Stage;
	    for (int Lane = 0; Lane != HalfWidth; ++Lane)
	      if (Shuf->getMaskElt(Lane) != HalfWidth + Lane)
	        return SDValue();
	  }

	  BinOp = RootOpc;
	  return Cur;
	}

	// Given a select, detect the following pattern:
	// 1: %2 = zext <N x i8> %0 to <N x i32>
	// 2: %3 = zext <N x i8> %1 to <N x i32>
	// 3: %4 = sub nsw <N x i32> %2, %3
	// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
	// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
	// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
	// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
	SDValue &Op1) {
	// Check the condition of the select instruction is greater-than.
	SDValue SetCC = Select->getOperand(0);
	if (SetCC.getOpcode() != ISD::SETCC)
	return false;
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	if (CC != ISD::SETGT && CC != ISD::SETLT)
	return false;

	SDValue SelectOp1 = Select->getOperand(1);
	SDValue SelectOp2 = Select->getOperand(2);

	// The following instructions assume SelectOp1 is the subtraction operand
	// and SelectOp2 is the negation operand.
	// In the case of SETLT this is the other way around.
	if (CC == ISD::SETLT)
	std::swap(SelectOp1, SelectOp2);

	// The second operand of the select should be the negation of the first
	// operand, which is implemented as 0 - SelectOp1.
	if (!(SelectOp2.getOpcode() == ISD::SUB &&
	ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
	SelectOp2.getOperand(1) == SelectOp1))
	return false;

	// The first operand of SetCC is the first operand of the select, which is the
	// difference between the two input vectors.
	if (SetCC.getOperand(0) != SelectOp1)
	return false;

	// In SetLT case, The second operand of the comparison can be either 1 or 0.
	APInt SplatVal;
	if ((CC == ISD::SETLT) &&
	!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
	SplatVal.isOneValue()) \|\|
	(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
	return false;

	// In SetGT case, The second operand of the comparison can be either -1 or 0.
	if ((CC == ISD::SETGT) &&
	!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) \|\|
	ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
	return false;

	// The first operand of the select is the difference between the two input
	// vectors.
	if (SelectOp1.getOpcode() != ISD::SUB)
	return false;

	Op0 = SelectOp1.getOperand(0);
	Op1 = SelectOp1.getOperand(1);

	// Check if the operands of the sub are zero-extended from vectors of i8.
	if (Op0.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 \|\|
	Op1.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
	return false;

	return true;
	}

	// Build a PSADBW from the i8 vectors feeding two zero-extends.
	// Zext0 and Zext1 are the extension nodes; only their operand 0 is used.
	static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
	                            const SDValue &Zext1, const SDLoc &DL) {
	  // The PSADBW operates on at least 128 bits, more if the input is wider.
	  const EVT NarrowVT = Zext0.getOperand(0).getValueType();
	  const unsigned NarrowBits = NarrowVT.getSizeInBits();
	  const unsigned RegSize = std::max(128u, NarrowBits);

	  // "Zero-extend" the i8 vectors: not a per-element zext, the missing upper
	  // vector elements are simply filled with zero.
	  const unsigned NumPieces = RegSize / NarrowBits;
	  const SDValue ZeroPiece = DAG.getConstant(0, DL, NarrowVT);
	  const MVT ByteVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
	  auto PadWithZeros = [&](const SDValue &Bytes) {
	    SmallVector<SDValue, 16> Pieces(NumPieces, ZeroPiece);
	    Pieces[0] = Bytes;
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, ByteVT, Pieces);
	  };
	  const SDValue SadLHS = PadWithZeros(Zext0.getOperand(0));
	  const SDValue SadRHS = PadWithZeros(Zext1.getOperand(0));

	  // The SAD result has one i64 element per 64 bits of register.
	  const MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
	  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadLHS, SadRHS);
	}

	// Try to lower a horizontal SMAX/SMIN/UMAX/UMIN reduction over i16 or i8
	// elements to a single PHMINPOSUW. Returns an empty SDValue if the pattern
	// does not match.
	static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  // Nothing to do without SSE41.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  const EVT ScalarVT = Extract->getValueType(0);
	  const bool IsByteReduction = ScalarVT == MVT::i8;
	  if (!IsByteReduction && ScalarVT != MVT::i16)
	    return SDValue();

	  // Look for one of the four min/max reduction pyramids.
	  unsigned ReduceOpc;
	  SDValue Reduced = matchBinOpReduction(
	      Extract, ReduceOpc, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
	  if (!Reduced)
	    return SDValue();

	  // The source must have the extracted element type and be a whole number of
	  // 128-bit registers.
	  EVT VecVT = Reduced.getValueType();
	  const EVT LaneVT = VecVT.getScalarType();
	  if (LaneVT != ScalarVT)
	    return SDValue();
	  if ((VecVT.getSizeInBits() % 128) != 0)
	    return SDValue();

	  SDLoc DL(Extract);

	  // Fold the vector in half with the reduction op until 128 bits remain.
	  while (VecVT.getSizeInBits() > 128) {
	    const unsigned HalfElts = VecVT.getVectorNumElements() / 2;
	    VecVT = EVT::getVectorVT(*DAG.getContext(), LaneVT, HalfElts);
	    const unsigned HalfBits = VecVT.getSizeInBits();
	    SDValue LoHalf = extractSubVector(Reduced, 0, DAG, DL, HalfBits);
	    SDValue HiHalf = extractSubVector(Reduced, HalfElts, DAG, DL, HalfBits);
	    Reduced = DAG.getNode(ReduceOpc, DL, VecVT, LoHalf, HiHalf);
	  }
	  assert(((VecVT == MVT::v8i16 && ScalarVT == MVT::i16) ||
	          (VecVT == MVT::v16i8 && ScalarVT == MVT::i8)) &&
	         "Unexpected value type");

	  // PHMINPOSUW computes UMIN(v8i16). The other three reductions are mapped
	  // onto it by XOR-ing with a constant before and after; UMIN needs none.
	  SDValue Flip;
	  const unsigned LaneBits = ScalarVT.getSizeInBits();
	  switch (ReduceOpc) {
	  case ISD::SMAX:
	    Flip = DAG.getConstant(APInt::getSignedMaxValue(LaneBits), DL, VecVT);
	    break;
	  case ISD::SMIN:
	    Flip = DAG.getConstant(APInt::getSignedMinValue(LaneBits), DL, VecVT);
	    break;
	  case ISD::UMAX:
	    Flip = DAG.getConstant(APInt::getAllOnesValue(LaneBits), DL, VecVT);
	    break;
	  default:
	    break;
	  }

	  if (Flip)
	    Reduced = DAG.getNode(ISD::XOR, DL, VecVT, Flip, Reduced);

	  // For v16i8, first take the UMIN of each byte pair: shuffle every odd byte
	  // down next to a zero byte and UMIN with the original. The upper byte of
	  // each pair ends up zero, i.e. the result is zero-extended and ready for
	  // the PHMINPOS.
	  if (IsByteReduction) {
	    SDValue ZeroBytes = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
	    SDValue OddBytes = DAG.getVectorShuffle(
	        VecVT, DL, Reduced, ZeroBytes,
	        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
	    Reduced = DAG.getNode(ISD::UMIN, DL, VecVT, Reduced, OddBytes);
	  }

	  // Run the PHMINPOS on a v8i16 view of the data.
	  Reduced = DAG.getBitcast(MVT::v8i16, Reduced);
	  Reduced = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, Reduced);
	  Reduced = DAG.getBitcast(VecVT, Reduced);

	  // Undo the XOR applied above.
	  if (Flip)
	    Reduced = DAG.getNode(ISD::XOR, DL, VecVT, Flip, Reduced);

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Reduced,
	                     DAG.getIntPtrConstant(0, DL));
	}

	// Try to turn an any_of (OR) / all_of (AND) horizontal reduction of an
	// all-sign-bits vector into a MOVMSK followed by a compare. Returns an empty
	// SDValue if the pattern does not match.
	static SDValue combineHorizontalPredicateResult(SDNode *Extract,
	                                                SelectionDAG &DAG,
	                                                const X86Subtarget &Subtarget) {
	  // MOVMSK needs SSE2; with AVX512VL predicate registers are used instead.
	  if (!Subtarget.hasSSE2())
	    return SDValue();
	  if (Subtarget.hasVLX())
	    return SDValue();

	  const EVT ScalarVT = Extract->getValueType(0);
	  const unsigned ScalarBits = ScalarVT.getSizeInBits();
	  const bool IsSupportedScalar = ScalarVT == MVT::i64 ||
	                                 ScalarVT == MVT::i32 ||
	                                 ScalarVT == MVT::i16 || ScalarVT == MVT::i8;
	  if (!IsSupportedScalar)
	    return SDValue();

	  // OR reductions are any_of, AND reductions are all_of.
	  unsigned ReduceOpc = 0;
	  SDValue Vec = matchBinOpReduction(Extract, ReduceOpc, {ISD::OR, ISD::AND});
	  if (!Vec)
	    return SDValue();

	  // EXTRACT_VECTOR_ELT may implicitly extend the element; that case is not
	  // supported for now.
	  if (Vec.getScalarValueSizeInBits() != ScalarBits)
	    return SDValue();

	  // 128-bit vectors are always fine. 256-bit vectors need AVX for 32/64-bit
	  // elements and AVX2 (PMOVMSKB on v16i16/v32i8) otherwise.
	  const unsigned VecBits = Vec.getValueSizeInBits();
	  const bool Is128 = VecBits == 128;
	  const bool IsUsable256 =
	      VecBits == 256 &&
	      ((Subtarget.hasAVX() && ScalarBits >= 32) || Subtarget.hasAVX2());
	  if (!Is128 && !IsUsable256)
	    return SDValue();

	  // Not worth it for vectors of two elements or fewer.
	  if (Vec.getValueType().getVectorNumElements() <= 2)
	    return SDValue();

	  // Every element must consist purely of copies of its sign bit.
	  if (DAG.ComputeNumSignBits(Vec) != ScalarBits)
	    return SDValue();

	  // 32/64-bit elements go through MOVMSKPS/MOVMSKPD, narrower ones through
	  // PMOVMSKB.
	  const bool UseFPMask = ScalarBits == 64 || ScalarBits == 32;
	  const MVT MaskVT =
	      UseFPMask ? MVT::getVectorVT(MVT::getFloatingPointVT(ScalarBits),
	                                   VecBits / ScalarBits)
	                : MVT::getVectorVT(MVT::i8, VecBits / 8);

	  // any_of -> MOVMSK != 0
	  // all_of -> MOVMSK == ((1 << NumElts) - 1)
	  const bool IsAnyOf = ReduceOpc == ISD::OR;
	  const APInt Expected =
	      IsAnyOf ? APInt::getNullValue(32)
	              : APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
	  const ISD::CondCode Pred = IsAnyOf ? ISD::SETNE : ISD::SETEQ;

	  // Do the select in i32/i64 and truncate afterwards to avoid partial
	  // register stalls.
	  const unsigned ResWidth = std::max(ScalarBits, 32u);
	  const EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
	  SDLoc DL(Extract);
	  SDValue AllZero = DAG.getConstant(0, DL, ResVT);
	  SDValue AllOnes = DAG.getAllOnesConstant(DL, ResVT);
	  SDValue MaskBits = DAG.getBitcast(MaskVT, Vec);
	  MaskBits = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, MaskBits);
	  SDValue Selected =
	      DAG.getSelectCC(DL, MaskBits, DAG.getConstant(Expected, DL, MVT::i32),
	                      AllOnes, AllZero, Pred);
	  return DAG.getSExtOrTrunc(Selected, DL, ScalarVT);
	}

	// Recognise an add-reduction over the absolute difference of two
	// zero-extended i8 vectors and lower it to PSADBW. Extract is the extract
	// node terminating the reduction. Returns an empty SDValue if the pattern
	// does not match.
	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  // PSADBW requires SSE2 or later.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // The vector being extracted from must be a simple type whose elements are
	  // wider than 16 bits.
	  const EVT VecVT = Extract->getOperand(0).getValueType();
	  if (!VecVT.isSimple())
	    return SDValue();
	  if (VecVT.getVectorElementType().getSizeInBits() <= 16)
	    return SDValue();

	  // Register width to work with: 512 with BWI, 256 with AVX2, 128 otherwise.
	  const unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // Up to v16i* is handled for SSE2, v32i* for AVX2 and v64i* for AVX512.
	  // TODO: Larger vectors could be split, fed to several SADs and the results
	  // reduced afterwards.
	  const unsigned NumElts = VecVT.getVectorNumElements();
	  if (RegSize / NumElts < 8)
	    return SDValue();

	  // Match the shuffle + add pyramid.
	  unsigned ReduceOpc = 0;
	  SDValue Root = matchBinOpReduction(Extract, ReduceOpc, {ISD::ADD});
	  if (!Root)
	    return SDValue();

	  // The inputs are zero-extended from i8 (checked by detectZextAbsDiff), so
	  // reaching i64 and wider takes one more any/zero/sign extend. Zero-extending
	  // from 32 bits does not change the value, and a sign extend is effectively a
	  // zero extend here because the sign bit is zero, so the extension can be
	  // skipped.
	  switch (Root.getOpcode()) {
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    Root = Root.getOperand(0);
	    break;
	  default:
	    break;
	  }

	  // Root must now be the select at the top of an abs-diff pattern.
	  if (Root.getOpcode() != ISD::VSELECT)
	    return SDValue();

	  // Confirm the abs-diff pattern and fetch its two zero-extended inputs.
	  SDValue Zext0, Zext1;
	  if (!detectZextAbsDiff(Root, Zext0, Zext1))
	    return SDValue();

	  // Emit the SAD instruction.
	  SDLoc DL(Extract);
	  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

	  // When the original vector had more than 8 elements, the SAD vector itself
	  // still has to be summed with a shuffle + add pyramid.
	  const unsigned NumStages = Log2_32(NumElts);
	  const MVT SadVT = SAD.getSimpleValueType();
	  if (NumStages > 3) {
	    const unsigned SadElems = SadVT.getVectorNumElements();

	    // HalfWidth runs over 2^(NumStages-4), ..., 2, 1.
	    for (unsigned HalfWidth = 1u << (NumStages - 4); HalfWidth != 0;
	         HalfWidth >>= 1) {
	      SmallVector<int, 16> Mask(SadElems, -1);
	      for (unsigned Lane = 0; Lane != HalfWidth; ++Lane)
	        Mask[Lane] = HalfWidth + Lane;

	      SDValue Upper =
	          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Upper);
	    }
	  }

	  // Reinterpret the SAD vector in the extracted element type and return its
	  // lowest element.
	  const MVT ResEltVT = Extract->getSimpleValueType(0);
	  const unsigned ResEltBits = ResEltVT.getSizeInBits();
	  const MVT ResVT =
	      MVT::getVectorVT(ResEltVT, SadVT.getSizeInBits() / ResEltBits);
	  SAD = DAG.getBitcast(ResVT, SAD);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResEltVT, SAD,
	                     Extract->getOperand(1));
	}

	// Attempt to peek through a target shuffle and extract the scalar from the
	// source.
	//
	// N is the extract node (operand 0 = source vector, operand 1 = index).
	// Nothing is done before operation legalization, for vXi1 sources, or for
	// non-constant indices. Returns the replacement value, or an empty SDValue
	// when the extract cannot be redirected.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  SDValue Idx = N->getOperand(1);

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcSVT = SrcVT.getVectorElementType();
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // Don't attempt this for boolean mask vectors or unknown extraction indices.
	  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
	  if (X86ISD::VBROADCAST == Src.getOpcode() &&
	      Src.getOperand(0).getValueType() == VT)
	    return Src.getOperand(0);

	  // Resolve the target shuffle inputs and mask.
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Ops;
	  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
	    return SDValue();

	  // Attempt to narrow/widen the shuffle mask to the correct size.
	  if (Mask.size() != NumSrcElts) {
	    if ((NumSrcElts % Mask.size()) == 0) {
	      SmallVector<int, 16> ScaledMask;
	      int Scale = NumSrcElts / Mask.size();
	      scaleShuffleMask<int>(Scale, Mask, ScaledMask);
	      Mask = std::move(ScaledMask);
	    } else if ((Mask.size() % NumSrcElts) == 0) {
	      SmallVector<int, 16> WidenedMask;
	      while (Mask.size() > NumSrcElts &&
	             canWidenShuffleElements(Mask, WidenedMask))
	        Mask = std::move(WidenedMask);
	      // TODO - investigate support for wider shuffle masks with known upper
	      // undef/zero elements for implicit zero-extension.
	    }
	  }

	  // Check if narrowing/widening failed.
	  if (Mask.size() != NumSrcElts)
	    return SDValue();

	  // A constant index past the end of the source vector would read beyond the
	  // shuffle mask; leave such extracts alone.
	  auto ExtractIdx = N->getConstantOperandVal(1);
	  if (ExtractIdx >= NumSrcElts)
	    return SDValue();

	  int SrcIdx = Mask[ExtractIdx];
	  SDLoc dl(N);

	  // If the shuffle source element is undef/zero then we can just accept it.
	  if (SrcIdx == SM_SentinelUndef)
	    return DAG.getUNDEF(VT);

	  if (SrcIdx == SM_SentinelZero)
	    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
	                                : DAG.getConstant(0, dl, VT);

	  // Mask indices address the concatenation of all inputs: the quotient picks
	  // the input, the remainder the element within it.
	  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
	  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	  SrcIdx = SrcIdx % Mask.size();

	  // We can only extract other elements from 128-bit vectors and in certain
	  // circumstances, depending on SSE-level.
	  // TODO: Investigate using extract_subvector for larger vectors.
	  // TODO: Investigate float/double extraction if it will be just stored.
	  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
	      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
	    assert(SrcSVT == VT && "Unexpected extraction type");
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
	                       DAG.getIntPtrConstant(SrcIdx, dl));
	  }

	  // i16/i8 elements are extracted as an i32 via PEXTRW/PEXTRB, then
	  // zero-extended or truncated to the requested type.
	  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
	           "Unexpected extraction type");
	    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
	    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
	                                DAG.getIntPtrConstant(SrcIdx, dl));
	    return DAG.getZExtOrTrunc(ExtOp, dl, VT);
	  }

	  return SDValue();
	}

	/// Detect vector gather/scatter index generation and convert it from being a
	/// bunch of shuffles and extracts into a somewhat faster sequence.
	/// For i686, the best sequence is apparently storing the value and loading
	/// scalars back, while for x64 we should use 64-bit extracts and shifts.
	///
	/// Before attempting that rewrite, a series of cheaper folds is tried in
	/// order: extract-of-shuffle, extract-of-shuffle-into-load, the two MMX
	/// bitcast patterns, i1 extraction from a bitcast constant, and the SAD /
	/// horizontal-predicate / horizontal-min-max reductions.
	///
	/// \returns the replacement value, SDValue(N, 0) when the uses were rewritten
	/// in place, or an empty SDValue when nothing was changed.
	static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
	    return NewOp;

	  // TODO - Remove this once we can handle the implicit zero-extension of
	  // X86ISD::PEXTRW/X86ISD::PEXTRB in:
	  // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
	  // combineBasicSADPattern.
	  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
	    return NewOp;

	  SDValue InputVector = N->getOperand(0);
	  SDValue EltIdx = N->getOperand(1);

	  EVT SrcVT = InputVector.getValueType();
	  EVT VT = N->getValueType(0);
	  SDLoc dl(InputVector);

	  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getBitcast(VT, InputVector);
	  }

	  // Detect mmx to i32 conversion through a v2i32 elt extract.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
	  }

	  // Extracting an i1 from a bitcast of a scalar constant: fold to the
	  // selected bit of that constant.
	  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
	      isa<ConstantSDNode>(EltIdx) &&
	      isa<ConstantSDNode>(InputVector.getOperand(0))) {
	    uint64_t ExtractedElt = N->getConstantOperandVal(1);
	    uint64_t InputValue = InputVector.getConstantOperandVal(0);
	    uint64_t Res = (InputValue >> ExtractedElt) & 1;
	    return DAG.getConstant(Res, dl, MVT::i1);
	  }

	  // Check whether this extract is the root of a sum of absolute differences
	  // pattern. This has to be done here because we really want it to happen
	  // pre-legalization,
	  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
	    return SAD;

	  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
	  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
	    return Cmp;

	  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
	  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
	    return MinMax;

	  // Only operate on vectors of 4 elements, where the alternative shuffling
	  // gets to be more expensive.
	  if (SrcVT != MVT::v4i32)
	    return SDValue();

	  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
	  // single use which is a sign-extend or zero-extend, and all elements are
	  // used.
	  SmallVector<SDNode *, 4> Uses;
	  unsigned ExtractedElements = 0;
	  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
	       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
	    if (UI.getUse().getResNo() != InputVector.getResNo())
	      return SDValue();

	    SDNode *Extract = *UI;
	    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    if (Extract->getValueType(0) != MVT::i32)
	      return SDValue();
	    if (!Extract->hasOneUse())
	      return SDValue();
	    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
	        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
	      return SDValue();
	    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
	      return SDValue();

	    // A v4i32 source only has lanes 0-3. Reject anything else up front so the
	    // shift below is always well defined and Vals[] is never indexed out of
	    // range later on.
	    uint64_t ExtractIdx = Extract->getConstantOperandVal(1);
	    if (ExtractIdx >= 4)
	      return SDValue();

	    // Record which element was extracted.
	    ExtractedElements |= 1u << ExtractIdx;
	    Uses.push_back(Extract);
	  }

	  // If not all the elements were used, this may not be worthwhile.
	  if (ExtractedElements != 15)
	    return SDValue();

	  // Ok, we've now decided to do the transformation.
	  // If 64-bit shifts are legal, use the extract-shift sequence,
	  // otherwise bounce the vector off the cache.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Vals[4];

	  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
	    // View the vector as two i64 halves; each half yields two i32 lanes via
	    // truncate (low lane) and shift-right-by-32 + truncate (high lane).
	    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
	    auto &DL = DAG.getDataLayout();
	    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
	    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                     DAG.getConstant(0, dl, VecIdxTy));
	    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                  DAG.getConstant(1, dl, VecIdxTy));

	    SDValue ShAmt = DAG.getConstant(
	        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
	    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
	    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	      DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
	    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
	    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	      DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
	  } else {
	    // Store the value to a temporary stack slot.
	    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
	    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
	                              MachinePointerInfo());

	    EVT ElementType = SrcVT.getVectorElementType();
	    unsigned EltSize = ElementType.getSizeInBits() / 8;

	    // Replace each use (extract) with a load of the appropriate element.
	    for (unsigned i = 0; i < 4; ++i) {
	      uint64_t Offset = EltSize * i;
	      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

	      SDValue ScalarAddr =
	          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

	      // Load the scalar.
	      Vals[i] =
	          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
	    }
	  }

	  // Replace the extracts
	  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
	       UE = Uses.end(); UI != UE; ++UI) {
	    SDNode *Extract = *UI;

	    // Every recorded index was checked to be < 4 when Uses was populated.
	    uint64_t IdxVal = Extract->getConstantOperandVal(1);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
	  }

	  // The replacement was made in place; return N so it won't be revisited.
	  return SDValue(N, 0);
	}

	/// If a vector select has an operand that is -1 or 0, try to simplify the
	/// select to a bitwise logic operation.
	///
	/// Only ISD::VSELECT nodes are handled; any other opcode yields an empty
	/// SDValue. The folds performed are:
	///   vselect Cond, 111..., 000... -> Cond
	///   vselect Cond, 111..., X      -> or Cond, X
	///   vselect Cond, X, 000...      -> and Cond, X
	///   vselect Cond, 000..., X      -> andn Cond, X
	/// plus, on AVX512 with a vXi1 condition, swapping the arms under an inverted
	/// condition when the true value is all zeros.
	///
	/// \returns the simplified value, or an empty SDValue if no fold applies.
	/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
	static SDValue
	combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (N->getOpcode() != ISD::VSELECT)
	    return SDValue();

	  assert(CondVT.isVector() && "Vector select expects a vector selector!");

	  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
	  // Check if the first operand is all zeros and Cond type is vXi1.
	  // This situation only applies to avx512.
	  if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    // Invert the cond to not(cond) : xor(op,allones)=not(op)
	    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
	                                  DAG.getAllOnesConstant(DL, CondVT));
	    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
	    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
	  }

	  // To use the condition operand as a bitwise mask, it must have elements that
	  // are the same size as the select elements. Ie, the condition operand must
	  // have already been promoted from the IR select condition type <N x i1>.
	  // Don't check if the types themselves are equal because that excludes
	  // vector floating-point selects.
	  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
	    return SDValue();

	  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
	  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

	  // Try to invert the condition if true value is not all 1s and false value is
	  // not all 0s.
	  if (!TValIsAllOnes && !FValIsAllZeros &&
	      // Check if the selector will be produced by CMPP/PCMP.
	      Cond.getOpcode() == ISD::SETCC &&
	      // Check if SETCC has already been promoted.
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
	          CondVT) {
	    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

	    if (TValIsAllZeros || FValIsAllOnes) {
	      // Rebuild the compare with the inverse predicate and swap the select
	      // arms so the constant operand lands in the position the folds below
	      // expect.
	      SDValue CC = Cond.getOperand(2);
	      ISD::CondCode NewCC =
	          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
	                               Cond.getOperand(0).getValueType().isInteger());
	      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
	                          NewCC);
	      std::swap(LHS, RHS);
	      TValIsAllOnes = FValIsAllOnes;
	      FValIsAllZeros = TValIsAllZeros;
	    }
	  }

	  // Cond value must be 'sign splat' to be converted to a logical op.
	  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
	    return SDValue();

	  // vselect Cond, 111..., 000... -> Cond
	  if (TValIsAllOnes && FValIsAllZeros)
	    return DAG.getBitcast(VT, Cond);

	  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
	    return SDValue();

	  // vselect Cond, 111..., X -> or Cond, X
	  if (TValIsAllOnes) {
	    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
	    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
	    return DAG.getBitcast(VT, Or);
	  }

	  // vselect Cond, X, 000... -> and Cond, X
	  if (FValIsAllZeros) {
	    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
	    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
	    return DAG.getBitcast(VT, And);
	  }

	  // vselect Cond, 000..., X -> andn Cond, X
	  if (TValIsAllZeros) {
	    // ANDNP is built on a vector of i64 covering the same number of bits.
	    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
	    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
	    SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
	    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
	    return DAG.getBitcast(VT, AndN);
	  }

	  return SDValue();
	}

	/// Fold a select between two integer constants, controlled by an i1
	/// condition, into arithmetic on the zero-extended condition:
	///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	///
	/// The fold is only done when the result type is legal, the subtraction
	/// TC - FC does not overflow, and |TC - FC| is a power of two (or 3, 5 or 9
	/// for i32/i64).
	///
	/// \returns the arithmetic replacement, or an empty SDValue if the pattern
	/// does not match.
	static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  SDLoc DL(N);

	  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
	  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
	  if (!TrueC || !FalseC)
	    return SDValue();

	  // Don't do this for crazy integer types.
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // We're going to use the condition bit in math or logic ops. We could allow
	  // this with a wider condition value (post-legalization it becomes an i8),
	  // but if nothing is creating selects that late, it doesn't matter.
	  if (Cond.getValueType() != MVT::i1)
	    return SDValue();

	  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
	  // 3, 5, or 9 with i32/i64, so those get transformed too.
	  // TODO: For constants that overflow or do not differ by power-of-2 or small
	  // multiplier, convert to 'and' + 'add'.
	  const APInt &TrueVal = TrueC->getAPIntValue();
	  const APInt &FalseVal = FalseC->getAPIntValue();
	  bool OV;
	  APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
	  if (OV)
	    return SDValue();

	  APInt AbsDiff = Diff.abs();
	  if (AbsDiff.isPowerOf2() ||
	      ((VT == MVT::i32 || VT == MVT::i64) &&
	       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {

	    // We need a positive multiplier constant for shift/LEA codegen. The 'not'
	    // of the condition can usually be folded into a compare predicate, but even
	    // without that, the sequence should be cheaper than a CMOV alternative.
	    // Note: only the node pointers are swapped; TrueVal/FalseVal keep
	    // referring to the original constants and are not used after this point.
	    if (TrueVal.slt(FalseVal)) {
	      Cond = DAG.getNOT(DL, Cond, MVT::i1);
	      std::swap(TrueC, FalseC);
	    }

	    // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	    SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

	    // Multiply condition by the difference if non-one.
	    if (!AbsDiff.isOneValue())
	      R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));

	    // Add the base if non-zero.
	    if (!FalseC->isNullValue())
	      R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

	    return R;
	  }

	  return SDValue();
	}

	// When a vselect input is a bitcast of an operation that could equally be
	// expressed in the bitcast's result type, move the bitcast onto that
	// operation's inputs. Doing so gives masked-instruction pattern matching more
	// to work with. Callers invoke this only for operands of a vselect.
	//
	// Returns true if OrigOp was combined into a new node, false otherwise.
	static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI) {
	  // Nothing to do unless the operand really is a bitcast.
	  if (OrigOp.getOpcode() != ISD::BITCAST)
	    return false;

	  SDValue Inner = OrigOp.getOperand(0);

	  // Rewriting an operation that has other users would duplicate it.
	  if (!Inner.hasOneUse())
	    return false;

	  MVT DstVT = OrigOp.getSimpleValueType();
	  MVT DstEltVT = DstVT.getVectorElementType();
	  SDLoc DL(Inner.getNode());

	  const unsigned InnerOpc = Inner.getOpcode();
	  const bool IsShuf128 = InnerOpc == X86ISD::SHUF128;
	  const bool IsSubvBroadcast = InnerOpc == X86ISD::SUBV_BROADCAST;
	  if (!IsShuf128 && !IsSubvBroadcast)
	    return false;

	  // Both supported operations require 32- or 64-bit destination elements.
	  const unsigned DstEltBits = DstEltVT.getSizeInBits();
	  if (DstEltBits != 32 && DstEltBits != 64)
	    return false;

	  // Only the element size may change, never integer <-> floating point.
	  if (DstVT.isInteger() != Inner.getSimpleValueType().isInteger())
	    return false;

	  if (IsShuf128) {
	    // Bitcast both vector inputs to the destination type and rebuild the
	    // shuffle there, keeping its third operand untouched.
	    SDValue NewLHS = DAG.getBitcast(DstVT, Inner.getOperand(0));
	    DCI.AddToWorklist(NewLHS.getNode());
	    SDValue NewRHS = DAG.getBitcast(DstVT, Inner.getOperand(1));
	    DCI.AddToWorklist(NewRHS.getNode());
	    SDValue NewShuf =
	        DAG.getNode(InnerOpc, DL, DstVT, NewLHS, NewRHS, Inner.getOperand(2));
	    DCI.CombineTo(OrigOp.getNode(), NewShuf);
	    return true;
	  }

	  // X86ISD::SUBV_BROADCAST: bitcast the subvector to a vector of the
	  // destination element type with the same total width, then broadcast it.
	  SDValue SubVec = Inner.getOperand(0);
	  unsigned SubVecBits = SubVec.getSimpleValueType().getSizeInBits();
	  MVT NewSubVT = MVT::getVectorVT(DstEltVT, SubVecBits / DstEltBits);
	  SDValue NewSubVec = DAG.getBitcast(NewSubVT, SubVec);
	  DCI.AddToWorklist(NewSubVec.getNode());
	  SDValue NewBroadcast = DAG.getNode(InnerOpc, DL, DstVT, NewSubVec);
	  DCI.CombineTo(OrigOp.getNode(), NewBroadcast);
	  return true;
	}

	/// Do target-specific dag combines on SELECT and VSELECT nodes.
	///
	/// The combines are attempted in this order:
	///  1. floating-point select-of-compare into X86ISD::FMIN / X86ISD::FMAX;
	///  2. AVX512 (without BWI+VLX) vXi1-conditioned i8/i16 selects, by
	///     sign-extending the condition to the value type;
	///  3. scalar select of two constants (combineSelectOfTwoConstants);
	///  4. canonicalising `x < y ? x : y` / `x > y ? x : y` to the <= / >= form;
	///  5. VSELECT patterns matching X86ISD::SUBUS;
	///  6. VSELECT with all-ones / all-zeros arms
	///     (combineVSelectWithAllOnesOrZeros);
	///  7. shrinking the demanded bits of a dynamic VSELECT condition to the sign
	///     bit and switching to X86ISD::SHRUNKBLEND;
	///  8. pushing bitcasts through vXi1-conditioned VSELECT operands;
	///  9. selects of MVT::x86mmx, done as an i64 select.
	///
	/// \returns the replacement value, SDValue(N, 0) when an operand was combined
	/// in place, or an empty SDValue when nothing is returned to the combiner.
	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Cond = N->getOperand(0);
	  // Get the LHS/RHS of the select.
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
	  // instructions match the semantics of the common C idiom x<y?x:y but not
	  // x<=y?x:y, because of how they handle negative zero (which can be
	  // ignored in unsafe-math mode).
	  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
	  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
	      VT != MVT::f80 && VT != MVT::f128 &&
	      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
	      (Subtarget.hasSSE2() ||
	       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    unsigned Opcode = 0;
	    // Check for x CC y ? x : y.
	    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETULT:
	        // Converting this to a min would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.UnsafeFPMath &&
	              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETULE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETOGE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a max would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.UnsafeFPMath &&
	              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    // Check for x CC y ? y : x -- a min/max with reversed arms.
	    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
	               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETOGE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a min would handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETULT:
	        // Converting this to a max would handle NaNs incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETULE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    }

	    if (Opcode)
	      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
	  }

	  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
	  // lowering on KNL. In this case we convert it to
	  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
	  // The same situation for all 128 and 256-bit vectors of i8 and i16.
	  // Since SKX these selects have a proper lowering.
	  if (Subtarget.hasAVX512() && CondVT.isVector() &&
	      CondVT.getVectorElementType() == MVT::i1 &&
	      (VT.is128BitVector() || VT.is256BitVector()) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16) &&
	      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
	    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	    DCI.AddToWorklist(Cond.getNode());
	    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
	  }

	  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
	    return V;

	  // Canonicalize max and min:
	  // (x > y) ? x : y -> (x >= y) ? x : y
	  // (x < y) ? x : y -> (x <= y) ? x : y
	  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
	  // the need for an extra compare
	  // against zero. e.g.
	  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
	  // subl %esi, %edi
	  // testl %edi, %edi
	  // movl $0, %eax
	  // cmovgl %edi, %eax
	  // =>
	  // xorl %eax, %eax
	  // subl %esi, %edi
	  // cmovsl %eax, %edi
	  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
	      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	    switch (CC) {
	    default: break;
	    case ISD::SETLT:
	    case ISD::SETGT: {
	      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
	      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
	                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
	      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
	    }
	    }
	  }

	  // Early exit check
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  // Match VSELECTs into subs with unsigned saturation.
	  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
	      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
	       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
	    // left side invert the predicate to simplify logic below.
	    SDValue Other;
	    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
	      Other = RHS;
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
	      Other = LHS;
	    }

	    if (Other.getNode() && Other->getNumOperands() == 2 &&
	        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
	      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
	      SDValue CondRHS = Cond->getOperand(1);

	      // Look for a general sub with unsigned saturation first.
	      // x >= y ? x-y : 0 --> subus x, y
	      // x > y ? x-y : 0 --> subus x, y
	      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
	          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
	        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

	      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
	        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
	          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
	            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
	              // If the RHS is a constant we have to reverse the const
	              // canonicalization.
	              // x > C-1 ? x+-C : 0 --> subus x, C
	              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
	                  CondRHSConst->getAPIntValue() ==
	                      (-OpRHSConst->getAPIntValue() - 1))
	                return DAG.getNode(
	                    X86ISD::SUBUS, DL, VT, OpLHS,
	                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

	          // Another special case: If C was a sign bit, the sub has been
	          // canonicalized into a xor.
	          // FIXME: Would it be better to use computeKnownBits to determine
	          // whether it's safe to decanonicalize the xor?
	          // x s< 0 ? x^C : 0 --> subus x, C
	          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
	              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
	              OpRHSConst->getAPIntValue().isSignMask())
	            // Note that we have to rebuild the RHS constant here to ensure we
	            // don't rely on particular values of undef lanes.
	            return DAG.getNode(
	                X86ISD::SUBUS, DL, VT, OpLHS,
	                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
	        }
	    }
	  }

	  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
	    return V;

	  // If this is a dynamic select (non-constant condition) and we can match
	  // this node with one of the variable blend instructions, restructure the
	  // condition so that blends can use the high (sign) bit of each element and
	  // use SimplifyDemandedBits to simplify the condition operand.
	  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
	      !DCI.isBeforeLegalize() &&
	      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
	    unsigned BitWidth = Cond.getScalarValueSizeInBits();

	    // Don't optimize vector selects that map to mask-registers.
	    if (BitWidth == 1)
	      return SDValue();

	    // We can only handle the cases where VSELECT is directly legal on the
	    // subtarget. We custom lower VSELECT nodes with constant conditions and
	    // this makes it hard to see whether a dynamic VSELECT will correctly
	    // lower, so we both check the operation's status and explicitly handle the
	    // cases where a dynamic blend will fail even though a constant-condition
	    // blend could be custom lowered.
	    // FIXME: We should find a better way to handle this class of problems.
	    // Potentially, we should combine constant-condition vselect nodes
	    // pre-legalization into shuffles and not mark as many types as custom
	    // lowered.
	    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	      return SDValue();
	    // FIXME: We don't support i16-element blends currently. We could and
	    // should support them by making all the bits in the condition be set
	    // rather than just the high bit and using an i8-element blend.
	    if (VT.getVectorElementType() == MVT::i16)
	      return SDValue();
	    // Dynamic blending was only available from SSE4.1 onward.
	    if (VT.is128BitVector() && !Subtarget.hasSSE41())
	      return SDValue();
	    // Byte blends are only available in AVX2
	    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	      return SDValue();
	    // There are no 512-bit blend instructions that use sign bits.
	    if (VT.is512BitVector())
	      return SDValue();

	    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
	    APInt DemandedMask(APInt::getSignMask(BitWidth));
	    KnownBits Known;
	    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                          !DCI.isBeforeLegalizeOps());
	    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
	        TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
	      // If we changed the computation somewhere in the DAG, this change will
	      // affect all users of Cond. Make sure it is fine and update all the nodes
	      // so that we do not use the generic VSELECT anymore. Otherwise, we may
	      // perform wrong optimizations as we messed with the actual expectation
	      // for the vector boolean values.
	      if (Cond != TLO.Old) {
	        // Check all uses of the condition operand to check whether it will be
	        // consumed by non-BLEND instructions. Those may require that all bits
	        // are set properly.
	        for (SDNode *U : Cond->uses()) {
	          // TODO: Add other opcodes eventually lowered into BLEND.
	          if (U->getOpcode() != ISD::VSELECT)
	            return SDValue();
	        }

	        // Update all users of the condition before committing the change, so
	        // that the VSELECT optimizations that expect the correct vector boolean
	        // value will not be triggered.
	        for (SDNode *U : Cond->uses()) {
	          SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
	                                   U->getValueType(0), Cond, U->getOperand(1),
	                                   U->getOperand(2));
	          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
	        }
	        DCI.CommitTargetLoweringOpt(TLO);
	        return SDValue();
	      }
	      // Only Cond (rather than other nodes in the computation chain) was
	      // changed. Change the condition just for N to keep the opportunity to
	      // optimize all other users their own way.
	      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
	      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
	      return SDValue();
	    }
	  }

	  // Look for vselects with LHS/RHS being bitcasted from an operation that
	  // can be executed on another type. Push the bitcast to the inputs of
	  // the operation. This exposes opportunities for using masking instructions.
	  if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
	      return SDValue(N, 0);
	    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
	      return SDValue(N, 0);
	  }

	  // Custom action for SELECT MMX
	  if (VT == MVT::x86mmx) {
	    LHS = DAG.getBitcast(MVT::i64, LHS);
	    RHS = DAG.getBitcast(MVT::i64, RHS);
	    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
	    return DAG.getBitcast(VT, newSelect);
	  }

	  return SDValue();
	}

	/// Combine:
	/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
	/// to:
	/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
	/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
	/// Note that this is only legal for some op/cc combinations.
	///
	/// \param Cmp the compare (X86ISD::CMP, or X86ISD::SUB whose value result is
	///            unused) feeding the user.
	/// \param CC  the condition code tested by the user; it is updated in place
	///            when the compare-against-zero form is rewritten.
	/// \returns the value produced by lowerAtomicArithWithLOCK on success, or an
	/// empty SDValue if the pattern does not match.
	static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Can't replace the cmp if it has more uses than the one we're looking at.
	  // FIXME: We would like to be able to handle this, but would need to make sure
	  // all uses were updated.
	  if (!Cmp.hasOneUse())
	    return SDValue();

	  // This only applies to variations of the common case:
	  // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
	  // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
	  // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
	  // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
	  // Using the proper condcodes (see below), overflow is checked for.

	  // FIXME: We can generalize both constraints:
	  // - XOR/OR/AND (if they were made to survive AtomicExpand)
	  // - LHS != 1
	  // if the result is compared.

	  SDValue CmpLHS = Cmp.getOperand(0);
	  SDValue CmpRHS = Cmp.getOperand(1);

	  if (!CmpLHS.hasOneUse())
	    return SDValue();

	  unsigned Opc = CmpLHS.getOpcode();
	  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
	    return SDValue();

	  SDValue OpRHS = CmpLHS.getOperand(2);
	  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
	  if (!OpRHSC)
	    return SDValue();

	  // Normalise to an addend: an atomic sub of C is an add of -C.
	  APInt Addend = OpRHSC->getAPIntValue();
	  if (Opc == ISD::ATOMIC_LOAD_SUB)
	    Addend = -Addend;

	  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
	  if (!CmpRHSC)
	    return SDValue();

	  APInt Comparison = CmpRHSC->getAPIntValue();

	  // If the addend is the negation of the comparison value, then we can do
	  // a full comparison by emitting the atomic arithmetic as a locked sub.
	  if (Comparison == -Addend) {
	    // The CC is fine, but we need to rewrite the LHS of the comparison as an
	    // atomic sub.
	    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
	    auto AtomicSub = DAG.getAtomic(
	        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
	        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
	        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
	        AN->getMemOperand());
	    // If the comparison uses the CF flag we can't use INC/DEC instructions.
	    bool NeedCF = false;
	    switch (CC) {
	    default: break;
	    case X86::COND_A: case X86::COND_AE:
	    case X86::COND_B: case X86::COND_BE:
	      NeedCF = true;
	      break;
	    }
	    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
	    // The loaded value is no longer produced; redirect its users to undef and
	    // re-route the chain result through the new locked operation.
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                  DAG.getUNDEF(CmpLHS.getValueType()));
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	    return LockOp;
	  }

	  // We can handle comparisons with zero in a number of cases by manipulating
	  // the CC used.
	  if (!Comparison.isNullValue())
	    return SDValue();

	  if (CC == X86::COND_S && Addend == 1)
	    CC = X86::COND_LE;
	  else if (CC == X86::COND_NS && Addend == 1)
	    CC = X86::COND_G;
	  else if (CC == X86::COND_G && Addend == -1)
	    CC = X86::COND_GE;
	  else if (CC == X86::COND_LE && Addend == -1)
	    CC = X86::COND_L;
	  else
	    return SDValue();

	  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
	  // As above: drop the loaded value and splice the new node into the chain.
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                DAG.getUNDEF(CmpLHS.getValueType()));
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	  return LockOp;
	}

	// Look through a boolean test of a value that was itself produced from
	// EFLAGS (by X86ISD::SETCC, X86ISD::SETCC_CARRY or a 0/1 X86ISD::CMOV). On
	// success the underlying EFLAGS operand is returned and \p CC is rewritten to
	// the condition that must be applied to it; otherwise an empty SDValue is
	// returned.
	//
	// Patterns simplified:
	//   (Op (CMP (SETCC Cond EFLAGS) 1) EQ)  or
	//   (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
	//     ==> (Op EFLAGS Cond)
	//
	//   (Op (CMP (SETCC Cond EFLAGS) 0) EQ)  or
	//   (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
	//     ==> (Op EFLAGS !Cond)
	//
	// where Op could be BRCOND or CMOV.
	//
	static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
	  // Only CMP-like producers qualify: a CMP, or a SUB whose arithmetic result
	  // (value 0) has no users.
	  const bool IsCmpLike =
	      Cmp.getOpcode() == X86ISD::CMP ||
	      (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0));
	  if (!IsCmpLike)
	    return SDValue();

	  // The flags have to be consumed as an (in)equality test.
	  const bool TestsEqual = (CC == X86::COND_E);
	  if (!TestsEqual && CC != X86::COND_NE)
	    return SDValue();

	  // One CMP operand must be a constant; the other one is the candidate
	  // boolean. Operand 0 is preferred as the constant when both are.
	  SDValue LHS = Cmp.getOperand(0);
	  SDValue RHS = Cmp.getOperand(1);
	  SDValue BoolVal;
	  const ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(LHS);
	  if (Imm) {
	    BoolVal = RHS;
	  } else {
	    Imm = dyn_cast<ConstantSDNode>(RHS);
	    if (!Imm)
	      return SDValue();
	    BoolVal = LHS;
	  }

	  // "== 0" and "!= 1" invert the tested condition; "!= 0" and "== 1" keep it.
	  bool Invert = TestsEqual;
	  bool AgainstOne = false; // Is it a comparison against 1?
	  const uint64_t ImmVal = Imm->getZExtValue();
	  if (ImmVal == 1) {
	    Invert = !Invert;
	    AgainstOne = true;
	  } else if (ImmVal != 0) {
	    // The constant is neither 0 nor 1.
	    return SDValue();
	  }

	  // Peel (zext $x), (trunc $x) and (and $x, 1) wrappers off the boolean.
	  bool MaskedToBool = false;
	  for (;;) {
	    const unsigned WrapOpc = BoolVal.getOpcode();
	    if (WrapOpc == ISD::ZERO_EXTEND || WrapOpc == ISD::TRUNCATE) {
	      BoolVal = BoolVal.getOperand(0);
	      continue;
	    }
	    if (WrapOpc != ISD::AND)
	      break;
	    // A constant 1 in operand 1 takes priority over one in operand 0.
	    if (isOneConstant(BoolVal.getOperand(1)))
	      BoolVal = BoolVal.getOperand(0);
	    else if (isOneConstant(BoolVal.getOperand(0)))
	      BoolVal = BoolVal.getOperand(1);
	    else
	      break;
	    MaskedToBool = true;
	  }

	  const unsigned Opc = BoolVal.getOpcode();

	  if (Opc == X86ISD::SETCC_CARRY) {
	    // SETCC_CARRY yields R = CF ? ~0 : 0, so a comparison against true is only
	    // safe once the value has been canonicalized to 0/1 with an 'and'.
	    if (AgainstOne && !MaskedToBool)
	      return SDValue();
	    assert(X86::CondCode(BoolVal.getConstantOperandVal(0)) == X86::COND_B &&
	           "Invalid use of SETCC_CARRY!");
	  }

	  if (Opc == X86ISD::SETCC || Opc == X86ISD::SETCC_CARRY) {
	    // Use the producer's condition code, inverted if required.
	    CC = X86::CondCode(BoolVal.getConstantOperandVal(0));
	    if (Invert)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return BoolVal.getOperand(1);
	  }

	  if (Opc != X86ISD::CMOV)
	    return SDValue();

	  // CMOV: operand 0 is the false value, operand 1 the true value. They must
	  // form a canonical 0/1 (or 1/0) pair.
	  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(BoolVal.getOperand(0));
	  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(BoolVal.getOperand(1));
	  // The true value has to be a constant.
	  if (!TrueC)
	    return SDValue();

	  uint64_t ExpectedTrue = 1;
	  if (FalseC) {
	    const uint64_t FalseVal = FalseC->getZExtValue();
	    if (FalseVal != 0) {
	      // The false value must be 0 or 1.
	      if (FalseVal != 1)
	        return SDValue();
	      // A false value of 1 means the CMOV encodes the opposite condition.
	      Invert = !Invert;
	      ExpectedTrue = 0;
	    }
	  } else {
	    // A non-constant false value is accepted only for rdrand/rdseed (result
	    // number 0), where 0 is set if the false condition is found. A single
	    // 'zext' or 'trunc' in front of it is skipped.
	    SDValue Src = BoolVal.getOperand(0);
	    if (Src.getOpcode() == ISD::ZERO_EXTEND ||
	        Src.getOpcode() == ISD::TRUNCATE)
	      Src = Src.getOperand(0);
	    const bool IsRand = Src.getOpcode() == X86ISD::RDRAND ||
	                        Src.getOpcode() == X86ISD::RDSEED;
	    if (!IsRand || Src.getResNo() != 0)
	      return SDValue();
	  }

	  // The true value must be the constant opposite of the false value.
	  if (TrueC->getZExtValue() != ExpectedTrue)
	    return SDValue();

	  CC = X86::CondCode(BoolVal.getConstantOperandVal(2));
	  if (Invert)
	    CC = X86::GetOppositeBranchCondition(CC);
	  return BoolVal.getOperand(3);
	}

	/// Determine whether \p Cond is an AND/OR of two X86ISD::SETCC nodes that read
	/// the same EFLAGS value. Matches:
	///   (X86or (X86setcc) (X86setcc))
	///   (X86cmp (and (X86setcc) (X86setcc)), 0)
	/// On success returns true and reports both condition codes in \p CC0 /
	/// \p CC1, the shared flags in \p Flags, and whether the combiner was an AND
	/// in \p isAnd.
	static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
	                                           X86::CondCode &CC1, SDValue &Flags,
	                                           bool &isAnd) {
	  // Look through a compare against zero.
	  if (Cond->getOpcode() == X86ISD::CMP) {
	    if (!isNullConstant(Cond->getOperand(1)))
	      return false;
	    Cond = Cond->getOperand(0);
	  }

	  const unsigned Opc = Cond->getOpcode();
	  const bool IsAndOp = (Opc == ISD::AND || Opc == X86ISD::AND);
	  const bool IsOrOp = (Opc == ISD::OR || Opc == X86ISD::OR);
	  isAnd = IsAndOp;
	  if (!IsAndOp && !IsOrOp)
	    return false;

	  SDValue First = Cond->getOperand(0);
	  SDValue Second = Cond->getOperand(1);

	  // Both inputs must be SETCC nodes fed by the very same flags value.
	  if (First.getOpcode() != X86ISD::SETCC)
	    return false;
	  if (Second.getOpcode() != X86ISD::SETCC)
	    return false;
	  if (First->getOperand(1) != Second->getOperand(1))
	    return false;

	  CC0 = (X86::CondCode)First->getConstantOperandVal(0);
	  CC1 = (X86::CondCode)Second->getConstantOperandVal(0);
	  Flags = First->getOperand(1);
	  return true;
	}

	// Carry legalization materializes a carry flag as (X86ISD::ADD X, -1). When X
	// is really a SETCC / SETCC_CARRY on COND_B (possibly behind truncates,
	// extends or an 'and' with 1), return the EFLAGS that setcc reads so the
	// carry can be used directly. Returns an empty SDValue otherwise.
	static SDValue combineCarryThroughADD(SDValue EFLAGS) {
	  if (EFLAGS.getOpcode() != X86ISD::ADD)
	    return SDValue();
	  if (!isAllOnesConstant(EFLAGS.getOperand(1)))
	    return SDValue();

	  // Wrappers that are skipped while searching for the setcc.
	  auto IsTransparent = [](SDValue V) {
	    switch (V.getOpcode()) {
	    case ISD::TRUNCATE:
	    case ISD::ZERO_EXTEND:
	    case ISD::SIGN_EXTEND:
	    case ISD::ANY_EXTEND:
	      return true;
	    case ISD::AND:
	      return isOneConstant(V.getOperand(1));
	    default:
	      return false;
	    }
	  };

	  SDValue Src = EFLAGS.getOperand(0);
	  while (IsTransparent(Src))
	    Src = Src.getOperand(0);

	  const unsigned SrcOpc = Src.getOpcode();
	  if (SrcOpc != X86ISD::SETCC && SrcOpc != X86ISD::SETCC_CARRY)
	    return SDValue();
	  if (Src.getConstantOperandVal(0) != X86::COND_B)
	    return SDValue();
	  return Src.getOperand(1);
	}

	/// Try to replace an EFLAGS definition, as consumed under condition code
	/// \p CC, with a simpler EFLAGS value. \p CC may be rewritten to match the
	/// returned flags, and the helpers may replace uses of chain values. Returns
	/// an empty SDValue when nothing applies.
	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  SDValue Simplified;

	  // A carry test may be able to read the original carry producer directly.
	  if (CC == X86::COND_B)
	    Simplified = combineCarryThroughADD(EFLAGS);

	  // Otherwise try to look through a boolean test of a setcc...
	  if (!Simplified)
	    Simplified = checkBoolTestSetCCCombine(EFLAGS, CC);

	  // ...and finally try the atomic-arithmetic flag combine.
	  if (!Simplified)
	    Simplified = combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);

	  return Simplified;
	}

	/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
	///
	/// Operand 0 is the value selected when the condition is false, operand 1 the
	/// value selected when it is true, operand 2 the X86 condition code and
	/// operand 3 the EFLAGS-producing node. Returns the replacement value, or an
	/// empty SDValue when no fold applies.
	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc DL(N);

	  SDValue FalseOp = N->getOperand(0);
	  SDValue TrueOp = N->getOperand(1);
	  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
	  SDValue Cond = N->getOperand(3);

	  if (CC == X86::COND_E || CC == X86::COND_NE) {
	    switch (Cond.getOpcode()) {
	    default: break;
	    case X86ISD::BSR:
	    case X86ISD::BSF:
	      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
	      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
	        return (CC == X86::COND_E) ? FalseOp : TrueOp;
	    }
	  }

	  // Try to simplify the EFLAGS and condition code operands.
	  // We can't always do this as FCMOV only supports a subset of X86 cond.
	  //
	  // combineSetCCEFLAGS rewrites the condition code through its reference
	  // parameter. Work on a scratch copy so that, when the simplified flags are
	  // rejected below, the folds further down still see the condition code that
	  // belongs to the original Cond.
	  X86::CondCode NewCC = CC;
	  if (SDValue Flags = combineSetCCEFLAGS(Cond, NewCC, DAG, Subtarget)) {
	    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(NewCC)) {
	      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(NewCC, DL, MVT::i8),
	                       Flags};
	      return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	    }
	  }

	  // If this is a select between two integer constants, try to do some
	  // optimizations. Note that the operands are ordered the opposite of SELECT
	  // operands.
	  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
	    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
	      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
	      // larger than FalseC (the false value).
	      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueC, FalseC);
	        std::swap(TrueOp, FalseOp);
	      }

	      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
	      // This is efficient for any integer data type (including i8/i16) and
	      // shift amount.
	      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

	        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
	        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
	                           DAG.getConstant(ShAmt, DL, MVT::i8));
	        return Cond;
	      }

	      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
	      // for any integer data type, including i8/i16.
	      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
	                           FalseC->getValueType(0), Cond);
	        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                           SDValue(FalseC, 0));
	        return Cond;
	      }

	      // Optimize cases that will turn into an LEA instruction. This requires
	      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
	        // Difference of the two constants; reduced to 32 bits for i32.
	        uint64_t DiffVal = TrueC->getZExtValue()-FalseC->getZExtValue();
	        if (N->getValueType(0) == MVT::i32) DiffVal = (unsigned)DiffVal;

	        bool isFastMultiplier = false;
	        if (DiffVal < 10) {
	          switch ((unsigned char)DiffVal) {
	          default: break;
	          case 1:  // result = add base, cond
	          case 2:  // result = lea base(    , cond*2)
	          case 3:  // result = lea base(cond, cond*2)
	          case 4:  // result = lea base(    , cond*4)
	          case 5:  // result = lea base(cond, cond*4)
	          case 8:  // result = lea base(    , cond*8)
	          case 9:  // result = lea base(cond, cond*8)
	            isFastMultiplier = true;
	            break;
	          }
	        }

	        if (isFastMultiplier) {
	          // Same difference, at the width of the constants, used as the
	          // multiplier operand.
	          APInt Scale = TrueC->getAPIntValue()-FalseC->getAPIntValue();
	          Cond = getSETCC(CC, Cond, DL ,DAG);
	          // Zero extend the condition if needed.
	          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
	                             Cond);
	          // Scale the condition by the difference.
	          if (Scale != 1)
	            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	                               DAG.getConstant(Scale, DL, Cond.getValueType()));

	          // Add the base if non-zero.
	          if (FalseC->getAPIntValue() != 0)
	            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                               SDValue(FalseC, 0));
	          return Cond;
	        }
	      }
	    }
	  }

	  // Handle these cases:
	  //   (select (x != c), e, c) -> (select (x != c), e, x),
	  //   (select (x == c), c, e) -> (select (x == c), x, e)
	  // where the c is an integer constant, and the "select" is the combination
	  // of CMOV and CMP.
	  //
	  // The rationale for this change is that the conditional-move from a constant
	  // needs two instructions, however, conditional-move from a register needs
	  // only one instruction.
	  //
	  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
	  //  some instruction-combining opportunities. This opt needs to be
	  //  postponed as late as possible.
	  //
	  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
	    // the DCI.xxxx conditions are provided to postpone the optimization as
	    // late as possible.

	    ConstantSDNode *CmpAgainst = nullptr;
	    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
	        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
	        !isa<ConstantSDNode>(Cond.getOperand(0))) {

	      if (CC == X86::COND_NE &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueOp, FalseOp);
	      }

	      if (CC == X86::COND_E &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
	        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
	                          DAG.getConstant(CC, DL, MVT::i8), Cond };
	        return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      }
	    }
	  }

	  // Fold and/or of setcc's to double CMOV:
	  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
	  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
	  //
	  // This combine lets us generate:
	  //   cmovcc1 (jcc1 if we don't have CMOV)
	  //   cmovcc2 (same)
	  // instead of:
	  //   setcc1
	  //   setcc2
	  //   and/or
	  //   cmovne (jne if we don't have CMOV)
	  // When we can't use the CMOV instruction, it might increase branch
	  // mispredicts.
	  // When we can use CMOV, or when there is no mispredict, this improves
	  // throughput and reduces register pressure.
	  //
	  if (CC == X86::COND_NE) {
	    SDValue Flags;
	    X86::CondCode CC0, CC1;
	    bool isAndSetCC;
	    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
	      if (isAndSetCC) {
	        std::swap(FalseOp, TrueOp);
	        CC0 = X86::GetOppositeBranchCondition(CC0);
	        CC1 = X86::GetOppositeBranchCondition(CC1);
	      }

	      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
	        Flags};
	      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
	      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
	      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      return CMOV;
	    }
	  }

	  return SDValue();
	}

	/// Different mul shrinking modes.
	///
	/// The mode is chosen by canReduceVMulWidth from the value range shared by
	/// both multiply operands:
	///   MULS8  - ranges are from -128 ~ 127.
	///   MULU8  - ranges are from 0 ~ 255.
	///   MULS16 - ranges are from -32768 ~ 32767.
	///   MULU16 - ranges are from 0 ~ 65535.
	enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };

	/// Decide whether the 32-bit-element vector multiply \p N only needs 8 or 16
	/// significant bits per element. On success returns true and stores the
	/// matching shrink mode in \p Mode; \p Mode is left untouched on failure.
	static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
	  EVT VT = N->getOperand(0).getValueType();
	  if (VT.getScalarSizeInBits() != 32)
	    return false;

	  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

	  // Computes the number of known sign bits of one operand and whether it is
	  // known to be non-negative. Returns false if the operand cannot be handled.
	  auto AnalyzeOperand = [&DAG](SDValue Opd, unsigned &NumSignBits,
	                               bool &NonNegative) -> bool {
	    switch (Opd.getOpcode()) {
	    case ISD::ANY_EXTEND: {
	      // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so it is handled
	      // here: for anyextend it is safe to assume an appropriate number of
	      // leading sign/zero bits.
	      EVT SrcEltVT = Opd.getOperand(0).getValueType().getVectorElementType();
	      if (SrcEltVT == MVT::i8)
	        NumSignBits = 25;
	      else if (SrcEltVT == MVT::i16)
	        NumSignBits = 17;
	      else
	        return false;
	      NonNegative = true;
	      return true;
	    }
	    case ISD::BUILD_VECTOR: {
	      // Every defined element must be an integer constant; take the
	      // narrowest range that covers all of them.
	      NumSignBits = 32;
	      NonNegative = true;
	      for (const SDValue &Elt : Opd.getNode()->op_values()) {
	        if (Elt.isUndef())
	          continue;
	        auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	        if (!EltC)
	          return false;
	        APInt EltVal = EltC->getAPIntValue();
	        if (EltVal.isNegative())
	          NonNegative = false;
	        NumSignBits = std::min(NumSignBits, EltVal.getNumSignBits());
	      }
	      return true;
	    }
	    default:
	      NumSignBits = DAG.ComputeNumSignBits(Opd);
	      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
	        NonNegative = true;
	      return true;
	    }
	  };

	  unsigned SignBits[2] = {1, 1};
	  bool IsPositive[2] = {false, false};
	  for (unsigned Idx = 0; Idx != 2; ++Idx)
	    if (!AnalyzeOperand(N->getOperand(Idx), SignBits[Idx], IsPositive[Idx]))
	      return false;

	  const bool BothNonNegative = IsPositive[0] && IsPositive[1];
	  const unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);

	  // When ranges are from -128 ~ 127, use MULS8 mode.
	  if (MinSignBits >= 25) {
	    Mode = MULS8;
	    return true;
	  }
	  // When ranges are from 0 ~ 255, use MULU8 mode.
	  if (BothNonNegative && MinSignBits >= 24) {
	    Mode = MULU8;
	    return true;
	  }
	  // When ranges are from -32768 ~ 32767, use MULS16 mode.
	  if (MinSignBits >= 17) {
	    Mode = MULS16;
	    return true;
	  }
	  // When ranges are from 0 ~ 65535, use MULU16 mode.
	  if (BothNonNegative && MinSignBits >= 16) {
	    Mode = MULU16;
	    return true;
	  }
	  return false;
	}

	/// When the operands of a vector mul are extended from smaller values such as
	/// i8 and i16, the multiply can be performed on a narrower type to generate
	/// more efficient code. Two typical patterns are handled:
	/// Pattern1:
	///     %2 = sext/zext <N x i8> %1 to <N x i32>
	///     %4 = sext/zext <N x i8> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// Pattern2:
	///     %2 = zext/sext <N x i16> %1 to <N x i32>
	///     %4 = zext/sext <N x i16> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// There are four mul shrinking modes:
	/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
	/// generate pmullw+sext32 for it (MULS8 mode).
	/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
	/// generate pmullw+zext32 for it (MULU8 mode).
	/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
	/// generate pmullw+pmulhw for it (MULS16 mode).
	/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
	/// generate pmullw+pmulhuw for it (MULU16 mode).
	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // Legality: pmullw/pmulhw are not supported by SSE.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Profitability: pmulld is supported since SSE41 and is preferred over
	  // pmullw+pmulhw, except on subtargets where pmulld is slower than the
	  // expansion (and we are not optimizing for minimum size).
	  const bool MinSize =
	      DAG.getMachineFunction().getFunction().optForMinSize();
	  if (Subtarget.hasSSE41()) {
	    if (MinSize || !Subtarget.isPMULLDSlow())
	      return SDValue();
	  }

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(N, DAG, Mode))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getOperand(0).getValueType();
	  const unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return SDValue();

	  // If the upper 17 bits of each element are zero then we can use PMADD.
	  APInt UpperBits = APInt::getHighBitsSet(32, 17);
	  if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(LHS, UpperBits) &&
	      DAG.MaskedValueIsZero(RHS, UpperBits))
	    return DAG.getNode(X86ISD::VPMADDWD, DL, VT,
	                       DAG.getBitcast(MVT::v8i16, LHS),
	                       DAG.getBitcast(MVT::v8i16, RHS));

	  const unsigned RegSize = 128;
	  MVT FullI16VT = MVT::getVectorVT(MVT::i16, RegSize / 16);
	  EVT NarrowVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

	  // Shrink the operands of mul.
	  SDValue NarrowLHS = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, LHS);
	  SDValue NarrowRHS = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RHS);

	  // For the 8-bit modes the low half of the product is all that is needed.
	  const bool LowHalfOnly = (Mode == MULU8 || Mode == MULS8);
	  // High-half multiply opcode for the 16-bit modes (pmulhw / pmulhuw).
	  const unsigned MulHiOpc = (Mode == MULS16) ? ISD::MULHS : ISD::MULHU;

	  if (NumElts >= FullI16VT.getVectorNumElements()) {
	    // Generate the lower part of mul: pmullw.
	    SDValue MulLo = DAG.getNode(ISD::MUL, DL, NarrowVT, NarrowLHS, NarrowRHS);
	    if (LowHalfOnly)
	      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
	                         DL, VT, MulLo);

	    MVT HalfResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
	    // Generate the higher part of mul: pmulhw/pmulhuw.
	    SDValue MulHi = DAG.getNode(MulHiOpc, DL, NarrowVT, NarrowLHS, NarrowRHS);

	    // Interleave NumElts / 2 consecutive lanes of MulLo and MulHi, starting
	    // at lane FirstLane, and view the result as i32 elements.
	    auto InterleaveFrom = [&](unsigned FirstLane) {
	      SmallVector<int, 16> Mask(NumElts);
	      for (unsigned Lane = 0, End = NumElts / 2; Lane < End; ++Lane) {
	        Mask[2 * Lane] = FirstLane + Lane;
	        Mask[2 * Lane + 1] = FirstLane + Lane + NumElts;
	      }
	      SDValue Shuf = DAG.getVectorShuffle(NarrowVT, DL, MulLo, MulHi, Mask);
	      return DAG.getBitcast(HalfResVT, Shuf);
	    };

	    // Repack the lower part and higher part result of mul into a wider
	    // result: a shuffle functioning as punpcklwd, then one as punpckhwd.
	    SDValue ResLo = InterleaveFrom(0);
	    SDValue ResHi = InterleaveFrom(NumElts / 2);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	  }

	  // When VT.getVectorNumElements() < FullI16VT.getVectorNumElements(), we want
	  // to legalize the mul explicitly because implicit legalization for type
	  // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
	  // instructions which will not exist when we explicitly legalize it by
	  // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
	  // <4 x i16> undef).
	  //
	  // Legalize the operands of mul.
	  // FIXME: We may be able to handle non-concatenated vectors by insertion.
	  const unsigned NarrowBits = NarrowVT.getSizeInBits();
	  if (RegSize % NarrowBits != 0)
	    return SDValue();

	  SmallVector<SDValue, 16> Pieces(RegSize / NarrowBits,
	                                  DAG.getUNDEF(NarrowVT));
	  Pieces[0] = NarrowLHS;
	  NarrowLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, FullI16VT, Pieces);
	  Pieces[0] = NarrowRHS;
	  NarrowRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, FullI16VT, Pieces);

	  if (LowHalfOnly) {
	    // Generate lower part of mul: pmullw.
	    SDValue Mul = DAG.getNode(ISD::MUL, DL, FullI16VT, NarrowLHS, NarrowRHS);

	    // Convert the type of mul result to VT.
	    MVT FullResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	    const unsigned ExtOpc = (Mode == MULU8) ? ISD::ZERO_EXTEND_VECTOR_INREG
	                                            : ISD::SIGN_EXTEND_VECTOR_INREG;
	    SDValue Widened = DAG.getNode(ExtOpc, DL, FullResVT, Mul);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Widened,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // Generate the lower and higher part of mul: pmullw and pmulhw/pmulhuw.
	  SDValue MulLo = DAG.getNode(ISD::MUL, DL, FullI16VT, NarrowLHS, NarrowRHS);
	  SDValue MulHi = DAG.getNode(MulHiOpc, DL, FullI16VT, NarrowLHS, NarrowRHS);

	  // Repack the lower part and higher part result of mul into a wider
	  // result. Make sure the type of mul result is VT.
	  MVT FullResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	  SDValue Packed = getUnpackl(DAG, DL, FullI16VT, MulLo, MulHi);
	  Packed = DAG.getBitcast(FullResVT, Packed);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Packed,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Expand a multiply of N's first operand by one of a fixed set of constants
	/// (11, 13, 14, 19, 21, 22, 23, 26, 28, 29, 30) into a short sequence of
	/// X86ISD::MUL_IMM / shift / add / sub nodes.
	///
	/// \param MulAmt the constant multiplier.
	/// \param N      the multiply node; only its operand 0 is read.
	/// \param VT     the type used for every node that is created.
	/// \param DL     the debug location attached to the new nodes.
	/// \returns the replacement value, or an empty SDValue if \p MulAmt is not
	/// one of the handled constants.
	static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
	                                 EVT VT, const SDLoc &DL) {

	  // Builds ((x * Mult) << Shift) + x, or ... - x when isAdd is false.
	  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(Mult, DL, VT));
	    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
	                         DAG.getConstant(Shift, DL, MVT::i8));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  // Builds ((x * 9) * 3) + x, or ... - x when isAdd is false.
	  auto combineMulMulAddOrSub = [&](bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(9, DL, VT));
	    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  switch (MulAmt) {
	  default:
	    break;
	  case 11:
	    // mul x, 11 => add ((shl (mul x, 5), 1), x)
	    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
	  case 21:
	    // mul x, 21 => add ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
	  case 22:
	    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
	  case 19:
	    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
	  case 13:
	    // mul x, 13 => add ((shl (mul x, 3), 2), x)
	    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
	  case 23:
	    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
	    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
	  case 14:
	    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
	  case 26:
	    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ false);
	  case 28:
	    // mul x, 28 => add ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ true);
	  case 29:
	    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulMulAddOrSub(/*isAdd*/ true));
	  case 30:
	    // mul x, 30 => sub (sub ((shl x, 5), x), x)
	    return DAG.getNode(
	        ISD::SUB, DL, VT,
	        DAG.getNode(ISD::SUB, DL, VT,
	                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                                DAG.getConstant(5, DL, MVT::i8)),
	                    N->getOperand(0)),
	        N->getOperand(0));
	  }
	  return SDValue();
	}

	/// Rewrite a multiply by a constant as a pair of cheaper operations, e.g.
	/// LEA + SHL or LEA + LEA. Vector multiplies seen before legalization are
	/// handed to reduceVMULWidth instead. When a scalar rewrite is found it is
	/// installed through DCI.CombineTo and an empty SDValue is returned.
	static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (DCI.isBeforeLegalize() && VT.isVector())
	    return reduceVMULWidth(N, DAG, Subtarget);

	  if (!MulConstantOptimization)
	    return SDValue();
	  // An imul is usually smaller than the alternative sequence.
	  if (DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  // Only scalar i32 / i64 multiplies are handled.
	  if (VT != MVT::i64 && VT != MVT::i32)
	    return SDValue();

	  ConstantSDNode *AmtC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!AmtC)
	    return SDValue();

	  // Multipliers an LEA can apply directly.
	  auto IsLEAScale = [](uint64_t V) { return V == 3 || V == 5 || V == 9; };

	  const uint64_t MulAmt = AmtC->getZExtValue();
	  if (isPowerOf2_64(MulAmt) || IsLEAScale(MulAmt))
	    return SDValue();

	  // Try to split MulAmt as Factor1 * Factor2 with Factor1 in {9, 5, 3},
	  // trying the largest first.
	  uint64_t Factor1 = 0;
	  uint64_t Factor2 = 0;
	  static const uint64_t LEAScales[] = {9, 5, 3};
	  for (uint64_t Scale : LEAScales) {
	    if (MulAmt % Scale != 0)
	      continue;
	    Factor1 = Scale;
	    Factor2 = MulAmt / Scale;
	    break;
	  }

	  SDLoc DL(N);

	  // Multiply Src by Amt: a shift for powers of two, MUL_IMM otherwise.
	  auto EmitFactor = [&](SDValue Src, uint64_t Amt) -> SDValue {
	    if (isPowerOf2_64(Amt))
	      return DAG.getNode(ISD::SHL, DL, VT, Src,
	                         DAG.getConstant(Log2_64(Amt), DL, MVT::i8));
	    return DAG.getNode(X86ISD::MUL_IMM, DL, VT, Src,
	                       DAG.getConstant(Amt, DL, VT));
	  };

	  SDValue NewMul;
	  if (Factor2 && (isPowerOf2_64(Factor2) || IsLEAScale(Factor2))) {
	    // If the second multiplier is pow2, issue it first. We want the multiply
	    // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
	    // is an add.
	    const bool LoneUseIsAdd =
	        N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD;
	    if (isPowerOf2_64(Factor2) && !LoneUseIsAdd)
	      std::swap(Factor1, Factor2);

	    NewMul = EmitFactor(N->getOperand(0), Factor1);
	    NewMul = EmitFactor(NewMul, Factor2);
	  } else if (!Subtarget.slowLEA()) {
	    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
	  }

	  if (!NewMul) {
	    assert(MulAmt != 0 &&
	           MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
	           "Both cases that could cause potential overflows should have "
	           "already been handled.");
	    const int64_t SignMulAmt = AmtC->getSExtValue();
	    // Skip the values for which |SignMulAmt| or |SignMulAmt| +/- 1 would
	    // overflow int64_t.
	    if (SignMulAmt != INT64_MIN && SignMulAmt != INT64_MAX &&
	        SignMulAmt != -INT64_MAX) {
	      const bool Negative = !(SignMulAmt > 0);
	      const int64_t AbsAmt = Negative ? -SignMulAmt : SignMulAmt;
	      const bool IsPow2PlusOne = isPowerOf2_64(AbsAmt - 1);
	      const bool IsPow2MinusOne = isPowerOf2_64(AbsAmt + 1);
	      if (IsPow2PlusOne) {
	        // (mul x, 2^N + 1) => (add (shl x, N), x)
	        NewMul = DAG.getNode(
	            ISD::ADD, DL, VT, N->getOperand(0),
	            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                        DAG.getConstant(Log2_64(AbsAmt - 1), DL, MVT::i8)));
	      } else if (IsPow2MinusOne) {
	        // (mul x, 2^N - 1) => (sub (shl x, N), x)
	        NewMul = DAG.getNode(
	            ISD::SUB, DL, VT,
	            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                        DAG.getConstant(Log2_64(AbsAmt + 1), DL, MVT::i8)),
	            N->getOperand(0));
	      }
	      // To negate, subtract the number from zero.
	      if (NewMul && Negative)
	        NewMul =
	            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
	    }
	  }

	  // Do not add new nodes to DAG combiner worklist.
	  if (NewMul)
	    DCI.CombineTo(N, NewMul, false);

	  return SDValue();
	}

	/// Combine a left shift \p N. Two folds are attempted:
	///   (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
	///   (shl V, splat 1)              -> (add V, V)
	/// Returns the replacement, or an empty SDValue if neither applies.
	static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
	  SDValue Val = N->getOperand(0);
	  SDValue Amt = N->getOperand(1);
	  ConstantSDNode *AmtC = dyn_cast<ConstantSDNode>(Amt);
	  EVT VT = Val.getValueType();

	  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
	  // since the result of setcc_c is all zero's or all ones.
	  const bool IsScalarInt = VT.isInteger() && !VT.isVector();
	  if (IsScalarInt && AmtC && Val.getOpcode() == ISD::AND &&
	      Val.getOperand(1).getOpcode() == ISD::Constant) {
	    SDValue Src = Val.getOperand(0);
	    APInt Mask = cast<ConstantSDNode>(Val.getOperand(1))->getAPIntValue();
	    Mask <<= AmtC->getAPIntValue();

	    // We can handle cases concerning bit-widening nodes containing setcc_c if
	    // we carefully interrogate the mask to make sure we are semantics
	    // preserving.
	    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
	    // of the underlying setcc_c operation if the setcc_c was zero extended.
	    // Consider the following example:
	    //   zext(setcc_c)                 -> i32 0x0000FFFF
	    //   c1                            -> i32 0x0000FFFF
	    //   c2                            -> i32 0x00000001
	    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
	    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
	    bool MaskOK = false;
	    switch (Src.getOpcode()) {
	    case X86ISD::SETCC_CARRY:
	      MaskOK = true;
	      break;
	    case ISD::SIGN_EXTEND:
	      MaskOK = Src.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY;
	      break;
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	      // The shifted mask must still fit in the un-extended setcc_c.
	      if (Src.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)
	        MaskOK = Mask.isIntN(Src.getOperand(0).getValueSizeInBits());
	      break;
	    default:
	      break;
	    }

	    if (MaskOK && Mask != 0) {
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT, Src, DAG.getConstant(Mask, DL, VT));
	    }
	  }

	  // Hardware support for vector shifts is sparse which makes us scalarize the
	  // vector operations in many cases. Also, on sandybridge ADD is faster than
	  // shl.
	  // (shl V, 1) -> add V,V
	  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt);
	  if (!AmtBV)
	    return SDValue();
	  auto *SplatC = AmtBV->getConstantSplatNode();
	  if (!SplatC)
	    return SDValue();

	  assert(Val.getValueType().isVector() && "Invalid vector shift type");
	  // Every lane is shifted by one, which is better expressed as an ADD of the
	  // value with itself.
	  if (SplatC->getAPIntValue() == 1)
	    return DAG.getNode(ISD::ADD, SDLoc(N), VT, Val, Val);

	  return SDValue();
	}

	/// Combine an arithmetic right shift \p N of a single-use constant left
	/// shift into a sign extension followed by at most one shift. Returns the
	/// replacement, or an empty SDValue if the pattern does not match.
	static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
	  SDValue Shl = N->getOperand(0);
	  SDValue SarAmt = N->getOperand(1);
	  EVT VT = Shl.getValueType();
	  const unsigned Size = VT.getSizeInBits();

	  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
	  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
	  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
	  // depending on sign of (SarConst - [56,48,32,24,16])

	  // sexts in X86 are MOVs. The MOVs have the same code size
	  // as above SHIFTs (only SHIFT on 1 has lower code size).
	  // However the MOVs have 2 advantages to a SHIFT:
	  // 1. MOVs can write to a register that differs from source
	  // 2. MOVs accept memory operands

	  // Scalar only, with a constant shift amount...
	  if (VT.isVector() || SarAmt.getOpcode() != ISD::Constant)
	    return SDValue();
	  // ...applied to a single-use SHL by a constant.
	  if (Shl.getOpcode() != ISD::SHL || !Shl.hasOneUse() ||
	      Shl.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();

	  SDValue Src = Shl.getOperand(0);
	  SDValue ShlAmt = Shl.getOperand(1);
	  APInt ShlConst = (cast<ConstantSDNode>(ShlAmt))->getAPIntValue();
	  APInt SarConst = (cast<ConstantSDNode>(SarAmt))->getAPIntValue();
	  EVT AmtVT = SarAmt.getValueType();

	  if (SarConst.isNegative())
	    return SDValue();

	  const MVT NarrowTypes[] = {MVT::i8, MVT::i16, MVT::i32};
	  for (MVT SVT : NarrowTypes) {
	    const unsigned NarrowBits = SVT.getSizeInBits();
	    // Skip types without corresponding sext/zext.
	    if (NarrowBits >= Size)
	      continue;
	    // Skip a ShlConst that is not one of [56,48,32,24,16].
	    const unsigned ExtShift = Size - NarrowBits;
	    if (ShlConst != ExtShift)
	      continue;

	    SDLoc DL(N);
	    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Src,
	                               DAG.getValueType(SVT));
	    // Shift that remains after the sign extension; its sign selects the
	    // direction.
	    APInt Residual = SarConst - ExtShift;
	    if (Residual == 0)
	      return SExt;
	    if (Residual.isNegative())
	      return DAG.getNode(ISD::SHL, DL, VT, SExt,
	                         DAG.getConstant(-Residual, DL, AmtVT));
	    return DAG.getNode(ISD::SRA, DL, VT, SExt,
	                       DAG.getConstant(Residual, DL, AmtVT));
	  }
	  return SDValue();
	}

	/// Reorder (srl (and X, AndC), ShiftC) into
	/// (and (srl X, ShiftC), (AndC >> ShiftC)) when doing so shrinks the mask
	/// constant to at most 8 or at most 32 significant (signed) bits.
	///
	/// Requires both the mask and the shift amount to be constants and the AND to
	/// have a single use. Returns an empty SDValue otherwise.
	static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
	  SDValue AndOp = N->getOperand(0);
	  SDValue ShAmt = N->getOperand(1);
	  EVT VT = AndOp.getValueType();

	  // TODO: This is a generic DAG combine that became an x86-only combine to
	  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
	  // and-not ('andn').
	  if (AndOp.getOpcode() != ISD::AND || !AndOp.hasOneUse())
	    return SDValue();

	  auto *ShiftC = dyn_cast<ConstantSDNode>(ShAmt);
	  auto *AndC = dyn_cast<ConstantSDNode>(AndOp.getOperand(1));
	  if (!(ShiftC && AndC))
	    return SDValue();

	  // Shrinking the mask below the 8-bit or 32-bit threshold should reduce code
	  // size, and may enable further transforms through better known-bits
	  // analysis or instruction selection.
	  APInt OldMask = AndC->getAPIntValue();
	  APInt NewMask = OldMask.lshr(ShiftC->getAPIntValue());
	  unsigned OldBits = OldMask.getMinSignedBits();
	  unsigned NewBits = NewMask.getMinSignedBits();
	  bool NowFits8 = OldBits > 8 && NewBits <= 8;
	  bool NowFits32 = OldBits > 32 && NewBits <= 32;
	  if (!NowFits8 && !NowFits32)
	    return SDValue();

	  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
	  SDLoc DL(N);
	  SDValue NewMaskC = DAG.getConstant(NewMask, DL, VT);
	  SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, AndOp.getOperand(0), ShAmt);
	  return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMaskC);
	}

	/// Dispatch a scalar/vector shift node to the opcode-specific combine.
	/// SHL, SRA and SRL are handled; any other opcode yields an empty SDValue.
	/// DCI and Subtarget are accepted for signature uniformity but are not used.
	static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  switch (N->getOpcode()) {
	  case ISD::SHL:
	    return combineShiftLeft(N, DAG);
	  case ISD::SRA:
	    return combineShiftRightArithmetic(N, DAG);
	  case ISD::SRL:
	    return combineShiftRightLogical(N, DAG);
	  default:
	    return SDValue();
	  }
	}

	/// Combine an X86ISD::PACKSS / X86ISD::PACKUS node.
	///
	/// Two things are attempted, in order:
	///  1. Constant folding: when both inputs are constants (or undef) whose only
	///     user is this node, compute the saturated, truncated result per 128-bit
	///     lane and return it as a constant vector.
	///  2. Shuffle combining: hand the node to combineX86ShufflesRecursively; on
	///     success the replacement is installed through DCI.CombineTo.
	///
	/// \returns the folded constant vector, or an empty SDValue in every other
	/// case (including after a successful CombineTo).
	static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
	         "Unexpected pack opcode");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
	  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
	  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         "Unexpected PACKSS/PACKUS input type");

	  // Constant Folding.
	  APInt UndefElts0, UndefElts1;
	  SmallVector<APInt, 32> EltBits0, EltBits1;
	  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
	      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
	      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
	      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
	    unsigned NumLanes = VT.getSizeInBits() / 128;
	    unsigned NumDstElts = VT.getVectorNumElements();
	    unsigned NumSrcElts = NumDstElts / 2;
	    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
	    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
	    bool IsSigned = (X86ISD::PACKSS == Opcode);

	    APInt Undefs(NumDstElts, 0);
	    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
	    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
	        // Within each lane the low half of the result comes from operand 0
	        // and the high half from operand 1.
	        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
	        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
	        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

	        if (UndefElts[SrcIdx]) {
	          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
	          continue;
	        }

	        APInt &Val = EltBits[SrcIdx];
	        if (IsSigned) {
	          // PACKSS: Truncate signed value with signed saturation.
	          // Source values less than dst minint are saturated to minint.
	          // Source values greater than dst maxint are saturated to maxint.
	          if (Val.isSignedIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getSignedMinValue(DstBitsPerElt);
	          else
	            Val = APInt::getSignedMaxValue(DstBitsPerElt);
	        } else {
	          // PACKUS: Truncate signed value with unsigned saturation.
	          // Source values less than zero are saturated to zero.
	          // Source values greater than dst maxuint are saturated to maxuint.
	          if (Val.isIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getNullValue(DstBitsPerElt);
	          else
	            Val = APInt::getAllOnesValue(DstBitsPerElt);
	        }
	        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
	      }
	    }

	    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
	  }

	  // Attempt to combine as shuffle.
	  SDValue Op(N, 0);
	  if (SDValue Res = combineX86ShufflesRecursively(
	          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
	    // The replacement is installed directly; nothing is returned to the caller.
	    DCI.CombineTo(N, Res);
	    return SDValue();
	  }

	  return SDValue();
	}

	/// Combine an X86 vector shift-by-immediate node (VSHLI, VSRAI or VSRLI).
	///
	/// The folds are tried in this order:
	///  - out-of-range logical shift          -> zero vector
	///  - out-of-range arithmetic shift       -> amount clamped to EltBits - 1
	///  - shift by zero                       -> the unshifted operand
	///  - shift of an all-zeros build vector  -> zero vector
	///  - (VSRLI (VSRAI X, Y), EltBits - 1)   -> (VSRLI X, EltBits - 1)
	///  - (VSRAI (VSHLI X, C1), C1)           -> X, if X has more than C1 sign bits
	///  - whole-byte logical shifts           -> shuffle combining (via CombineTo)
	///  - constant operand with no other user -> constant-folded vector
	///
	/// Operand 1 is required to be a ConstantSDNode (it is accessed with cast<>).
	/// \returns the replacement value, or an empty SDValue if nothing was folded
	/// or if the node was replaced in place through DCI.CombineTo.
	static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::VSHLI == Opcode \|\| X86ISD::VSRAI == Opcode \|\|
	X86ISD::VSRLI == Opcode) &&
	"Unexpected shift opcode");
	bool LogicalShift = X86ISD::VSHLI == Opcode \|\| X86ISD::VSRLI == Opcode;
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
	"Unexpected value type");

	// Out of range logical bit shifts are guaranteed to be zero.
	// Out of range arithmetic bit shifts splat the sign bit.
	// Only the low 8 bits of the immediate take part in the range check.
	APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
	if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
	if (LogicalShift)
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
	else
	// Clamp so the folds below work with an in-range arithmetic shift amount.
	ShiftVal = NumBitsPerElt - 1;
	}

	// Shift N0 by zero -> N0.
	if (!ShiftVal)
	return N0;

	// Shift zero -> zero.
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

	// fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
	// This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
	// TODO - support other sra opcodes as needed.
	if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
	N0.getOpcode() == X86ISD::VSRAI)
	return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

	// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
	if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
	N1 == N0.getOperand(1)) {
	SDValue N00 = N0.getOperand(0);
	unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
	if (ShiftVal.ult(NumSignBits))
	return N00;
	}

	// We can decode 'whole byte' logical bit shifts as shuffles.
	if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The replacement is installed directly; nothing is returned to the caller.
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	// Constant Folding.
	// Only done when N is the sole user of the constant operand.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	if (N->isOnlyUserOf(N0.getNode()) &&
	getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
	assert(EltBits.size() == VT.getVectorNumElements() &&
	"Unexpected shift value type");
	unsigned ShiftImm = ShiftVal.getZExtValue();
	// Apply the shift to every constant element in place.
	for (APInt &Elt : EltBits) {
	if (X86ISD::VSHLI == Opcode)
	Elt <<= ShiftImm;
	else if (X86ISD::VSRAI == Opcode)
	Elt.ashrInPlace(ShiftImm);
	else
	Elt.lshrInPlace(ShiftImm);
	}
	return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
	}

	return SDValue();
	}

	/// Try to express a PINSRB (v16i8) or PINSRW (v8i16) insertion as part of a
	/// shuffle. On success the node is replaced through DCI.CombineTo; the
	/// function itself always returns an empty SDValue.
	static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  assert(
	      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
	       (N->getOpcode() == X86ISD::PINSRW &&
	        N->getValueType(0) == MVT::v8i16)) &&
	      "Unexpected vector insertion");

	  // Use the insertion itself as the root of the recursive shuffle combine.
	  SDValue Root(N, 0);
	  SDValue Shuffle = combineX86ShufflesRecursively(
	      {Root}, 0, Root, {0}, {}, /*Depth*/ 1,
	      /*HasVarMask*/ false, DAG, DCI, Subtarget);
	  if (Shuffle)
	    DCI.CombineTo(N, Shuffle);

	  return SDValue();
	}

	/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
	/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
	/// OR -> CMPNEQSS.
	///
	/// The rewrite only happens when the subtarget has SSE2, the shared compare is
	/// an X86ISD::CMP on f32 or f64, no user of N is treated as wanting flags, and
	/// the two condition codes form the pair (COND_E, COND_NP) or
	/// (COND_NE, COND_P) in either order.
	///
	/// \returns an i8 truth value (or, with AVX-512, an element extracted from a
	/// v1i1 FSETCCM), or an empty SDValue when the pattern does not match.
	static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Set by isAndOrOfSetCCs; not read again in this function.
	unsigned opcode;

	// SSE1 supports CMP{eq\|ne}SS, and SSE2 added CMP{eq\|ne}SD, but
	// we're requiring SSE2 for both.
	if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue CMP0 = N0->getOperand(1);
	SDValue CMP1 = N1->getOperand(1);
	SDLoc DL(N);

	// The SETCCs should both refer to the same CMP.
	if (CMP0.getOpcode() != X86ISD::CMP \|\| CMP0 != CMP1)
	return SDValue();

	SDValue CMP00 = CMP0->getOperand(0);
	SDValue CMP01 = CMP0->getOperand(1);
	EVT VT = CMP00.getValueType();

	if (VT == MVT::f32 \|\| VT == MVT::f64) {
	bool ExpectingFlags = false;
	// Check for any users that want flags:
	// Any user opcode not explicitly listed as harmless below is conservatively
	// treated as wanting flags (the default label shares the flag-setting arm).
	for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	!ExpectingFlags && UI != UE; ++UI)
	switch (UI->getOpcode()) {
	default:
	case ISD::BR_CC:
	case ISD::BRCOND:
	case ISD::SELECT:
	ExpectingFlags = true;
	break;
	case ISD::CopyToReg:
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	break;
	}

	if (!ExpectingFlags) {
	enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
	enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

	// Canonicalize so that an E/NE condition, if present, ends up in cc0.
	if (cc1 == X86::COND_E \|\| cc1 == X86::COND_NE) {
	X86::CondCode tmp = cc0;
	cc0 = cc1;
	cc1 = tmp;
	}

	if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) \|\|
	(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
	// FIXME: need symbolic constants for these magic numbers.
	// See X86ATTInstPrinter.cpp:printSSECC().
	// 0 is used for the (E, NP) pair and 4 for the (NE, P) pair.
	unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
	if (Subtarget.hasAVX512()) {
	// With AVX-512 the compare produces a v1i1 mask; extract its only element.
	SDValue FSetCC =
	DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
	DAG.getConstant(x86cc, DL, MVT::i8));
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	N->getSimpleValueType(0), FSetCC,
	DAG.getIntPtrConstant(0, DL));
	}
	SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
	CMP00.getValueType(), CMP00, CMP01,
	DAG.getConstant(x86cc, DL,
	MVT::i8));

	bool is64BitFP = (CMP00.getValueType() == MVT::f64);
	MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

	if (is64BitFP && !Subtarget.is64Bit()) {
	// On a 32-bit target, we cannot bitcast the 64-bit float to a
	// 64-bit integer, since that's not a legal type. Since
	// OnesOrZeroesF is all ones or all zeroes, we don't need all the
	// bits, but can do this little dance to extract the lowest 32 bits
	// and work with those going forward.
	SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	OnesOrZeroesF);
	SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
	OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
	Vector32, DAG.getIntPtrConstant(0, DL));
	IntVT = MVT::i32;
	}

	// Reinterpret the FP result as an integer, keep only the lowest bit, and
	// narrow it to i8.
	SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
	SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
	DAG.getConstant(1, DL, IntVT));
	SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	ANDed);
	return OneBitOfTruth;
	}
	}
	}
	}
	return SDValue();
	}

	/// Fold an AND whose operand is a vector NOT into X86ISD::ANDNP:
	///   (and (xor X, all-ones), Y) -> (andnp X, Y)
	/// The NOT may appear on either side of the AND; the left operand is checked
	/// first. Only v2i64, v4i64 and v8i64 are handled.
	static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::AND);

	  EVT VT = N->getValueType(0);
	  if (!(VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64))
	    return SDValue();

	  // True when V is (xor ?, build_vector-of-all-ones).
	  auto IsVectorNot = [](SDValue V) {
	    return V.getOpcode() == ISD::XOR &&
	           ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
	  };

	  SDValue Inverted = N->getOperand(0);
	  SDValue Other = N->getOperand(1);
	  if (!IsVectorNot(Inverted)) {
	    // Fall back to the right-hand operand being the NOT.
	    if (!IsVectorNot(Other))
	      return SDValue();
	    std::swap(Inverted, Other);
	  }

	  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, Inverted.getOperand(0),
	                     Other);
	}

	// On AVX/AVX2 the type v8i1 is legalized to v8i16, an XMM-sized register,
	// while the values being compared or selected are usually YMM-sized. Mixing
	// the two produces poor code, so this routine rewrites
	//   ext (logic (trunc A), (trunc B | constant build_vector))
	// into a logic operation performed directly in the wide type, followed by
	// whatever in-register extension the original extend kind requires.
	// With AVX-512 this is still useful for removing casts around logical
	// operations on vXi1 mask types.
	static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  assert(VT.isVector() && "Expected vector type");

	  assert((N->getOpcode() == ISD::ANY_EXTEND ||
	          N->getOpcode() == ISD::ZERO_EXTEND ||
	          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

	  SDValue Narrow = N->getOperand(0);
	  EVT NarrowVT = Narrow.getValueType();

	  // The extended value must be a bitwise logic operation.
	  unsigned LogicOpc = Narrow->getOpcode();
	  switch (LogicOpc) {
	  case ISD::XOR:
	  case ISD::AND:
	  case ISD::OR:
	    break;
	  default:
	    return SDValue();
	  }

	  SDValue LHS = Narrow->getOperand(0);
	  SDValue RHS = Narrow->getOperand(1);
	  SDLoc DL(Narrow);

	  // The left side must be a truncate from exactly the wide result type.
	  if (LHS.getOpcode() != ISD::TRUNCATE ||
	      LHS->getOperand(0).getValueType() != VT)
	    return SDValue();

	  // The right side must be a matching truncate or a constant build_vector.
	  bool RHSIsTrunc = RHS.getOpcode() == ISD::TRUNCATE &&
	                    RHS.getOperand(0).getValueType() == VT;
	  if (!(RHSIsTrunc || ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())))
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrPromote(LogicOpc, VT))
	    return SDValue();

	  // Inputs of the wide operation: strip the truncates, or zero-extend the
	  // constant vector up to the wide type.
	  SDValue WideLHS = LHS->getOperand(0);
	  SDValue WideRHS = RHSIsTrunc ? RHS->getOperand(0)
	                               : DAG.getNode(ISD::ZERO_EXTEND, DL, VT, RHS);

	  // Perform the logic operation in the wide type.
	  SDValue Wide = DAG.getNode(LogicOpc, DL, VT, WideLHS, WideRHS);

	  // Re-establish the semantics of the original extension.
	  unsigned ExtOpc = N->getOpcode();
	  if (ExtOpc == ISD::ANY_EXTEND)
	    return Wide;
	  if (ExtOpc == ISD::ZERO_EXTEND)
	    return DAG.getZeroExtendInReg(Wide, DL, NarrowVT.getScalarType());
	  if (ExtOpc == ISD::SIGN_EXTEND)
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
	                       Wide, DAG.getValueType(NarrowVT));
	  llvm_unreachable("Unexpected opcode");
	}

	/// When both operands of an integer AND/OR/XOR are bitcasts from floating
	/// point values, perform the operation with the matching X86 FP logic node
	/// (FAND/FOR/FXOR) and bitcast the result back, avoiding moves between SSE and
	/// integer registers. Applies to i32 with SSE1 and to i64 with SSE2.
	static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  unsigned FPOpcode = ISD::DELETED_NODE;
	  switch (N->getOpcode()) {
	  case ISD::AND:
	    FPOpcode = X86ISD::FAND;
	    break;
	  case ISD::OR:
	    FPOpcode = X86ISD::FOR;
	    break;
	  case ISD::XOR:
	    FPOpcode = X86ISD::FXOR;
	    break;
	  default:
	    break;
	  }

	  assert(FPOpcode != ISD::DELETED_NODE &&
	         "Unexpected input node for FP logic conversion");

	  EVT VT = N->getValueType(0);
	  SDValue Cast0 = N->getOperand(0);
	  SDValue Cast1 = N->getOperand(1);
	  SDLoc DL(N);

	  // Both inputs have to be bitcasts.
	  if (Cast0.getOpcode() != ISD::BITCAST || Cast1.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  // The integer width must have an FP logic counterpart on this subtarget.
	  bool HasFPLogic = (Subtarget.hasSSE1() && VT == MVT::i32) ||
	                    (Subtarget.hasSSE2() && VT == MVT::i64);
	  if (!HasFPLogic)
	    return SDValue();

	  // Both bitcast sources have to be floating point values.
	  SDValue FP0 = Cast0.getOperand(0);
	  SDValue FP1 = Cast1.getOperand(0);
	  EVT FP0VT = FP0.getValueType();
	  EVT FP1VT = FP1.getValueType();
	  if (!FP0VT.isFloatingPoint() || !FP1VT.isFloatingPoint())
	    return SDValue();

	  SDValue FPLogic = DAG.getNode(FPOpcode, DL, FP0VT, FP0, FP1);
	  return DAG.getBitcast(VT, FPLogic);
	}

	/// Replace (and X, low-bits-mask-splat) by a logical right shift when every
	/// element of X is known to be all-zeros or all-ones, which avoids loading the
	/// mask constant. (Mask == 1 is what the x86 lowering of SETCC + ZEXT yields.)
	/// Both operands are looked at through bitcasts; the result is bitcast back to
	/// the type of N.
	static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDValue Src = peekThroughBitcasts(N->getOperand(0));
	  SDValue MaskOp = peekThroughBitcasts(N->getOperand(1));
	  EVT SrcVT = Src.getValueType();
	  EVT MaskVT = MaskOp.getValueType();

	  if (SrcVT != MaskVT || !SrcVT.isSimple() || !SrcVT.isInteger())
	    return SDValue();

	  // The mask operand must be a constant splat of contiguous low set bits.
	  APInt MaskBits;
	  bool IsLowMaskSplat =
	      ISD::isConstantSplatVector(MaskOp.getNode(), MaskBits) &&
	      MaskBits.isMask();
	  if (!IsLowMaskSplat)
	    return SDValue();

	  if (!SupportedVectorShiftWithImm(SrcVT.getSimpleVT(), Subtarget, ISD::SRL))
	    return SDValue();

	  // Every bit of each element must be a copy of its sign bit.
	  unsigned EltBits = SrcVT.getScalarSizeInBits();
	  if (DAG.ComputeNumSignBits(Src) != EltBits)
	    return SDValue();

	  // Shifting right leaves exactly the bits the mask would have kept.
	  SDLoc DL(N);
	  unsigned NumMaskBits = MaskBits.countTrailingOnes();
	  SDValue ShAmt = DAG.getConstant(EltBits - NumMaskBits, DL, MVT::i8);
	  SDValue Srl = DAG.getNode(X86ISD::VSRLI, DL, SrcVT, Src, ShAmt);
	  return DAG.getBitcast(N->getValueType(0), Srl);
	}

	// Return the index node of a load whose address was lowered from a GEP with
	// one indexing dimension, i.e. X in
	//   load (add (shl X, ...), ...)
	// Yields an empty SDValue for indexed loads or when the address does not have
	// this shape.
	static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
	  if (Ld->isIndexed())
	    return SDValue();

	  SDValue Addr = Ld->getBasePtr();
	  if (Addr.getOpcode() == ISD::ADD) {
	    SDValue Scaled = Addr.getOperand(0);
	    if (Scaled.getOpcode() == ISD::SHL)
	      return Scaled.getOperand(0);
	  }

	  return SDValue();
	}

	// Return true if BZHI can be used for values of type VT: the subtarget must
	// have BMI2 and VT must be a scalar integer of 32 bits, or of 64 bits on a
	// 64-bit subtarget.
	static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
	  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
	    return false;

	  unsigned Bits = VT.getSizeInBits();
	  if (Bits == 32)
	    return true;
	  return Bits == 64 && Subtarget.is64Bit();
	}

	// This function recognizes cases where the X86 bzhi instruction can replace
	// an 'and-load' sequence.
	// In case of loading an integer value from an array of constants which is
	// defined as follows:
	//
	// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
	//
	// then applying a bitwise and on the result with another input, the sequence
	// is equivalent to performing bzhi (zero high bits) on the input, with the
	// same index as the load.
	//
	// Either operand of the AND may be the load; each operand is tried in turn
	// and an operand that does not match is skipped. Returns the replacement
	// (and Inp, (srl all-ones, (sub bitwidth, index))), or an empty SDValue if
	// neither operand matches.
	static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  // Check if subtarget has BZHI instruction for the node's type
	  if (!hasBZHI(Subtarget, VT))
	    return SDValue();

	  // Try matching the pattern for both operands.
	  for (unsigned i = 0; i < 2; i++) {
	    SDValue N = Node->getOperand(i);
	    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

	    // Skip this operand if it is not a load instruction.
	    if (!Ld)
	      continue;

	    // Skip this operand if the load has no IR value attached to it.
	    const Value *MemOp = Ld->getMemOperand()->getValue();
	    if (!MemOp)
	      continue;

	    // The address must be a GEP into a constant global variable with a
	    // definitive initializer.
	    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp);
	    if (!GEP)
	      continue;
	    GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
	    if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
	      continue;

	    // The initializer must be an integer data array whose element width
	    // equals the width of VT and which has no more elements than bits per
	    // element.
	    Constant *Init = GV->getInitializer();
	    Type *Ty = Init->getType();
	    if (!isa<ConstantDataArray>(Init) ||
	        !Ty->getArrayElementType()->isIntegerTy() ||
	        Ty->getArrayElementType()->getScalarSizeInBits() !=
	            VT.getSizeInBits() ||
	        Ty->getArrayNumElements() >
	            Ty->getArrayElementType()->getScalarSizeInBits())
	      continue;

	    // Check if the array's constant elements are suitable to our case:
	    // element j must equal 2^j - 1.
	    uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
	    bool ConstantsMatch = true;
	    for (uint64_t j = 0; j < ArrayElementCount; j++) {
	      // Elements of an integer ConstantDataArray are ConstantInts, so the
	      // cast cannot fail.
	      auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
	      if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
	        ConstantsMatch = false;
	        break;
	      }
	    }
	    if (!ConstantsMatch)
	      continue;

	    // Get the Node which indexes into the array.
	    SDValue Index = getIndexFromUnindexedLoad(Ld);
	    if (!Index)
	      continue;

	    // Do the transformation (For 32-bit type):
	    // -> (and (load arr[idx]), inp)
	    // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
	    // that will be replaced with one bzhi instruction.
	    SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
	    SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

	    Index = DAG.getZExtOrTrunc(Index, dl, VT);

	    SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

	    SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
	    SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

	    return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
	  }
	  return SDValue();
	}

	/// Target-specific DAG combine entry point for ISD::AND nodes.
	///
	/// The SSE1-only v4i32 -> FAND rewrite is applied at any stage. Every other
	/// fold is skipped while DCI.isBeforeLegalizeOps() holds and is otherwise
	/// tried in this order: combineCompareEqual, convertIntLogicToFPLogic,
	/// combineANDXORWithAllOnesIntoANDNP, combineAndMaskToShift,
	/// combineAndLoadToBZHI, recursive shuffle combining of a vector bitmask AND,
	/// and shuffle combining of a scalar byte-mask AND with an extracted vector
	/// element.
	///
	/// \returns the replacement value, or an empty SDValue if nothing was folded
	/// or if the node was replaced in place through DCI.CombineTo.
	static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FAND to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(
	MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
	DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
	}

	// Everything below is only attempted once operations have been legalized.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
	return R;

	if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
	return ShiftRight;

	if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
	return R;

	// Attempt to recursively combine a bitmask AND with shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The replacement is installed directly; nothing is returned to the caller.
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	// Attempt to combine a scalar bitmask AND with an extracted shuffle.
	if ((VT.getScalarSizeInBits() % 8) == 0 &&
	N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
	SDValue BitMask = N->getOperand(1);
	SDValue SrcVec = N->getOperand(0).getOperand(0);
	EVT SrcVecVT = SrcVec.getValueType();

	// Check that the constant bitmask masks whole bytes.
	// The mask is decoded in 8-bit pieces; each piece must be 0x00 or 0xFF.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (VT == SrcVecVT.getScalarType() &&
	N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
	getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
	llvm::all_of(EltBits, [](APInt M) {
	return M.isNullValue() \|\| M.isAllOnesValue();
	})) {
	unsigned NumElts = SrcVecVT.getVectorNumElements();
	// Number of bytes per source vector element.
	unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
	unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

	// Create a root shuffle mask from the byte mask and the extracted index.
	// Only the bytes of the extracted element are defined: a zero mask byte
	// becomes SM_SentinelZero, a kept byte maps to itself, and every other
	// byte stays undef.
	SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i) {
	if (UndefElts[i])
	continue;
	int VecIdx = Scale * Idx + i;
	ShuffleMask[VecIdx] =
	EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
	}

	// On success, re-extract the same element from the combined shuffle.
	if (SDValue Shuffle = combineX86ShufflesRecursively(
	{SrcVec}, 0, SrcVec, ShuffleMask, {}, /Depth/ 2,
	/HasVarMask/ false, DAG, DCI, Subtarget))
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
	N->getOperand(0).getOperand(1));
	}
	}

	return SDValue();
	}

	// Try to fold:
	//   (or (and (m, y), (pandn m, x)))
	// into:
	//   (vselect m, x, y)
	// As a special case, try to fold:
	//   (or (and (m, (sub 0, x)), (pandn m, x)))
	// into:
	//   (sub (xor X, M), M)
	//
	// Applies to 128-bit vectors with SSE2 and to 256-bit vectors with AVX2
	// (hasInt256). The mask must be an integer vector whose elements are known to
	// be all-zeros or all-ones. The general vselect form additionally requires
	// SSE4.1 and is emitted on byte vectors (v16i8 / v32i8).
	// Returns the replacement bitcast to the type of N, or an empty SDValue.
	static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
	        (VT.is256BitVector() && Subtarget.hasInt256())))
	    return SDValue();

	  // Canonicalize AND to LHS.
	  if (N1.getOpcode() == ISD::AND)
	    std::swap(N0, N1);

	  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
	  // ANDNP combine allows other combines to happen that prevent matching.
	  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
	    return SDValue();

	  SDValue Mask = N1.getOperand(0);
	  SDValue X = N1.getOperand(1);
	  SDValue Y;
	  if (N0.getOperand(0) == Mask)
	    Y = N0.getOperand(1);
	  if (N0.getOperand(1) == Mask)
	    Y = N0.getOperand(0);

	  // Check to see if the mask appeared in both the AND and ANDNP.
	  if (!Y.getNode())
	    return SDValue();

	  // Look through any bitcasts on X, Y, and Mask.
	  Mask = peekThroughBitcasts(Mask);
	  X = peekThroughBitcasts(X);
	  Y = peekThroughBitcasts(Y);

	  EVT MaskVT = Mask.getValueType();
	  unsigned EltBits = MaskVT.getScalarSizeInBits();

	  // TODO: Attempt to handle floating point cases as well?
	  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
	    return SDValue();

	  SDLoc DL(N);

	  // Try to match:
	  //   (or (and (M, (sub 0, X)), (pandn M, X)))
	  // which is a special case of vselect:
	  //   (vselect M, (sub 0, X), X)
	  // Per:
	  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
	  // We know that, if fNegate is 0 or 1:
	  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
	  //
	  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
	  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
	  //   ( M      ? -X : X) == ((X ^     M    ) + (M & 1))
	  // This lets us transform our vselect to:
	  //   (add (xor X, M), (and M, 1))
	  // And further to:
	  //   (sub (xor X, M), M)
	  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
	      DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
	    // True when N computes (sub all-zeros, V).
	    auto IsNegV = [](SDNode *N, SDValue V) {
	      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
	             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
	    };
	    SDValue V;
	    if (IsNegV(Y.getNode(), X))
	      V = X;
	    else if (IsNegV(X.getNode(), Y))
	      V = Y;

	    if (V) {
	      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
	      SDValue SubOp2 = Mask;

	      // If the negate was on the false side of the select, then
	      // the operands of the SUB need to be swapped. PR 27251.
	      // This is because the pattern being matched above is
	      //   (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
	      // but if the pattern matched was
	      //   (vselect M, X, (sub (0, X))), that is really negation of the
	      // pattern above, -(vselect M, (sub 0, X), X), and therefore the
	      // replacement pattern also needs to be a negation of the replacement
	      // pattern above. And -(sub X, Y) is just sub (Y, X), so swapping the
	      // operands of the sub accomplishes the negation of the replacement
	      // pattern.
	      if (V == Y)
	        std::swap(SubOp1, SubOp2);

	      SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
	      return DAG.getBitcast(VT, Res);
	    }
	  }

	  // PBLENDVB is only available on SSE 4.1.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  // Pick the byte vector matching the width of VT. VT is known to be either
	  // a 128-bit or a 256-bit vector here, so keying on the width (rather than
	  // on one specific 256-bit type) keeps the bitcasts below size-preserving
	  // for every accepted VT.
	  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;

	  X = DAG.getBitcast(BlendVT, X);
	  Y = DAG.getBitcast(BlendVT, Y);
	  Mask = DAG.getBitcast(BlendVT, Mask);
	  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
	  return DAG.getBitcast(VT, Mask);
	}

	// Helper function for combineOrCmpEqZeroToCtlzSrl
	// Transforms:
	//   seteq(cmp x, 0)
	// into:
	//   srl(ctlz x), log2(bitsize(x))
	// Input pattern is checked by caller.
	//
	// Op is the SETCC node; its operand 1 is the CMP whose operand 0 is x.
	// The result is zero-extended or truncated to ExtTy.
	static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
	                                          SelectionDAG &DAG) {
	  SDValue Cmp = Op.getOperand(1);
	  EVT VT = Cmp.getOperand(0).getValueType();
	  unsigned Log2b = Log2_32(VT.getSizeInBits());
	  SDLoc dl(Op);
	  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
	  // The result of the shift is true or false, and on X86, the 32-bit
	  // encoding of shr and lzcnt is more desirable.
	  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
	  // The shift amount is an i8 constant. It must not take the type of the
	  // compared value, which may be wider than the i32 being shifted.
	  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
	                            DAG.getConstant(Log2b, dl, MVT::i8));
	  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
	}

	// Try to transform:
	// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
	// into:
	// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
	// Will also attempt to match more generic cases, eg:
	// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
	// Only applies if the target supports the FastLZCNT feature.
	//
	// N is the zero-extend node. The OR chain below it is walked downwards while
	// each level is or(or, setcc); the innermost OR must be or(setcc, setcc).
	// The chain is then rebuilt from the innermost OR outwards with every setcc
	// replaced by its srl(ctlz) form, and the result is zero-extended to the type
	// of N. Returns an empty SDValue if the pattern does not match.
	static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize() \|\| !Subtarget.getTargetLowering()->isCtlzFast())
	return SDValue();

	// An OR node that has a single use.
	auto isORCandidate = [](SDValue N) {
	return (N->getOpcode() == ISD::OR && N->hasOneUse());
	};

	// Check the zero extend is extending to 32-bit or more. The code generated by
	// srl(ctlz) for 16-bit or less variants of the pattern would require extra
	// instructions to clear the upper bits.
	if (!N->hasOneUse() \|\| !N->getSimpleValueType(0).bitsGE(MVT::i32) \|\|
	!isORCandidate(N->getOperand(0)))
	return SDValue();

	// Check the node matches: setcc(eq, cmp 0)
	// NOTE(review): the final bitsGE test inspects the value type of the CMP node
	// itself (operand 1 of the SETCC), not the type of the value being compared —
	// confirm this is what is intended.
	auto isSetCCCandidate = [](SDValue N) {
	return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
	X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
	N->getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(N->getOperand(1).getOperand(1)) &&
	N->getOperand(1).getValueType().bitsGE(MVT::i32);
	};

	SDNode *OR = N->getOperand(0).getNode();
	SDValue LHS = OR->getOperand(0);
	SDValue RHS = OR->getOperand(1);

	// Save nodes matching or(or, setcc(eq, cmp 0)).
	// Each iteration records the current OR and descends into its OR operand.
	SmallVector<SDNode *, 2> ORNodes;
	while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
	ORNodes.push_back(OR);
	OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	}

	// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
	if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	!isORCandidate(SDValue(OR, 0)))
	return SDValue();

	// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
	// to
	// or(srl(ctlz),srl(ctlz)).
	// The dag combiner can then fold it into:
	// srl(or(ctlz, ctlz)).
	EVT VT = OR->getValueType(0);
	SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
	SDValue Ret, NewRHS;
	if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

	if (!Ret)
	return SDValue();

	// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
	// The saved ORs are popped innermost-first, so Ret grows outwards.
	while (ORNodes.size() > 0) {
	OR = ORNodes.pop_back_val();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
	if (RHS->getOpcode() == ISD::OR)
	std::swap(LHS, RHS);
	EVT VT = OR->getValueType(0);
	SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
	if (!NewRHS)
	return SDValue();
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
	}

	// Re-apply the zero extension that N performed on the original OR.
	if (Ret)
	Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

	return Ret;
	}

	/// DAG combine for an OR node \p N (the dispatch site is outside this view; the
	/// name and the nodes produced below indicate N is an ISD::OR).
	///
	/// In order, this:
	///  - rewrites a v4i32 OR as X86ISD::FOR on v4f32 bitcasts when only SSE1 is
	///    available;
	///  - bails out while the combiner is still before operation legalization;
	///  - tries combineCompareEqual, convertIntLogicToFPLogic and
	///    combineLogicBlendIntoPBLENDV;
	///  - finally, for i16/i32/i64 only, tries to fold a SHL/SRL pair into a single
	///    X86ISD::SHLD or X86ISD::SHRD node.
	///
	/// Returns the replacement value, or an empty SDValue if nothing matched.
	static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FOR to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(MVT::v4i32,
	DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N0),
	DAG.getBitcast(MVT::v4f32, N1)));
	}

	// Everything below runs only once operation legalization has started.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
	return R;

	// The SHLD/SHRD fold below is only attempted for these scalar widths.
	if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
	bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

	// SHLD/SHRD instructions have lower register pressure, but on some
	// platforms they have higher latency than the equivalent
	// series of shifts/or that would otherwise be generated.
	// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
	// have higher latencies and we are not optimizing for size.
	if (!OptForSize && Subtarget.isSHLDSlow())
	return SDValue();

	// Canonicalize so that N0 is the SHL and N1 is the SRL.
	if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	std::swap(N0, N1);
	if (N0.getOpcode() != ISD::SHL \|\| N1.getOpcode() != ISD::SRL)
	return SDValue();
	// Both shifts must have no users other than this OR.
	if (!N0.hasOneUse() \|\| !N1.hasOneUse())
	return SDValue();

	// Both shift amounts must be of type i8; after that check, look through a
	// TRUNCATE feeding either amount so the wider expressions can be compared.
	SDValue ShAmt0 = N0.getOperand(1);
	if (ShAmt0.getValueType() != MVT::i8)
	return SDValue();
	SDValue ShAmt1 = N1.getOperand(1);
	if (ShAmt1.getValueType() != MVT::i8)
	return SDValue();
	if (ShAmt0.getOpcode() == ISD::TRUNCATE)
	ShAmt0 = ShAmt0.getOperand(0);
	if (ShAmt1.getOpcode() == ISD::TRUNCATE)
	ShAmt1 = ShAmt1.getOperand(0);

	SDLoc DL(N);
	// Default to SHLD. If the SHL's amount is the SUB/XOR expression, the roles
	// are reversed: switch to SHRD and swap operands/amounts so that ShAmt1 is
	// always the SUB/XOR side examined below.
	unsigned Opc = X86ISD::SHLD;
	SDValue Op0 = N0.getOperand(0);
	SDValue Op1 = N1.getOperand(0);
	if (ShAmt0.getOpcode() == ISD::SUB \|\|
	ShAmt0.getOpcode() == ISD::XOR) {
	Opc = X86ISD::SHRD;
	std::swap(Op0, Op1);
	std::swap(ShAmt0, ShAmt1);
	}

	// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
	// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
	unsigned Bits = VT.getSizeInBits();
	if (ShAmt1.getOpcode() == ISD::SUB) {
	// Match ShAmt1 == (Bits - ShAmt0), looking through a TRUNCATE on the
	// subtracted operand.
	SDValue Sum = ShAmt1.getOperand(0);
	if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
	SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
	if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op1 = ShAmt1Op1.getOperand(0);
	if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
	return DAG.getNode(Opc, DL, VT,
	Op0, Op1,
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	}
	} else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
	// Both amounts are constants that must add up to the bit width. ShAmt1
	// being a constant means the SHRD swap above did not happen (a swap would
	// have left a SUB/XOR in ShAmt1), so N0/N1's operands are Op0/Op1 here.
	ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
	if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
	return DAG.getNode(Opc, DL, VT,
	N0.getOperand(0), N1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	} else if (ShAmt1.getOpcode() == ISD::XOR) {
	// Match ShAmt1 == XOR(ShAmt0, Bits - 1) where the other value has already
	// been shifted by one in the same direction (the last two patterns above).
	SDValue Mask = ShAmt1.getOperand(1);
	if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
	// The pre-shift by one is an SRL for SHLD and a SHL for SHRD.
	unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
	SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
	if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op0 = ShAmt1Op0.getOperand(0);
	if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
	if (Op1.getOpcode() == InnerShift &&
	isa<ConstantSDNode>(Op1.getOperand(1)) &&
	Op1.getConstantOperandVal(1) == 1) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
	if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
	Op1.getOperand(0) == Op1.getOperand(1)) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	}
	}
	}

	return SDValue();
	}

	/// Recognize a scalar sign-bit test written as
	///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
	/// and rebuild it as the comparison
	///   SETGT(X, -1)
	/// Returns an empty SDValue when \p N does not have exactly that shape.
	static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
	  // The rewrite is only worthwhile for i8 / i1 results.
	  EVT ResVT = N->getValueType(0);
	  if (!(ResVT == MVT::i8 || ResVT == MVT::i1))
	    return SDValue();

	  SDValue Trunc = N->getOperand(0);
	  SDValue XorRHS = N->getOperand(1);

	  // Operand 0 has to be a single-use truncate ...
	  if (Trunc.getOpcode() != ISD::TRUNCATE)
	    return SDValue();
	  if (!Trunc.hasOneUse())
	    return SDValue();

	  // ... and operand 1 has to be the constant one.
	  if (!isOneConstant(XorRHS))
	    return SDValue();

	  // SetCC on x86 zero extends, so the truncated value must come from a
	  // single-use logical shift right.
	  SDValue Srl = Trunc.getOperand(0);
	  if (Srl.getOpcode() != ISD::SRL)
	    return SDValue();
	  if (!Srl.hasOneUse())
	    return SDValue();

	  // Only shifts performed in i16, i32 or i64 are handled.
	  EVT SrlVT = Srl.getValueType();
	  bool IsHandledWidth =
	      SrlVT == MVT::i16 || SrlVT == MVT::i32 || SrlVT == MVT::i64;
	  if (!IsHandledWidth)
	    return SDValue();

	  // The shift amount must be the constant that isolates the sign bit.
	  auto *ShAmtC = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
	  if (!ShAmtC || ShAmtC->getZExtValue() != SrlVT.getSizeInBits() - 1)
	    return SDValue();

	  // Build X > -1. SETGE against 0 would also be correct, but SETGT is the
	  // canonical-looking form and matches up with what TranslateX86CC does.
	  SDLoc DL(N);
	  SDValue X = Srl.getOperand(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT CCVT =
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResVT);
	  SDValue MinusOne = DAG.getConstant(-1, DL, X.getValueType());
	  SDValue Cmp = DAG.getSetCC(DL, CCVT, X, MinusOne, ISD::SETGT);

	  // Widen the comparison result if the setcc type differs from the xor type.
	  if (CCVT == ResVT)
	    return Cmp;
	  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResVT, Cmp);
	}

	/// Rewrite a vector sign-bit test of the form
	///   xor (sra X, elt_size(X)-1), -1
	/// as
	///   pcmpgt X, -1
	///
	/// Intended to run before type legalization, since the pattern may not
	/// survive it.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isSimple())
	    return SDValue();

	  // Decide whether the subtarget provides the feature level this function
	  // requires for the given vector type.
	  bool HasRequiredFeature = false;
	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	    HasRequiredFeature = Subtarget.hasSSE2();
	    break;
	  case MVT::v2i64:
	    HasRequiredFeature = Subtarget.hasSSE42();
	    break;
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	    HasRequiredFeature = Subtarget.hasAVX2();
	    break;
	  default:
	    break;
	  }
	  if (!HasRequiredFeature)
	    return SDValue();

	  // Operand 0 must be a single-use arithmetic shift right and operand 1 an
	  // all-ones build vector, i.e. the xor is a 'not'.
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  if (Sra.getOpcode() != ISD::SRA)
	    return SDValue();
	  if (!Sra.hasOneUse())
	    return SDValue();
	  if (!ISD::isBuildVectorAllOnes(AllOnes.getNode()))
	    return SDValue();

	  // The shift amount must be a constant splat equal to (element bits - 1),
	  // so that the sign bit is smeared across each element.
	  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Sra.getOperand(1));
	  if (!AmtBV)
	    return SDValue();

	  EVT EltVT = Sra.getValueType().getVectorElementType();
	  auto *SplatAmt = AmtBV->getConstantSplatNode();
	  if (!SplatAmt)
	    return SDValue();
	  if (SplatAmt->getZExtValue() != EltVT.getSizeInBits() - 1)
	    return SDValue();

	  // Emit X > -1; SSE/AVX have no greater-than-or-equal-to-zero compare.
	  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Sra.getOperand(0), AllOnes);
	}

	/// Return true if a saturating truncation from type \p SrcVT to \p DstVT is
	/// usable on the given \p Subtarget.
	static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
	                                        const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX512())
	    return false;

	  // FIXME: Scalar type may be supported if we move it to vector register.
	  if (!SrcVT.isVector())
	    return false;
	  if (!SrcVT.isSimple())
	    return false;
	  if (SrcVT.getSizeInBits() > 512)
	    return false;

	  // Source elements must be 16..64 bits wide, destination elements 8..32.
	  const unsigned SrcEltBits = SrcVT.getScalarType().getSizeInBits();
	  const unsigned DstEltBits = DstVT.getScalarType().getSizeInBits();
	  const bool SrcEltOk = SrcEltBits >= 16 && SrcEltBits <= 64;
	  const bool DstEltOk = DstEltBits >= 8 && DstEltBits <= 32;
	  if (!SrcEltOk || !DstEltOk)
	    return false;

	  // Sources narrower than 512 bits need VLX; source elements narrower than
	  // 32 bits need BWI.
	  return (SrcVT.is512BitVector() || Subtarget.hasVLX()) &&
	         (SrcEltBits >= 32 || Subtarget.hasBWI());
	}

	/// Match the operand of an unsigned saturating truncation:
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
	/// \p In is the value being truncated and \p VT the destination type.
	/// Returns x on a match, otherwise an empty SDValue.
	static SDValue detectUSatPattern(SDValue In, EVT VT) {
	  if (In.getOpcode() != ISD::UMIN)
	    return SDValue();

	  // We truncate from In's type down to VT, so In must be strictly wider.
	  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
	         "Unexpected types for truncate operation");

	  // The clamp bound has to be a constant splat ...
	  APInt Bound;
	  if (!ISD::isConstantSplatVector(In.getOperand(1).getNode(), Bound))
	    return SDValue();

	  // ... equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to the element
	  // size of the destination type, i.e. a low-bit mask of that width.
	  if (!Bound.isMask(VT.getScalarSizeInBits()))
	    return SDValue();

	  return In.getOperand(0);
	}

	/// Same match as detectUSatPattern:
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
	/// but additionally requires that the source/destination types pass
	/// isSATValidOnAVX512Subtarget (so a VPMOVUS* instruction can be used).
	/// Returns the source value to be truncated, or SDValue() if there is no match.
	static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
	                                       const X86Subtarget &Subtarget) {
	  const bool TypesOk =
	      isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget);
	  return TypesOk ? detectUSatPattern(In, VT) : SDValue();
	}

	/// Try to turn a truncate of \p In to \p VT into an X86ISD::VTRUNCUS node.
	/// This requires both types to be legal, \p In to match detectUSatPattern, and
	/// the type pair to pass isSATValidOnAVX512Subtarget. Returns an empty SDValue
	/// otherwise.
	static SDValue
	combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
	                        const X86Subtarget &Subtarget) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const EVT InVT = In.getValueType();
	  if (!TLI.isTypeLegal(InVT) || !TLI.isTypeLegal(VT))
	    return SDValue();

	  SDValue USatVal = detectUSatPattern(In, VT);
	  if (!USatVal)
	    return SDValue();

	  if (!isSATValidOnAVX512Subtarget(InVT, VT, Subtarget))
	    return SDValue();

	  return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
	}

	/// This function detects the AVG pattern between vectors of unsigned i8/i16,
	/// which is c = (a + b + 1) / 2, and replaces this operation with the
	/// efficient X86ISD::AVG instruction.
	///
	/// \param In        The wide value that is about to be truncated to \p VT.
	/// \param VT        The narrow result type; must be a simple vector of i8 or
	///                  i16 with a power-of-two element count.
	/// \param DAG       DAG used to create the replacement nodes.
	/// \param Subtarget Queried for SSE2 (required) and for AVX2/BWI, which decide
	///                  how wide each emitted AVG node may be.
	/// \param DL        Debug location for the created nodes.
	/// \returns The AVG value (or a CONCAT_VECTORS of split AVG nodes) of type
	///          \p VT, or an empty SDValue if the pattern is not matched.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	if (!VT.isVector() \|\| !VT.isSimple())
	return SDValue();
	EVT InVT = In.getValueType();
	unsigned NumElems = VT.getVectorNumElements();

	EVT ScalarVT = VT.getVectorElementType();
	if (!((ScalarVT == MVT::i8 \|\| ScalarVT == MVT::i16) &&
	isPowerOf2_32(NumElems)))
	return SDValue();

	// InScalarVT is the intermediate type in AVG pattern and it should be greater
	// than the original input type (i8/i16).
	EVT InScalarVT = InVT.getVectorElementType();
	if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
	return SDValue();

	if (!Subtarget.hasSSE2())
	return SDValue();

	// Detect the following pattern:
	//
	// %1 = zext <N x i8> %a to <N x i32>
	// %2 = zext <N x i8> %b to <N x i32>
	// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
	// %4 = add nuw nsw <N x i32> %3, %2
	// %5 = lshr <N x i32> %4, <i32 1 x N>
	// %6 = trunc <N x i32> %5 to <N x i8>
	//
	// In AVX512, the last instruction can also be a trunc store.
	if (In.getOpcode() != ISD::SRL)
	return SDValue();

	// A lambda checking the given SDValue is a constant vector and each element
	// is in the range [Min, Max].
	auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
	BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV \|\| !BV->isConstant())
	return false;
	for (SDValue Op : V->ops()) {
	// Every operand must be a ConstantSDNode; anything else (for example an
	// undef element) rejects the vector.
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C)
	return false;
	uint64_t Val = C->getZExtValue();
	if (Val < Min \|\| Val > Max)
	return false;
	}
	return true;
	};

	// Split vectors to legal target size and apply AVG.
	// The widest single AVG is 512 bits with BWI, 256 bits with AVX2 and
	// 128 bits otherwise; wider VTs are cut into NumSubs equal pieces.
	auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
	unsigned NumSubs = 1;
	if (Subtarget.hasBWI()) {
	if (VT.getSizeInBits() > 512)
	NumSubs = VT.getSizeInBits() / 512;
	} else if (Subtarget.hasAVX2()) {
	if (VT.getSizeInBits() > 256)
	NumSubs = VT.getSizeInBits() / 256;
	} else {
	if (VT.getSizeInBits() > 128)
	NumSubs = VT.getSizeInBits() / 128;
	}

	if (NumSubs == 1)
	return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);

	// Emit one AVG per sub-vector and concatenate the results back into VT.
	SmallVector<SDValue, 4> Subs;
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
	VT.getVectorNumElements() / NumSubs);
	for (unsigned i = 0; i != NumSubs; ++i) {
	unsigned Idx = i * SubVT.getVectorNumElements();
	SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
	SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
	Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
	};

	// Check if each element of the vector is right-shifted by one (In is an SRL
	// whose shift amount must be a constant vector of ones).
	auto LHS = In.getOperand(0);
	auto RHS = In.getOperand(1);
	if (!IsConstVectorInRange(RHS, 1, 1))
	return SDValue();
	if (LHS.getOpcode() != ISD::ADD)
	return SDValue();

	// Detect a pattern of a + b + 1 where the order doesn't matter.
	SDValue Operands[3];
	Operands[0] = LHS.getOperand(0);
	Operands[1] = LHS.getOperand(1);

	// Take care of the case when one of the operands is a constant vector whose
	// element is in the range [1, 256] for i8 or [1, 65536] for i16.
	if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
	Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	Operands[0].getOperand(0).getValueType() == VT) {
	// The pattern is detected. Subtract one from the constant vector, then
	// demote it and emit X86ISD::AVG instruction.
	SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
	return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
	}

	// Otherwise one of the two addends must itself be an ADD. Move it into
	// Operands[1] and unpack it, so Operands[0..2] hold the three addends.
	if (Operands[0].getOpcode() == ISD::ADD)
	std::swap(Operands[0], Operands[1]);
	else if (Operands[1].getOpcode() != ISD::ADD)
	return SDValue();
	Operands[2] = Operands[1].getOperand(0);
	Operands[1] = Operands[1].getOperand(1);

	// Now we have three operands of two additions. Check that one of them is a
	// constant vector with ones, and the other two are promoted from i8/i16.
	for (int i = 0; i < 3; ++i) {
	if (!IsConstVectorInRange(Operands[i], 1, 1))
	continue;
	// Park the all-ones vector in slot 2; slots 0 and 1 are then a and b.
	std::swap(Operands[i], Operands[2]);

	// Check if Operands[0] and Operands[1] are results of type promotion.
	for (int j = 0; j < 2; ++j)
	if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|
	Operands[j].getOperand(0).getValueType() != VT)
	return SDValue();

	// The pattern is detected, emit X86ISD::AVG instruction.
	return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
	}

	return SDValue();
	}

	/// DAG combine for a load node \p N: split a 256-bit non-extending vector load
	/// into two half-width loads joined by CONCAT_VECTORS when either
	///  - the load is non-temporal, the subtarget lacks Int256 and the alignment
	///    is at least 16, or
	///  - the target allows the access at this alignment but reports it as not
	///    fast.
	/// Returns the result of DCI.CombineTo on success, otherwise an empty SDValue.
	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  LoadSDNode *Ld = cast<LoadSDNode>(N);
	  EVT RegVT = Ld->getValueType(0);
	  EVT MemVT = Ld->getMemoryVT();
	  SDLoc dl(Ld);

	  // Only 256-bit, non-extending loads seen after op legalization has started
	  // are candidates.
	  if (!RegVT.is256BitVector() || DCI.isBeforeLegalizeOps() ||
	      Ld->getExtensionType() != ISD::NON_EXTLOAD)
	    return SDValue();

	  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
	  // into two 16-byte operations. Also split non-temporal aligned loads on
	  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
	  const unsigned Alignment = Ld->getAlignment();
	  bool ShouldSplit =
	      Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16;
	  if (!ShouldSplit) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    bool IsFast;
	    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
	                               Ld->getAddressSpace(), Alignment, &IsFast))
	      ShouldSplit = !IsFast;
	  }
	  if (!ShouldSplit)
	    return SDValue();

	  const unsigned NumElems = RegVT.getVectorNumElements();
	  if (NumElems < 2)
	    return SDValue();

	  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                NumElems / 2);
	  const auto MMOFlags = Ld->getMemOperand()->getFlags();

	  // Low half: loaded from the original address.
	  SDValue LoPtr = Ld->getBasePtr();
	  SDValue LoLoad = DAG.getLoad(HalfVT, dl, Ld->getChain(), LoPtr,
	                               Ld->getPointerInfo(), Alignment, MMOFlags);

	  // High half: 16 bytes further on, with correspondingly clamped alignment.
	  SDValue HiPtr = DAG.getMemBasePlusOffset(LoPtr, 16, dl);
	  SDValue HiLoad = DAG.getLoad(HalfVT, dl, Ld->getChain(), HiPtr,
	                               Ld->getPointerInfo().getWithOffset(16),
	                               MinAlign(Alignment, 16U), MMOFlags);

	  // Merge both output chains, then glue the halves back together.
	  SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                              LoLoad.getValue(1), HiLoad.getValue(1));
	  SDValue Joined =
	      DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, LoLoad, HiLoad);
	  return DCI.CombineTo(N, Joined, Chain, true);
	}

	/// If \p V is a build vector of i1 constants in which exactly one element is
	/// all-ones (true), return the operand index of that element. Undef elements
	/// are ignored. In every other case return -1.
	static int getOneTrueElt(SDValue V) {
	  // This needs to be a build vector of booleans.
	  // TODO: Checking for the i1 type matches the IR definition for the mask,
	  // but the mask check could be loosened to i8 or other types. That might
	  // also require checking more than 'allOnesValue'; eg, the x86 HW
	  // instructions only require that the MSB is set for each mask element.
	  // The ISD::MSTORE comments/definition do not specify how the mask operand
	  // is formatted.
	  auto *BV = dyn_cast<BuildVectorSDNode>(V);
	  if (!BV)
	    return -1;
	  if (BV->getValueType(0).getVectorElementType() != MVT::i1)
	    return -1;

	  int FoundIdx = -1;
	  for (unsigned Idx = 0, E = BV->getValueType(0).getVectorNumElements();
	       Idx < E; ++Idx) {
	    const SDValue &Elt = BV->getOperand(Idx);
	    if (Elt.isUndef())
	      continue;

	    // Any non-constant element disqualifies the whole mask.
	    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	    if (!EltC)
	      return -1;

	    if (!EltC->getAPIntValue().isAllOnesValue())
	      continue;

	    // A second true element is one too many.
	    if (FoundIdx >= 0)
	      return -1;
	    FoundIdx = Idx;
	  }
	  return FoundIdx;
	}

	/// Given a masked memory load/store operation, return true if its mask has
	/// exactly one true element (as determined by getOneTrueElt). In that case
	/// also set:
	///  - \p Addr to the memory address of the scalar element,
	///  - \p Index to the vector index (as an intptr constant) of that element,
	///  - \p Alignment to the alignment usable for the scalar access.
	/// The out-parameters are left untouched when false is returned.
	static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
	                                         SelectionDAG &DAG, SDValue &Addr,
	                                         SDValue &Index, unsigned &Alignment) {
	  const int EltIdx = getOneTrueElt(MaskedOp->getMask());
	  if (EltIdx < 0)
	    return false;

	  SDLoc DL(MaskedOp);
	  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
	  const unsigned EltBytes = EltVT.getStoreSize();

	  // Element 0 lives at the base pointer; any other element is EltIdx
	  // store-sized slots past it.
	  SDValue Base = MaskedOp->getBasePtr();
	  if (EltIdx == 0) {
	    Addr = Base;
	  } else {
	    unsigned ByteOffset = EltIdx * EltBytes;
	    Addr = DAG.getMemBasePlusOffset(Base, ByteOffset, DL);
	  }

	  Index = DAG.getIntPtrConstant(EltIdx, DL);
	  Alignment = MinAlign(MaskedOp->getAlignment(), EltBytes);
	  return true;
	}

	/// If exactly one element of the mask is set for a non-extending masked load,
	/// it is a scalar load and vector insert.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// \param ML  The masked load to simplify.
	/// \param DAG DAG used to create the replacement nodes.
	/// \param DCI Combiner interface used to replace both results of \p ML.
	/// \returns The result of DCI.CombineTo on success, or an empty SDValue if the
	///          mask does not have exactly one true element.
	static SDValue
	reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();

	  // Addr already points at the selected element, so the pointer info attached
	  // to the scalar load must be advanced by the same number of bytes.
	  // Otherwise the memory operand would describe the bytes of element 0
	  // instead of the bytes actually read. The offset is recomputed here from
	  // the element index, using the same store-size-based stride that
	  // getParamsForOneTrueMaskedElt used to form Addr.
	  uint64_t EltIdx = cast<ConstantSDNode>(VecIndex)->getZExtValue();
	  int64_t Offset =
	      EltIdx * ML->getMemoryVT().getVectorElementType().getStoreSize();

	  // Load the one scalar element that is specified by the mask using the
	  // appropriate offset from the base pointer.
	  SDValue Load =
	      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
	                  ML->getPointerInfo().getWithOffset(Offset), Alignment,
	                  ML->getMemOperand()->getFlags());

	  // Insert the loaded element into the appropriate place in the vector; all
	  // other lanes keep the pass-through value.
	  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
	                               Load, VecIndex);
	  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
	}

	/// Simplify a masked load whose mask is a build vector of constants.
	///
	/// If neither the first nor the last mask element is a null constant, the
	/// masked load becomes a full vector load plus a select. Otherwise, unless the
	/// pass-through is already undef, it becomes a masked load with an undef
	/// pass-through plus a select against the original pass-through.
	/// Returns an empty SDValue when nothing is changed.
	static SDValue
	combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Mask = ML->getMask();
	  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);

	  // Blend freshly loaded lanes with the original pass-through under the
	  // original mask, then replace both results (value and chain) of ML.
	  auto ReplaceWithBlend = [&](SDValue Loaded) {
	    SDValue Blend = DAG.getSelect(DL, VT, Mask, Loaded, ML->getSrc0());
	    return DCI.CombineTo(ML, Blend, Loaded.getValue(1), true);
	  };

	  // If we are loading the first and last elements of a vector, it is safe and
	  // always faster to load the whole vector. Replace the masked load with a
	  // vector load and select.
	  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(Mask);
	  const unsigned LastIdx = VT.getVectorNumElements() - 1;
	  if (!isNullConstant(MaskBV->getOperand(0)) &&
	      !isNullConstant(MaskBV->getOperand(LastIdx))) {
	    SDValue FullLoad = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
	                                   ML->getMemOperand());
	    return ReplaceWithBlend(FullLoad);
	  }

	  // Convert a masked load with a constant mask into a masked load and a select.
	  // This allows the select operation to use a faster kind of select instruction
	  // (for example, vblendvps -> vblendps).

	  // An undef pass-through is exactly what this rewrite produces, so matching
	  // it again would loop forever.
	  if (ML->getSrc0().isUndef())
	    return SDValue();

	  // The new masked load has an undef pass-through operand. The select uses the
	  // original pass-through operand.
	  SDValue UndefPassThru = DAG.getUNDEF(VT);
	  SDValue MaskedLd = DAG.getMaskedLoad(VT, DL, ML->getChain(),
	                                       ML->getBasePtr(), Mask, UndefPassThru,
	                                       ML->getMemoryVT(), ML->getMemOperand(),
	                                       ML->getExtensionType());
	  return ReplaceWithBlend(MaskedLd);
	}

	/// DAG combine for a masked load node \p N.
	///
	/// - Expanding loads are left alone.
	/// - Non-extending loads are handed to reduceMaskedLoadToScalarLoad and, when
	///   the subtarget lacks AVX512, to combineMaskedLoadConstantMask.
	/// - Sign-extending loads are rewritten as a non-extending masked load of a
	///   vector with memory-sized elements, followed by an X86ISD::VSEXT extension
	///   built by getExtendInVec.
	/// - Every other case returns an empty SDValue.
	static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

	// TODO: Expanding load with constant mask may be optimized as well.
	if (Mld->isExpandingLoad())
	return SDValue();

	if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
	if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
	return ScalarLoad;
	// TODO: Do some AVX512 subsets benefit from this transform?
	if (!Subtarget.hasAVX512())
	if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
	return Blend;
	}

	// Only sign-extending loads are handled below. A non-extending load that
	// was not combined above also exits here.
	if (Mld->getExtensionType() != ISD::SEXTLOAD)
	return SDValue();

	// Resolve extending loads.
	EVT VT = Mld->getValueType(0);
	unsigned NumElems = VT.getVectorNumElements();
	EVT LdVT = Mld->getMemoryVT();
	SDLoc dl(Mld);

	assert(LdVT != VT && "Cannot extend to the same type");
	unsigned ToSz = VT.getScalarSizeInBits();
	unsigned FromSz = LdVT.getScalarSizeInBits();
	// From/To sizes and ElemCount must be pow of two.
	assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	"Unexpected size for extending masked load");

	// Number of memory-sized elements that fit in one result element.
	unsigned SizeRatio = ToSz / FromSz;
	assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle.
	// It has the memory element type and the same total width as VT.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	LdVT.getScalarType(), NumElems*SizeRatio);
	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Convert Src0 value.
	// Reinterpret the pass-through in the wide type; unless it is undef, move
	// narrow element i * SizeRatio into lane i (the remaining lanes are undef).
	SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
	if (!Mld->getSrc0().isUndef()) {
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	"WideVecVT should be legal");
	WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
	DAG.getUNDEF(WideVecVT), ShuffleVec);
	}

	// Prepare the new mask.
	SDValue NewMask;
	SDValue Mask = Mld->getMask();
	if (Mask.getValueType() == VT) {
	// Mask and original value have the same type.
	// Apply the same lane compaction as above. Lanes at and beyond NumElems use
	// shuffle index NumElems * SizeRatio, i.e. element 0 of the second shuffle
	// operand, which is the all-zero vector.
	NewMask = DAG.getBitcast(WideVecVT, Mask);
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;
	for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
	ShuffleVec[i] = NumElems * SizeRatio;
	NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	DAG.getConstant(0, dl, WideVecVT),
	ShuffleVec);
	} else {
	// Otherwise the mask is a vector of i1: widen it by concatenating the
	// original mask with zero vectors of the same type.
	assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	unsigned WidenNumElts = NumElems*SizeRatio;
	unsigned MaskNumElts = VT.getVectorNumElements();
	EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	WidenNumElts);

	unsigned NumConcat = WidenNumElts / MaskNumElts;
	SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
	Ops[0] = Mask;
	NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	}

	// Emit the non-extending wide masked load, then sign-extend its low lanes
	// to VT and replace both results of N.
	SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
	Mld->getBasePtr(), NewMask, WideSrc0,
	Mld->getMemoryVT(), Mld->getMemOperand(),
	ISD::NON_EXTLOAD);
	SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
	return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
	}

	/// If exactly one element of the mask is set for a non-truncating masked store,
	/// it is a vector extract and scalar store.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// \param MS  The masked store to simplify.
	/// \param DAG DAG used to create the replacement nodes.
	/// \returns The scalar store node, or an empty SDValue if the mask does not
	///          have exactly one true element.
	static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
	                                              SelectionDAG &DAG) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  // Extract the one scalar element that is actually being stored.
	  SDLoc DL(MS);
	  EVT VT = MS->getValue().getValueType();
	  EVT EltVT = VT.getVectorElementType();
	  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
	                                MS->getValue(), VecIndex);

	  // Addr already points at the selected element, so the pointer info attached
	  // to the scalar store must be advanced by the same number of bytes.
	  // Otherwise the memory operand would describe the bytes of element 0
	  // instead of the bytes actually written. The offset is recomputed here from
	  // the element index, using the same store-size-based stride that
	  // getParamsForOneTrueMaskedElt used to form Addr.
	  uint64_t EltIdx = cast<ConstantSDNode>(VecIndex)->getZExtValue();
	  int64_t Offset =
	      EltIdx * MS->getMemoryVT().getVectorElementType().getStoreSize();

	  // Store that element at the appropriate offset from the base pointer.
	  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
	                      MS->getPointerInfo().getWithOffset(Offset), Alignment,
	                      MS->getMemOperand()->getFlags());
	}

	/// DAG combine for a masked store node \p N.
	///
	/// - Compressing stores are left alone.
	/// - Non-truncating stores are handed to reduceMaskedStoreToScalarStore; if
	///   that fails and the mask is (pcmpgt 0, X), the compare is dropped and X is
	///   used as the mask directly.
	/// - Truncating stores that the target reports as legal are left alone; the
	///   rest are rewritten as a shuffle that packs the narrow elements into the
	///   low lanes, followed by a masked store with a correspondingly widened mask.
	///
	/// Returns the replacement store, or an empty SDValue if nothing is changed.
	static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

	if (Mst->isCompressingStore())
	return SDValue();

	if (!Mst->isTruncatingStore()) {
	if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
	return ScalarStore;

	// If the mask is checking (0 > X), we're creating a vector with all-zeros
	// or all-ones elements based on the sign bits of X. AVX1 masked store only
	// cares about the sign bit of each mask element, so eliminate the compare:
	// mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
	// Note that by waiting to match an x86-specific PCMPGT node, we're
	// eliminating potentially more complex matching of a setcc node which has
	// a full range of predicates.
	SDValue Mask = Mst->getMask();
	if (Mask.getOpcode() == X86ISD::PCMPGT &&
	ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
	assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
	"Unexpected type for PCMPGT");
	return DAG.getMaskedStore(
	Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
	Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
	}

	// TODO: AVX512 targets should also be able to simplify something like the
	// pattern above, but that pattern will be different. It will either need to
	// match setcc more generally or match PCMPGTM later (in tablegen?).

	return SDValue();
	}

	// Resolve truncating stores.
	EVT VT = Mst->getValue().getValueType();
	unsigned NumElems = VT.getVectorNumElements();
	EVT StVT = Mst->getMemoryVT();
	SDLoc dl(Mst);

	assert(StVT != VT && "Cannot truncate to the same type");
	unsigned FromSz = VT.getScalarSizeInBits();
	unsigned ToSz = StVT.getScalarSizeInBits();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// The truncating store is legal in some cases. For example
	// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	// are designated for truncate store.
	// In this case we don't need any further transformations.
	if (TLI.isTruncStoreLegal(VT, StVT))
	return SDValue();

	// From/To sizes and ElemCount must be pow of two.
	assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	"Unexpected size for truncating masked store");
	// We are going to use the original vector elt for storing.
	// Accumulated smaller vector elements must be a multiple of the store size.
	assert (((NumElems * FromSz) % ToSz) == 0 &&
	"Unexpected ratio for truncating masked store");

	// Number of stored-size elements that fit in one source element.
	unsigned SizeRatio = FromSz / ToSz;
	assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle.
	// It has the stored element type and the same total width as VT.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	StVT.getScalarType(), NumElems*SizeRatio);

	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Reinterpret the value in the wide type and move narrow element
	// i * SizeRatio into lane i (the remaining lanes are undef).
	SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	"WideVecVT should be legal");

	SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	DAG.getUNDEF(WideVecVT),
	ShuffleVec);

	SDValue NewMask;
	SDValue Mask = Mst->getMask();
	if (Mask.getValueType() == VT) {
	// Mask and original value have the same type.
	// Apply the same lane compaction as for the value. Lanes at and beyond
	// NumElems use shuffle index NumElems * SizeRatio, i.e. element 0 of the
	// second shuffle operand, which is the all-zero vector.
	NewMask = DAG.getBitcast(WideVecVT, Mask);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;
	for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
	ShuffleVec[i] = NumElems*SizeRatio;
	NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	DAG.getConstant(0, dl, WideVecVT),
	ShuffleVec);
	} else {
	// Otherwise the mask is a vector of i1: widen it by concatenating the
	// original mask with zero vectors of the same type.
	assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	unsigned WidenNumElts = NumElems*SizeRatio;
	unsigned MaskNumElts = VT.getVectorNumElements();
	EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	WidenNumElts);

	unsigned NumConcat = WidenNumElts / MaskNumElts;
	SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
	Ops[0] = Mask;
	NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	}

	// Emit the masked store of the packed value with the widened mask.
	// NOTE(review): the trailing `false` is presumably the is-truncating flag of
	// SelectionDAG::getMaskedStore -- confirm against its declaration.
	return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
	Mst->getBasePtr(), NewMask, StVT,
	Mst->getMemOperand(), false);
	}

	/// Do target-specific DAG combines on a store node.
	///
	/// The following rewrites are attempted in order; the first one that applies
	/// wins:
	///  1. A 256-bit vector store that the target accepts but reports as not fast
	///     is split into two 128-bit stores.
	///  2. A vector truncating store is turned into a store of an AVG result, a
	///     saturating truncating store, or a shuffle followed by wide scalar
	///     stores.
	///  3. A 64-bit load feeding the store is re-typed (one i64/f64 pair, or two
	///     i32 pairs) so that MMX types / i64 values are not needed.
	///  4. An i64 extracted from a vector is stored as an f64 on 32-bit targets.
	///
	/// \param N         the store node being combined.
	/// \param DAG       the DAG in which replacement nodes are created.
	/// \param Subtarget the X86 subtarget queried for feature availability.
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  StoreSDNode *St = cast<StoreSDNode>(N);
	  EVT VT = St->getValue().getValueType();
	  EVT StVT = St->getMemoryVT();
	  SDLoc dl(St);
	  SDValue StoredVal = St->getOperand(1);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we are saving a concatenation of two XMM registers and 32-byte stores
	  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
	  bool Fast;
	  unsigned AddressSpace = St->getAddressSpace();
	  unsigned Alignment = St->getAlignment();
	  if (VT.is256BitVector() && StVT == VT &&
	      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	                             AddressSpace, Alignment, &Fast) &&
	      !Fast) {
	    unsigned NumElems = VT.getVectorNumElements();
	    if (NumElems < 2)
	      return SDValue();

	    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
	    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

	    SDValue Ptr0 = St->getBasePtr();
	    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

	    SDValue Ch0 =
	        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
	                     Alignment, St->getMemOperand()->getFlags());
	    SDValue Ch1 =
	        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
	                     St->getPointerInfo().getWithOffset(16),
	                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
	  }

	  // Optimize trunc store (of multiple scalars) to shuffle and store.
	  // First, pack all of the elements in one place. Next, store to memory
	  // in fewer chunks.
	  if (St->isTruncatingStore() && VT.isVector()) {
	    // Check if we can detect an AVG pattern from the truncation. If yes,
	    // replace the trunc store by a normal store with the result of X86ISD::AVG
	    // instruction.
	    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
	                                       Subtarget, dl))
	      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
	                          St->getPointerInfo(), St->getAlignment(),
	                          St->getMemOperand()->getFlags());

	    if (SDValue Val =
	        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
	      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
	                             dl, Val, St->getBasePtr(),
	                             St->getMemoryVT(), St->getMemOperand(), DAG);

	    unsigned NumElems = VT.getVectorNumElements();
	    assert(StVT != VT && "Cannot truncate to the same type");
	    unsigned FromSz = VT.getScalarSizeInBits();
	    unsigned ToSz = StVT.getScalarSizeInBits();

	    // The truncating store is legal in some cases. For example
	    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	    // are designated for truncate store.
	    // In this case we don't need any further transformations.
	    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
	      return SDValue();

	    // From, To sizes and ElemCount must be pow of two
	    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
	    // We are going to use the original vector elt for storing.
	    // Accumulated smaller vector elements must be a multiple of the store size.
	    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

	    unsigned SizeRatio = FromSz / ToSz;

	    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	    // Create a type on which we perform the shuffle
	    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	                                     StVT.getScalarType(), NumElems*SizeRatio);

	    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
	    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
	    for (unsigned i = 0; i != NumElems; ++i)
	      ShuffleVec[i] = i * SizeRatio;

	    // Can't shuffle using an illegal type.
	    if (!TLI.isTypeLegal(WideVecVT))
	      return SDValue();

	    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	                                         DAG.getUNDEF(WideVecVT),
	                                         ShuffleVec);
	    // At this point all of the data is stored at the bottom of the
	    // register. We now need to save it to mem.

	    // Find the largest store unit
	    MVT StoreType = MVT::i8;
	    for (MVT Tp : MVT::integer_valuetypes()) {
	      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
	        StoreType = Tp;
	    }

	    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
	        (64 <= NumElems * ToSz))
	      StoreType = MVT::f64;

	    // Bitcast the original vector into a vector of store-size units
	    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
	            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
	    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
	    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
	    SmallVector<SDValue, 8> Chains;
	    SDValue Ptr = St->getBasePtr();

	    // Perform one or more big stores into memory. Each chunk is written at
	    // its own byte offset from the base pointer, so the pointer info and the
	    // alignment recorded for the chunk must be adjusted by that offset.
	    const unsigned StoreBytes = StoreType.getStoreSize();
	    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
	      const unsigned Offset = i * StoreBytes;
	      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                                   StoreType, ShuffWide,
	                                   DAG.getIntPtrConstant(i, dl));
	      SDValue Ch =
	          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
	                       St->getPointerInfo().getWithOffset(Offset),
	                       MinAlign(St->getAlignment(), Offset),
	                       St->getMemOperand()->getFlags());
	      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreBytes, dl);
	      Chains.push_back(Ch);
	    }

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	  }

	  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
	  // the FP state in cases where an emms may be missing.
	  // A preferable solution to the general problem is to figure out the right
	  // places to insert EMMS. This qualifies as a quick hack.

	  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
	  if (VT.getSizeInBits() != 64)
	    return SDValue();

	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool F64IsLegal =
	      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
	  if ((VT.isVector() ||
	       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
	      isa<LoadSDNode>(St->getValue()) &&
	      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
	      St->getChain().hasOneUse() && !St->isVolatile()) {
	    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

	    if (!ISD::isNormalLoad(Ld))
	      return SDValue();

	    // If this is not the MMX case, i.e. we are just turning i64 load/store
	    // into f64 load/store, avoid the transformation if there are multiple
	    // uses of the loaded value.
	    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
	      return SDValue();

	    SDLoc LdDL(Ld);
	    SDLoc StDL(N);
	    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
	    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
	    // pair instead.
	    if (Subtarget.is64Bit() || F64IsLegal) {
	      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
	      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
	                                  Ld->getMemOperand());

	      // Make sure new load is placed in same chain order.
	      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
	                          St->getMemOperand());
	    }

	    // Otherwise, lower to two pairs of 32-bit loads / stores.
	    SDValue LoAddr = Ld->getBasePtr();
	    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

	    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
	                               Ld->getPointerInfo(), Ld->getAlignment(),
	                               Ld->getMemOperand()->getFlags());
	    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
	                               Ld->getPointerInfo().getWithOffset(4),
	                               MinAlign(Ld->getAlignment(), 4),
	                               Ld->getMemOperand()->getFlags());
	    // Make sure new loads are placed in same chain order.
	    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
	    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

	    LoAddr = St->getBasePtr();
	    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

	    SDValue LoSt =
	        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
	                     St->getAlignment(), St->getMemOperand()->getFlags());
	    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
	                                St->getPointerInfo().getWithOffset(4),
	                                MinAlign(St->getAlignment(), 4),
	                                St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
	  }

	  // This is similar to the above case, but here we handle a scalar 64-bit
	  // integer store that is extracted from a vector on a 32-bit target.
	  // If we have SSE2, then we can treat it like a floating-point double
	  // to get past legalization. The execution dependencies fixup pass will
	  // choose the optimal machine instruction for the store if this really is
	  // an integer or v2f32 rather than an f64.
	  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
	      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    SDValue OldExtract = St->getOperand(1);
	    SDValue ExtOp0 = OldExtract.getOperand(0);
	    unsigned VecSize = ExtOp0.getValueSizeInBits();
	    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
	    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
	    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                                     BitCast, OldExtract.getOperand(1));
	    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  return SDValue();
	}

	/// Return 'true' if this vector operation is "horizontal"
	/// and return the operands for the horizontal operation in LHS and RHS. A
	/// horizontal operation performs the binary operation on successive elements
	/// of its first operand, then on successive elements of its second operand,
	/// returning the resulting values in a vector. For example, if
	/// A = < float a0, float a1, float a2, float a3 >
	/// and
	/// B = < float b0, float b1, float b2, float b3 >
	/// then the result of doing a horizontal operation on A and B is
	/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
	/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
	/// A horizontal-op B, for some already available A and B, and if so then LHS is
	/// set to A, RHS to B, and the routine returns 'true'.
	/// Note that the binary operation should have the property that if one of the
	/// operands is UNDEF then the result is UNDEF.
	///
	/// \param LHS in: left operand of the binary op; out: A, on success.
	/// \param RHS in: right operand of the binary op; out: B, on success.
	/// \param IsCommutative if true, element pairs may also appear swapped
	///        between LHS and RHS.
	/// LHS and RHS are left unmodified when 'false' is returned.
	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
	  // Look for the following pattern: if
	  // A = < float a0, float a1, float a2, float a3 >
	  // B = < float b0, float b1, float b2, float b3 >
	  // and
	  // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
	  // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
	  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
	  // which is A horizontal-op B.

	  // At least one of the operands should be a vector shuffle.
	  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
	      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  MVT VT = LHS.getSimpleValueType();

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Unsupported vector type for horizontal add/sub");

	  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
	  // operate independently on 128-bit lanes.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumLanes = VT.getSizeInBits()/128;
	  unsigned NumLaneElts = NumElts / NumLanes;
	  assert((NumLaneElts % 2 == 0) &&
	         "Vector type should have an even number of elements in each lane");
	  unsigned HalfLaneElts = NumLaneElts/2;

	  // View LHS in the form
	  // LHS = VECTOR_SHUFFLE A, B, LMask
	  // If LHS is not a shuffle then pretend it is the shuffle
	  // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
	  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
	  // type VT.
	  SDValue A, B;
	  SmallVector<int, 16> LMask(NumElts);
	  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    if (!LHS.getOperand(0).isUndef())
	      A = LHS.getOperand(0);
	    if (!LHS.getOperand(1).isUndef())
	      B = LHS.getOperand(1);
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
	    std::copy(Mask.begin(), Mask.end(), LMask.begin());
	  } else {
	    if (!LHS.isUndef())
	      A = LHS;
	    for (unsigned i = 0; i != NumElts; ++i)
	      LMask[i] = i;
	  }

	  // Likewise, view RHS in the form
	  // RHS = VECTOR_SHUFFLE C, D, RMask
	  SDValue C, D;
	  SmallVector<int, 16> RMask(NumElts);
	  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    if (!RHS.getOperand(0).isUndef())
	      C = RHS.getOperand(0);
	    if (!RHS.getOperand(1).isUndef())
	      D = RHS.getOperand(1);
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
	    std::copy(Mask.begin(), Mask.end(), RMask.begin());
	  } else {
	    if (!RHS.isUndef())
	      C = RHS;
	    for (unsigned i = 0; i != NumElts; ++i)
	      RMask[i] = i;
	  }

	  // Check that the shuffles are both shuffling the same vectors.
	  if (!(A == C && B == D) && !(A == D && B == C))
	    return false;

	  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
	  if (!A.getNode() && !B.getNode())
	    return false;

	  // If A and B occur in reverse order in RHS, then "swap" them (which means
	  // rewriting the mask).
	  if (A != C)
	    ShuffleVectorSDNode::commuteMask(RMask);

	  // At this point LHS and RHS are equivalent to
	  // LHS = VECTOR_SHUFFLE A, B, LMask
	  // RHS = VECTOR_SHUFFLE A, B, RMask
	  // Check that the masks correspond to performing a horizontal operation.
	  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
	    for (unsigned i = 0; i != NumLaneElts; ++i) {
	      int LIdx = LMask[i+l], RIdx = RMask[i+l];

	      // Ignore any UNDEF components.
	      if (LIdx < 0 || RIdx < 0 ||
	          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
	          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
	        continue;

	      // Check that successive elements are being operated on. If not, this is
	      // not a horizontal operation.
	      // The first half of each result lane reads from A (Src == 0) and the
	      // second half from B (Src == 1); B's elements are numbered from NumElts
	      // in the combined shuffle index space.
	      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
	      int Index = 2 * (i % HalfLaneElts) + NumElts * Src + l;
	      if (!(LIdx == Index && RIdx == Index + 1) &&
	          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
	        return false;
	    }
	  }

	  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
	  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
	  return true;
	}

	/// Do target-specific dag combines on floating-point adds/subs.
	///
	/// Replaces an FADD/FSUB whose operands form a horizontal pattern with
	/// X86ISD::FHADD / X86ISD::FHSUB. This is attempted for v4f32/v2f64 when SSE3
	/// is available and for v8f32/v4f64 when hasFp256() is true.
	/// \returns the new node, or an empty SDValue if no combine applies.
	static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  bool IsFadd = N->getOpcode() == ISD::FADD;
	  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

	  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
	  // Only the add form is passed as commutative to isHorizontalBinOp.
	  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
	       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
	      isHorizontalBinOp(LHS, RHS, IsFadd)) {
	    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
	    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	  }
	  return SDValue();
	}

	/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
	/// the codegen.
	/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
	///
	/// \param N  the ISD::TRUNCATE node.
	/// \param DL debug location used for the newly created nodes.
	/// \returns the narrowed binary op, or an empty SDValue if the source has
	///          other users, the result is not a vector, or the opcode-specific
	///          checks below do not pass.
	static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget,
	                                          SDLoc &DL) {
	  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
	  SDValue Src = N->getOperand(0);
	  unsigned Opcode = Src.getOpcode();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // True when truncating both operands costs at most one real truncation.
	  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
	    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

	    // Repeated operand, so we are only trading one output truncation for
	    // one input truncation.
	    if (Op0 == Op1)
	      return true;

	    // See if either operand has been extended from a smaller/equal size to
	    // the truncation size, allowing a truncation to combine with the extend.
	    unsigned Opcode0 = Op0.getOpcode();
	    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
	         Opcode0 == ISD::ZERO_EXTEND) &&
	        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
	      return true;

	    unsigned Opcode1 = Op1.getOpcode();
	    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
	         Opcode1 == ISD::ZERO_EXTEND) &&
	        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
	      return true;

	    // See if either operand is a single use constant which can be constant
	    // folded.
	    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
	    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
	    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
	           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
	  };

	  // Builds Opcode(TRUNC(N0), TRUNC(N1)) in the result type.
	  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
	    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
	    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
	  };

	  // Don't combine if the operation has other uses.
	  if (!N->isOnlyUserOf(Src.getNode()))
	    return SDValue();

	  // Only support vector truncation for now.
	  // TODO: i64 scalar math would benefit as well.
	  if (!VT.isVector())
	    return SDValue();

	  // In most cases its only worth pre-truncating if we're only facing the cost
	  // of one truncation.
	  // i.e. if one of the inputs will constant fold or the input is repeated.
	  switch (Opcode) {
	  case ISD::AND:
	  case ISD::XOR:
	  case ISD::OR: {
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }

	  case ISD::MUL:
	    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
	    // better to truncate if we have the chance.
	    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
	        !Subtarget.hasDQI())
	      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
	    LLVM_FALLTHROUGH;
	  case ISD::ADD: {
	    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegal(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }
	  }

	  return SDValue();
	}

	/// Truncate a group of v4i32 (or v2i64) registers into v16i8/v8i16 using
	/// X86ISD::PACKUS.
	///
	/// Each register is first masked so that only the low bits that survive the
	/// truncation remain set; the registers are then repeatedly bitcast and
	/// packed pairwise, halving the element width on every round.
	///
	/// \param N    the truncate node; supplies the result type and debug location.
	/// \param Regs the 128-bit input pieces; modified in place.
	/// \returns the truncated value of type N->getValueType(0).
	static SDValue
	combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
	                             Regs[0].getValueType() == MVT::v2i64));
	  EVT OutVT = N->getValueType(0);
	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InVT = Regs[0].getValueType();
	  EVT InSVT = InVT.getVectorElementType();
	  SDLoc DL(N);

	  // First, use mask to unset all bits that won't appear in the result.
	  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
	         "OutSVT can only be either i8 or i16.");
	  APInt Mask =
	      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
	  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
	  for (auto &Reg : Regs)
	    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

	  // Types used as PACKUS input and result for the chosen output element.
	  MVT UnpackedVT, PackedVT;
	  if (OutSVT == MVT::i8) {
	    UnpackedVT = MVT::v8i16;
	    PackedVT = MVT::v16i8;
	  } else {
	    UnpackedVT = MVT::v4i32;
	    PackedVT = MVT::v8i16;
	  }

	  // In each iteration, truncate the type by a half size.
	  auto RegNum = Regs.size();
	  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
	       j < e; j *= 2, RegNum /= 2) {
	    for (unsigned i = 0; i < RegNum; i++)
	      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
	    for (unsigned i = 0; i < RegNum / 2; i++)
	      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
	                            Regs[i * 2 + 1]);
	  }

	  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
	  // then extract a subvector as the result since v8i8 is not a legal type.
	  if (OutVT == MVT::v8i8) {
	    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
	    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
	                          DAG.getIntPtrConstant(0, DL));
	    return Regs[0];
	  } else if (RegNum > 1) {
	    Regs.resize(RegNum);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
	  } else
	    return Regs[0];
	}

	/// Truncate a group of v4i32 registers into v8i16 using X86ISD::PACKSS.
	///
	/// Every register in \p Regs is shifted left by 16 and then arithmetically
	/// shifted right by 16 before adjacent pairs are packed. \p Regs is modified
	/// in place.
	///
	/// \returns the single packed register when at most two inputs were given,
	///          otherwise a CONCAT_VECTORS of the packed registers.
	static SDValue
	combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
	  EVT ResultVT = N->getValueType(0);
	  SDLoc DL(N);

	  // Replicate bit 15 of every 32-bit element into its upper half:
	  // shl by 16 followed by sra by 16.
	  SDValue SixteenBits = DAG.getConstant(16, DL, MVT::i32);
	  for (SDValue &Piece : Regs) {
	    SDValue Shifted = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Piece,
	                                          SixteenBits, Subtarget, DAG);
	    Piece = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Shifted,
	                                SixteenBits, Subtarget, DAG);
	  }

	  // Pack neighbouring registers, writing the results to the front of Regs.
	  const unsigned NumPairs = Regs.size() / 2;
	  for (unsigned Pair = 0; Pair != NumPairs; ++Pair) {
	    SDValue Lo = Regs[2 * Pair];
	    SDValue Hi = Regs[2 * Pair + 1];
	    Regs[Pair] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Lo, Hi);
	  }

	  // One packed register: hand it back directly.
	  if (Regs.size() <= 2)
	    return Regs[0];

	  // Several packed registers: drop the stale tail and concatenate.
	  Regs.resize(Regs.size() / 2);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResultVT, Regs);
	}

	/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
	/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
	/// legalization the truncation will be translated into a BUILD_VECTOR with each
	/// element that is extracted from a vector and then truncated, and it is
	/// difficult to do this optimization based on them.
	///
	/// Only applies with SSE2 and without AVX2, and only for a power-of-two
	/// element count of at least 8.
	/// \returns the packed result, or an empty SDValue if the truncation is left
	///          alone.
	static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  EVT OutVT = N->getValueType(0);
	  if (!OutVT.isVector())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  if (!In.getValueType().isSimple())
	    return SDValue();

	  EVT InVT = In.getValueType();
	  unsigned NumElems = OutVT.getVectorNumElements();

	  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
	  // SSE2, and we need to take care of it specially.
	  // AVX512 provides vpmovdb.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
	    return SDValue();

	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InSVT = InVT.getVectorElementType();
	  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
	        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
	        NumElems >= 8))
	    return SDValue();

	  // SSSE3's pshufb results in less instructions in the cases below.
	  if (Subtarget.hasSSSE3() && NumElems == 8 &&
	      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
	       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
	    return SDValue();

	  SDLoc DL(N);

	  // Split a long vector into vectors of legal type.
	  unsigned RegNum = InVT.getSizeInBits() / 128;
	  SmallVector<SDValue, 8> SubVec(RegNum);
	  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
	  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

	  for (unsigned i = 0; i < RegNum; i++)
	    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
	                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

	  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
	  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
	  // truncate 2 x v4i32 to v8i16.
	  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
	    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
	  else if (InSVT == MVT::i32)
	    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
	  else
	    return SDValue();
	}

	/// This function transforms vector truncation of 'extended sign-bits' or
	/// 'extended zero-bits' values.
	/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
	///
	/// \param N  the truncate node.
	/// \param DL debug location for the new nodes.
	/// \returns the PACKSS/PACKUS based truncation, or an empty SDValue when the
	///          subtarget, the types, or the known sign/zero bits do not allow it.
	static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  if (!In.getValueType().isSimple())
	    return SDValue();

	  MVT VT = N->getValueType(0).getSimpleVT();
	  MVT SVT = VT.getScalarType();

	  MVT InVT = In.getValueType().getSimpleVT();
	  MVT InSVT = InVT.getScalarType();

	  // Check we have a truncation suited for PACKSS.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return SDValue();
	  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
	    return SDValue();
	  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
	    return SDValue();

	  // Use PACKSS if the input has sign-bits that extend all the way to the
	  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
	  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
	  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
	  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
	    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

	  // Use PACKUS if the input has zero-bits that extend all the way to the
	  // packed/truncated value. e.g. masks, zext_in_reg, etc.
	  KnownBits Known;
	  DAG.computeKnownBits(In, Known);
	  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
	  // Without SSE4.1 only 8 packed bits are assumed for PACKUS.
	  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
	  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
	    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

	  return SDValue();
	}

	/// Do target-specific dag combines on ISD::TRUNCATE nodes.
	///
	/// Runs a fixed sequence of more specific combines and returns the result of
	/// the first one that produces a value; the generic PACKUS/PACKSS vector
	/// truncation is the last resort.
	static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // 1. Push the truncation through an arithmetic op onto its inputs.
	  if (SDValue PreTrunc = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
	    return PreTrunc;

	  // 2. Recognise an averaging pattern.
	  if (SDValue AvgNode = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
	    return AvgNode;

	  // 3. Fold the truncation together with an unsigned saturation.
	  if (SDValue USat = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
	    return USat;

	  // 4. i32 truncation of a bitcast whose source is directly an x86mmx value.
	  const bool IsI32TruncOfBitcast =
	      Src.getOpcode() == ISD::BITCAST && VT == MVT::i32;
	  if (IsI32TruncOfBitcast) {
	    SDValue MMXVal = Src.getOperand(0);
	    if (MMXVal.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, MMXVal);
	  }

	  // 5. Truncate values with extended sign/zero bits via PACKSS/PACKUS.
	  if (SDValue Packed = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
	    return Packed;

	  // 6. Generic vector truncation.
	  return combineVectorTruncation(N, DAG, Subtarget);
	}

	/// Returns the negated value if the node \p N flips sign of FP value.
	///
	/// Two forms are recognised: a plain FNEG(x), and an (F)XOR of x with a
	/// constant that has only the sign bit set. For the second form bitcasts
	/// around the XOR and around both of its operands are looked through, and the
	/// constant may be a broadcast, a BUILD_VECTOR splat or a target constant.
	///
	/// \returns the value being negated, or an empty SDValue if \p N is not a
	///          recognised negation.
	static SDValue isFNEG(SDNode *N) {
	  if (N->getOpcode() == ISD::FNEG)
	    return N->getOperand(0);

	  SDValue Xor = peekThroughBitcasts(SDValue(N, 0));
	  const unsigned XorOpc = Xor.getOpcode();
	  if (XorOpc != X86ISD::FXOR && XorOpc != ISD::XOR)
	    return SDValue();

	  SDValue MaskOp = peekThroughBitcasts(Xor.getOperand(1));
	  if (!MaskOp.getValueType().isFloatingPoint())
	    return SDValue();

	  SDValue Negated = peekThroughBitcasts(Xor.getOperand(0));
	  unsigned EltBits = MaskOp.getScalarValueSizeInBits();

	  // There is more than one way to represent the same constant on
	  // the different X86 targets. The type of the node may also depend on size.
	  // - load scalar value and broadcast
	  // - BUILD_VECTOR node
	  // - load from a constant pool.
	  // Find the FP constant (if any) behind whichever form is present.
	  const ConstantFP *MaskC = nullptr;
	  if (MaskOp.getOpcode() == X86ISD::VBROADCAST) {
	    if (auto *C = getTargetConstantFromNode(MaskOp.getOperand(0)))
	      MaskC = cast<ConstantFP>(C);
	  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(MaskOp)) {
	    if (ConstantFPSDNode *Splat = BV->getConstantFPSplatNode())
	      MaskC = Splat->getConstantFPValue();
	  } else if (auto *C = getTargetConstantFromNode(MaskOp)) {
	    if (C->getType()->isVectorTy()) {
	      if (auto *SplatV = C->getSplatValue())
	        MaskC = cast<ConstantFP>(SplatV);
	    } else {
	      MaskC = dyn_cast<ConstantFP>(C);
	    }
	  }

	  // No constant found: this is not a sign flip.
	  if (!MaskC)
	    return SDValue();

	  // The XOR is a negation only if the constant is exactly the sign mask.
	  if (MaskC->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits))
	    return Negated;
	  return SDValue();
	}

	/// Do target-specific dag combines on floating point negations.
	///
	/// \p N must be a node for which isFNEG() returns a value (asserted). The
	/// negation is folded into an FMA-family node where possible; the result is
	/// bitcast back to the original type of \p N.
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  EVT OrigVT = N->getValueType(0);
	  SDValue Arg = isFNEG(N);
	  assert(Arg.getNode() && "N is expected to be an FNEG node");

	  EVT VT = Arg.getValueType();
	  EVT SVT = VT.getScalarType();
	  SDLoc DL(N);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // If we're negating a FMUL node on a target with FMA, then we can avoid the
	  // use of a constant by performing (-0 - A*B) instead.
	  // FIXME: Check rounding control flags as well once it becomes available.
	  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
	      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
	    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
	                                  Arg.getOperand(1), Zero);
	    return DAG.getBitcast(OrigVT, NewNode);
	  }

	  // If we're negating an FMA node, then we can adjust the
	  // instruction to include the extra negation.
	  unsigned NewOpcode = 0;
	  if (Arg.hasOneUse()) {
	    switch (Arg.getOpcode()) {
	    case ISD::FMA:             NewOpcode = X86ISD::FNMSUB;     break;
	    case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;     break;
	    case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;      break;
	    case X86ISD::FNMSUB:       NewOpcode = ISD::FMA;           break;
	    case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND; break;
	    case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND; break;
	    case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;  break;
	    case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;  break;
	    // We can't handle scalar intrinsic node here because it would only
	    // invert one element and not the whole vector. But we could try to handle
	    // a negation of the lower element only.
	    }
	  }
	  // Rebuild the node with the sign-adjusted opcode and the same operands.
	  if (NewOpcode)
	    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
	                                              Arg.getNode()->ops()));

	  return SDValue();
	}

	/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the matching integer
	/// logic op on a vector of i64 with the same total width, bitcasting the
	/// operands in and the result back out.
	///
	/// \returns the rewritten value, or an empty SDValue for non-vector types or
	///          when SSE2 is not available.
	static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);

	  // The integer opcodes are only used when integer vector types are available.
	  if (!VT.isVector() || !Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc dl(N);
	  const unsigned NumI64Elts = VT.getSizeInBits() / 64;
	  MVT IntVT = MVT::getVectorVT(MVT::i64, NumI64Elts);

	  SDValue IntLHS = DAG.getBitcast(IntVT, N->getOperand(0));
	  SDValue IntRHS = DAG.getBitcast(IntVT, N->getOperand(1));

	  // Map the FP logic opcode onto its integer counterpart.
	  unsigned IntOpcode;
	  switch (N->getOpcode()) {
	  case X86ISD::FAND:
	    IntOpcode = ISD::AND;
	    break;
	  case X86ISD::FANDN:
	    IntOpcode = X86ISD::ANDNP;
	    break;
	  case X86ISD::FOR:
	    IntOpcode = ISD::OR;
	    break;
	  case X86ISD::FXOR:
	    IntOpcode = ISD::XOR;
	    break;
	  default:
	    llvm_unreachable("Unexpected FP logic op");
	  }

	  SDValue IntResult = DAG.getNode(IntOpcode, dl, IntVT, IntLHS, IntRHS);
	  return DAG.getBitcast(VT, IntResult);
	}


	/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
	///
	/// \returns the new SETCC with the opposite condition code, or an empty
	///          SDValue if \p N is not an XOR of an X86ISD::SETCC with constant 1.
	static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() != ISD::XOR)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
	    return SDValue();

	  // Operand 0 of the SETCC is the condition code; operand 1 is passed through
	  // unchanged to the new SETCC.
	  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
	      X86::CondCode(LHS->getConstantOperandVal(0)));
	  SDLoc DL(N);
	  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
	}

	/// Do target-specific dag combines on ISD::XOR nodes.
	///
	/// On an SSE1-only subtarget a v4i32 XOR is rewritten as X86ISD::FXOR on
	/// v4f32. Otherwise a series of folds is tried; all but the first of them are
	/// skipped while DCI.isBeforeLegalizeOps() is true.
	static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  // If this is SSE1 only convert to FXOR to avoid scalarization.
	  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (IsSSE1Only && N->getValueType(0) == MVT::v4i32) {
	    SDValue FpLHS = DAG.getBitcast(MVT::v4f32, N->getOperand(0));
	    SDValue FpRHS = DAG.getBitcast(MVT::v4f32, N->getOperand(1));
	    SDValue FpXor =
	        DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, FpLHS, FpRHS);
	    return DAG.getBitcast(MVT::v4i32, FpXor);
	  }

	  // This fold is tried regardless of the legalization phase.
	  if (SDValue ShiftCmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
	    return ShiftCmp;

	  // Everything below is only attempted once operations have been legalized.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  if (SDValue Inverted = foldXor1SetCC(N, DAG))
	    return Inverted;

	  if (SDValue TruncCmp = foldXorTruncShiftIntoCmp(N, DAG))
	    return TruncCmp;

	  if (SDValue AsFP = convertIntLogicToFPLogic(N, DAG, Subtarget))
	    return AsFP;

	  // Finally, treat an XOR that is really an FP negation as one.
	  return isFNEG(N) ? combineFneg(N, DAG, Subtarget) : SDValue();
	}


	/// Return true if \p V is a null FP constant (per isNullFPConstant) or an
	/// all-zeros build vector (per ISD::isBuildVectorAllZeros).
	static bool isNullFPScalarOrVectorConst(SDValue V) {
	  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
	}

	/// Given a scalar FP zero or a vector FP zero (which may have undefined
	/// elements), produce a zero constant that can stand in for it when folding.
	/// For vectors a fresh zero vector is built, so the result carries no
	/// undefined elements even when \p V does; this keeps it safe to use as a
	/// replacement operand for operations (eg, bitwise-and) through which an
	/// undef must not propagate.
	///
	/// \returns the zero constant, or an empty SDValue if \p V is not a zero.
	static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  const bool IsZero = isNullFPScalarOrVectorConst(V);
	  if (!IsZero)
	    return SDValue();

	  // A scalar zero can be reused as is; a vector zero is rebuilt.
	  const bool IsVectorZero = V.getValueType().isVector();
	  return IsVectorZero
	             ? getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V))
	             : V;
	}

	/// Fold an FAND whose operand is an FXOR with an all-ones constant into
	/// X86ISD::FANDN.
	///
	/// Only handles f32 (SSE1), f64 (SSE2) and v4f32 on SSE1-only subtargets.
	/// \returns the FANDN node, or an empty SDValue if the pattern is not found.
	static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
	  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
	        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
	        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
	    return SDValue();

	  // Matches an all-ones build vector, or a scalar FP constant whose value is
	  // all ones.
	  auto isAllOnesConstantFP = [](SDValue V) {
	    if (V.getSimpleValueType().isVector())
	      return ISD::isBuildVectorAllOnes(V.getNode());
	    auto *C = dyn_cast<ConstantFPSDNode>(V);
	    return C && C->getConstantFPValue()->isAllOnesValue();
	  };

	  // fand (fxor X, -1), Y --> fandn X, Y
	  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

	  // fand X, (fxor Y, -1) --> fandn Y, X
	  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

	  return SDValue();
	}

	/// Do target-specific dag combines on X86ISD::FAND nodes.
	///
	/// Tries, in order: folding against a zero operand, the fand/fxor -> fandn
	/// fold, and finally lowerX86FPLogicOp().
	static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
	    SDValue Zero =
	        getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget);
	    if (Zero)
	      return Zero;
	  }

	  SDValue AndNot = combineFAndFNotToFAndn(N, DAG, Subtarget);
	  if (AndNot)
	    return AndNot;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FANDN nodes.
	///
	/// Folds away a zero operand on either side; otherwise defers to
	/// lowerX86FPLogicOp().
	static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // FANDN(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(Op0))
	    return Op1;

	  // FANDN(x, 0.0) -> 0.0
	  SDValue Zero = getNullFPConstForNullVal(Op1, DAG, Subtarget);
	  if (Zero)
	    return Zero;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
	///
	/// A zero operand is dropped, a node recognized by isFNEG() is offered to
	/// combineFneg(), and anything left goes to lowerX86FPLogicOp().
	static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // F[X]OR(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(Op0))
	    return Op1;

	  // F[X]OR(x, 0.0) -> x
	  if (isNullFPScalarOrVectorConst(Op1))
	    return Op0;

	  if (isFNEG(N)) {
	    SDValue NewVal = combineFneg(N, DAG, Subtarget);
	    if (NewVal)
	      return NewVal;
	  }

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
	///
	/// When the target options enable UnsafeFPMath, the node is rebuilt with the
	/// commutative opcode (FMINC / FMAXC) and the same operands. Otherwise an
	/// empty SDValue is returned.
	static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  assert(Opcode == X86ISD::FMIN || Opcode == X86ISD::FMAX);

	  // Only perform optimizations if UnsafeMath is used.
	  if (!DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  // In unsafe-math mode FMIN/FMAX become FMINC/FMAXC, which are commutative.
	  unsigned CommutativeOpc = 0;
	  switch (Opcode) {
	  case X86ISD::FMIN:
	    CommutativeOpc = X86ISD::FMINC;
	    break;
	  case X86ISD::FMAX:
	    CommutativeOpc = X86ISD::FMAXC;
	    break;
	  default:
	    llvm_unreachable("unknown opcode");
	  }

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(CommutativeOpc, SDLoc(N), N->getValueType(0), LHS, RHS);
	}

	/// Expand ISD::FMAXNUM (or the corresponding min node) into an
	/// X86ISD::FMAX / X86ISD::FMIN node plus a NaN test and a select.
	///
	/// Returns an empty SDValue for soft-float subtargets, for types not covered
	/// by the available SSE/AVX level, and for scalars in functions optimized
	/// for minimum size.
	static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (Subtarget.useSoftFloat())
	    return SDValue();

	  // TODO: Check for global or instruction-level "nnan". In that case, we
	  //       should be able to lower to FMAX/FMIN alone.
	  // TODO: If an operand is already known to be a NaN or not a NaN, this
	  //       should be an optional swap and FMAX/FMIN.

	  EVT VT = N->getValueType(0);
	  bool IsSSE1Type = VT == MVT::f32 || VT == MVT::v4f32;
	  bool IsSSE2Type = VT == MVT::f64 || VT == MVT::v2f64;
	  bool IsAVXType = VT == MVT::v8f32 || VT == MVT::v4f64;
	  bool IsSupported = (Subtarget.hasSSE1() && IsSSE1Type) ||
	                     (Subtarget.hasSSE2() && IsSSE2Type) ||
	                     (Subtarget.hasAVX() && IsAVXType);
	  if (!IsSupported)
	    return SDValue();

	  // This takes at least 3 instructions, so favor a library call when operating
	  // on a scalar and minimizing code size.
	  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SDLoc DL(N);
	  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
	      DAG.getDataLayout(), *DAG.getContext(), VT);

	  // Required results for the four NaN/number input combinations:
	  //
	  //                  Op1
	  //              Num     NaN
	  //           -----------------
	  //       Num |  Max  |  Op0  |
	  //  Op0      -----------------
	  //       NaN |  Op1  |  NaN  |
	  //           -----------------
	  //
	  // The SSE FP max/min instructions were not designed for this case, but
	  // rather to implement:
	  //   Min = Op1 < Op0 ? Op1 : Op0
	  //   Max = Op1 > Op0 ? Op1 : Op0
	  //
	  // So they always return Op0 if either input is a NaN. However, we can still
	  // use those instructions for fmaxnum by selecting away a NaN input.

	  // Operands are passed as (Op1, Op0): if either one is NaN, the 2nd source
	  // operand (Op0) is passed through.
	  unsigned X86Opc = X86ISD::FMIN;
	  if (N->getOpcode() == ISD::FMAXNUM)
	    X86Opc = X86ISD::FMAX;
	  SDValue MinOrMax = DAG.getNode(X86Opc, DL, VT, Op1, Op0);

	  // An unordered self-compare is true exactly when Op0 is a NaN.
	  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

	  // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both
	  // operands are NaN, the NaN value of Op1 is the result.
	  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
	}

	/// Do target-specific dag combines on X86ISD::ANDNP nodes.
	///
	/// Folds an all-zeros build-vector operand on either side. Otherwise, for
	/// vector types whose element size is a multiple of 8 bits, tries to fold
	/// the node through combineX86ShufflesRecursively(). On that path the
	/// replacement is installed with DCI.CombineTo() and an empty SDValue is
	/// returned.
	static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  // ANDNP(0, x) -> x
	  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
	    return N->getOperand(1);

	  // ANDNP(x, 0) -> 0
	  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
	    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

	  EVT VT = N->getValueType(0);

	  // Attempt to recursively combine a bitmask ANDNP with shuffles.
	  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	    SDValue Op(N, 0);
	    if (SDValue Res = combineX86ShufflesRecursively(
	            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
	      // The replacement has already been installed via CombineTo, so
	      // there is nothing further to hand back to the caller.
	      DCI.CombineTo(N, Res);
	      return SDValue();
	    }
	  }

	  return SDValue();
	}

	/// Simplify the bit-index operand of an X86ISD::BT node.
	///
	/// Only the low Log2_32(width) bits of the index operand are demanded; if
	/// GetDemandedBits() finds a simpler value for those bits, the BT node is
	/// rebuilt with it. Otherwise an empty SDValue is returned.
	static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
	                         TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Src = N->getOperand(0);
	  SDValue BitNo = N->getOperand(1);

	  // BT ignores high bits in the bit index operand.
	  unsigned IdxBits = BitNo.getValueSizeInBits();
	  APInt LowBits = APInt::getLowBitsSet(IdxBits, Log2_32(IdxBits));

	  SDValue SimplifiedBitNo = DAG.GetDemandedBits(BitNo, LowBits);
	  if (!SimplifiedBitNo)
	    return SDValue();

	  return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, Src, SimplifiedBitNo);
	}

	/// Combine a vector SIGN_EXTEND_INREG producing v4i64.
	///
	/// Rewrites
	///   (sext_in_reg (v4i64 anyext|sext (v4i32 x)), ExtraVT)
	/// into
	///   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
	/// and returns an empty SDValue in every other case.
	static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  SDValue VTOp = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(VTOp)->getVT();

	  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
	  // AVX2 since there is no sign-extended shift right operation on a vector
	  // with 64-bit elements.
	  unsigned SrcOpc = Src.getOpcode();
	  if (VT != MVT::v4i64 ||
	      (SrcOpc != ISD::ANY_EXTEND && SrcOpc != ISD::SIGN_EXTEND))
	    return SDValue();

	  SDValue Inner = Src.getOperand(0);

	  // EXTLOAD has a better solution on AVX2,
	  // it may be replaced with X86ISD::VSEXT node.
	  if (Inner.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
	      !ISD::isNormalLoad(Inner.getNode()))
	    return SDValue();

	  if (Inner.getValueType() != MVT::v4i32 || ExtraVT.getSizeInBits() >= 128)
	    return SDValue();

	  // Do the in-register extension on the narrow vector, then widen it.
	  SDLoc dl(N);
	  SDValue Narrow =
	      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Inner, VTOp);
	  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Narrow);
	}

	/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
	/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
	///
	/// Hoisting a sign/zero extension above a no-overflow 'add' exposes
	/// opportunities to combine math ops, use an LEA, or use a complex addressing
	/// mode. This can eliminate extend, add, and shift instructions.
	///
	/// Only applies to i64 results, to an 'add' with a constant second operand,
	/// and when some user of the extend is itself an 'add' or 'shl'.
	static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  unsigned ExtOpc = Ext->getOpcode();
	  bool Sext = ExtOpc == ISD::SIGN_EXTEND;
	  if (!Sext && ExtOpc != ISD::ZERO_EXTEND)
	    return SDValue();

	  // TODO: This should be valid for other integer types.
	  EVT VT = Ext->getValueType(0);
	  if (VT != MVT::i64)
	    return SDValue();

	  SDValue Add = Ext->getOperand(0);
	  if (Add.getOpcode() != ISD::ADD)
	    return SDValue();

	  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
	  // into the 'zext'.
	  bool NSW = Add->getFlags().hasNoSignedWrap();
	  bool NUW = Add->getFlags().hasNoUnsignedWrap();
	  bool HasRequiredFlag = Sext ? NSW : NUW;
	  if (!HasRequiredFlag)
	    return SDValue();

	  // Having a constant operand to the 'add' ensures that we are not increasing
	  // the instruction count because the constant is extended for free below.
	  // A constant operand can also become the displacement field of an LEA.
	  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
	  if (!AddOp1)
	    return SDValue();

	  // Don't make the 'add' bigger if there's no hope of combining it with some
	  // other 'add' or 'shl' instruction.
	  // TODO: It may be profitable to generate simpler LEA instructions in place
	  // of single 'add' instructions, but the cost model for selecting an LEA
	  // currently has a high threshold.
	  bool HasLEAPotential = false;
	  for (auto *User : Ext->uses()) {
	    unsigned UserOpc = User->getOpcode();
	    if (UserOpc != ISD::ADD && UserOpc != ISD::SHL)
	      continue;
	    HasLEAPotential = true;
	    break;
	  }
	  if (!HasLEAPotential)
	    return SDValue();

	  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
	  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
	  SDValue AddOp0 = Add.getOperand(0);
	  SDValue NewExt = DAG.getNode(ExtOpc, SDLoc(Ext), VT, AddOp0);
	  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

	  // The wider add cannot wrap either, because both operands were extended in
	  // the way that matches the no-wrap flag checked above; carry the original
	  // flags over to it.
	  SDNodeFlags Flags;
	  Flags.setNoSignedWrap(NSW);
	  Flags.setNoUnsignedWrap(NUW);
	  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
	}

	/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
	/// (i8,i32 ({s/u}divrem_{s/z}ext_hreg (i8 x, i8 y)))
	/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
	/// extends from AH (which we otherwise need to do contortions to access).
	///
	/// Matches only an extend (to i32 or i64) of result #1 of an i8 divrem whose
	/// signedness agrees with the extend. Result #0 of the old divrem is
	/// redirected to the new node as a side effect.
	static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
	  SDValue DivRem = N->getOperand(0);
	  unsigned ExtOpc = N->getOpcode();
	  unsigned DivRemOpc = DivRem.getOpcode();

	  bool IsSigned = ExtOpc == ISD::SIGN_EXTEND && DivRemOpc == ISD::SDIVREM;
	  bool IsUnsigned = ExtOpc == ISD::ZERO_EXTEND && DivRemOpc == ISD::UDIVREM;
	  if (!IsSigned && !IsUnsigned)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (DivRem.getResNo() != 1 || DivRem.getValueType() != MVT::i8)
	    return SDValue();
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  // The replacement node yields an i8 value and an i32 value.
	  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
	  unsigned HRegOpc =
	      IsSigned ? X86ISD::SDIVREM8_SEXT_HREG : X86ISD::UDIVREM8_ZEXT_HREG;
	  SDValue R = DAG.getNode(HRegOpc, SDLoc(N), NodeTys, DivRem.getOperand(0),
	                          DivRem.getOperand(1));

	  // Redirect users of the old node's first result to the new node.
	  DAG.ReplaceAllUsesOfValueWith(DivRem.getValue(0), R.getValue(0));

	  // If this was a 64-bit extend, complete it.
	  if (VT == MVT::i64)
	    return DAG.getNode(ExtOpc, SDLoc(N), VT, R.getValue(1));
	  return R.getValue(1);
	}

	// When an {ANY,SIGN,ZERO}_EXTEND is applied to a CMOV whose two value
	// operands are constants and the CMOV has no other use, promote the CMOV
	// itself instead of extending its result. This could be beneficial, because:
	// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
	//    (or more) pseudo-CMOVs only when they go one-after-another and
	//    getting rid of result extension code after CMOV will help that.
	// 2) Promotion of constant CMOV arguments is free, hence the
	//    {ANY,SIGN,ZERO}_EXTEND will just be deleted.
	// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
	//    promotion is also good in terms of code-size.
	//    (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
	//    promotion).
	static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
	  SDValue CMovN = Extend->getOperand(0);
	  if (CMovN.getOpcode() != X86ISD::CMOV)
	    return SDValue();

	  // Only an i16 CMOV widened to i32 or i64 is promoted.
	  EVT TargetVT = Extend->getValueType(0);
	  if (CMovN.getValueType() != MVT::i16)
	    return SDValue();
	  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
	    return SDValue();

	  // The CMOV must feed only this extend.
	  if (!CMovN.hasOneUse())
	    return SDValue();

	  // Both value operands must be constants so that extending them is free.
	  SDValue Val0 = CMovN.getOperand(0);
	  SDValue Val1 = CMovN.getOperand(1);
	  if (!isa<ConstantSDNode>(Val0.getNode()) ||
	      !isa<ConstantSDNode>(Val1.getNode()))
	    return SDValue();

	  unsigned ExtendOpcode = Extend->getOpcode();
	  SDLoc DL(Extend);
	  SDValue WideVal0 = DAG.getNode(ExtendOpcode, DL, TargetVT, Val0);
	  SDValue WideVal1 = DAG.getNode(ExtendOpcode, DL, TargetVT, Val1);

	  // Rebuild the CMOV at the wider type, reusing operands 2 and 3 unchanged.
	  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, WideVal0, WideVal1,
	                     CMovN.getOperand(2), CMovN.getOperand(3));
	}

	// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
	// This is more or less the reverse of combineBitcastvxi1.
	//
	// The scalar is broadcast across the result vector, each lane is masked down
	// to the single bit it represents, and a compare against that mask yields
	// the extended boolean. Applies only before operation legalization, with
	// SSE2 available and AVX512 unavailable.
	static SDValue
	combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const X86Subtarget &Subtarget) {
	  unsigned ExtOpc = N->getOpcode();
	  bool IsExtend = ExtOpc == ISD::SIGN_EXTEND || ExtOpc == ISD::ZERO_EXTEND ||
	                  ExtOpc == ISD::ANY_EXTEND;
	  if (!IsExtend || !DCI.isBeforeLegalizeOps())
	    return SDValue();
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT EltVT = VT.getScalarType();

	  // Input type must be extending a bool vector (bit-casted from a scalar
	  // integer) to legal integer types.
	  if (!VT.isVector())
	    return SDValue();
	  bool IsLegalElt = EltVT == MVT::i64 || EltVT == MVT::i32 ||
	                    EltVT == MVT::i16 || EltVT == MVT::i8;
	  if (!IsLegalElt)
	    return SDValue();
	  if (Src.getValueType().getScalarType() != MVT::i1 ||
	      Src.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue Scalar = Src.getOperand(0);
	  EVT ScalarVT = Scalar.getValueType();
	  if (!ScalarVT.isScalarInteger())
	    return SDValue();

	  SDLoc DL(N);
	  unsigned EltBits = EltVT.getSizeInBits();
	  unsigned NumElts = VT.getVectorNumElements();
	  assert(NumElts == ScalarVT.getSizeInBits() && "Unexpected bool vector size");

	  // Broadcast the scalar integer to the vector elements.
	  SDValue Vec;
	  SmallVector<int, 32> BroadcastMask;
	  if (NumElts <= EltBits) {
	    // For smaller scalar integers, we can simply any-extend it to the vector
	    // element size (we don't care about the upper bits) and broadcast it to
	    // all elements.
	    SDValue Widened = DAG.getAnyExtOrTrunc(Scalar, DL, EltVT);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Widened);
	    BroadcastMask.append(NumElts, 0);
	  } else {
	    // If the scalar integer is greater than the vector element size, then we
	    // must split it down into sub-sections for broadcasting. For example:
	    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
	    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
	    assert((NumElts % EltBits) == 0 && "Unexpected integer scale");
	    unsigned NumSections = NumElts / EltBits;
	    EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, EltBits);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, Scalar);
	    Vec = DAG.getBitcast(VT, Vec);

	    // Each sub-section index is repeated once per bit of an element.
	    for (unsigned Section = 0; Section != NumSections; ++Section)
	      BroadcastMask.append(EltBits, Section);
	  }
	  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, BroadcastMask);

	  // Now, mask the relevant bit in each element.
	  SmallVector<SDValue, 32> LaneBits;
	  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
	    int BitIdx = (Lane % EltBits);
	    APInt Bit = APInt::getBitsSet(EltBits, BitIdx, BitIdx + 1);
	    LaneBits.push_back(DAG.getConstant(Bit, DL, EltVT));
	  }
	  SDValue BitMask = DAG.getBuildVector(VT, DL, LaneBits);
	  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

	  // Compare against the bitmask and extend the result.
	  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
	  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
	  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

	  // For SEXT, this is now done, otherwise shift the result down for
	  // zero-extension.
	  if (ExtOpc == ISD::SIGN_EXTEND)
	    return Vec;

	  SDValue ShiftAmt = DAG.getConstant(EltBits - 1, DL, VT);
	  return DAG.getNode(ISD::SRL, DL, VT, Vec, ShiftAmt);
	}

	/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
	/// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
	/// concatenating it with UNDEFs) into vectors of the same size as the target
	/// type, which then extends the lowest elements.
	///
	/// Applies only before operation legalization and with SSE2; returns an
	/// empty SDValue whenever the types are not handled.
	static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
	                                          TargetLowering::DAGCombinerInfo &DCI,
	                                          const X86Subtarget &Subtarget) {
	  unsigned ExtOpc = N->getOpcode();
	  bool IsSigned = ExtOpc == ISD::SIGN_EXTEND;
	  if (!IsSigned && ExtOpc != ISD::ZERO_EXTEND)
	    return SDValue();
	  if (!DCI.isBeforeLegalizeOps() || !Subtarget.hasSSE2())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT EltVT = VT.getScalarType();
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  // Input type must be a vector and we must be extending legal integer types.
	  if (!VT.isVector())
	    return SDValue();
	  if (EltVT != MVT::i64 && EltVT != MVT::i32 && EltVT != MVT::i16)
	    return SDValue();
	  if (SrcEltVT != MVT::i32 && SrcEltVT != MVT::i16 && SrcEltVT != MVT::i8)
	    return SDValue();

	  // On AVX2+ targets, if the input/output types are both legal then we will be
	  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
	  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	      DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
	    return SDValue();

	  SDLoc DL(N);

	  // Widen Op to Size bits by concatenating it with UNDEF vectors.
	  auto PadWithUndef = [&DAG](const SDLoc &DL, SDValue Op, unsigned Size) {
	    EVT OpVT = Op.getValueType();
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), OpVT.getScalarType(),
	                                  Size / OpVT.getScalarSizeInBits());
	    SmallVector<SDValue, 8> Parts(Size / OpVT.getSizeInBits(),
	                                  DAG.getUNDEF(OpVT));
	    Parts[0] = Op;
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Parts);
	  };

	  // Emit the in-register extension matching the original opcode.
	  auto ExtendInReg = [&](SDValue Op, EVT ResVT) {
	    return IsSigned ? DAG.getSignExtendVectorInReg(Op, DL, ResVT)
	                    : DAG.getZeroExtendVectorInReg(Op, DL, ResVT);
	  };

	  // If target-size is less than 128-bits, extend to a type that would extend
	  // to 128 bits, extend that and extract the original target vector.
	  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
	    unsigned Scale = 128 / VT.getSizeInBits();
	    EVT WideVT =
	        EVT::getVectorVT(*DAG.getContext(), EltVT, 128 / EltVT.getSizeInBits());
	    SDValue WideSrc = PadWithUndef(DL, Src, Scale * SrcVT.getSizeInBits());
	    SDValue WideExt = DAG.getNode(ExtOpc, DL, WideVT, WideSrc);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideExt,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
	  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
	  // Also use this if we don't have SSE41 to allow the legalizer do its job.
	  bool UseSingleInReg = !Subtarget.hasSSE41() || VT.is128BitVector() ||
	                        (VT.is256BitVector() && Subtarget.hasInt256()) ||
	                        (VT.is512BitVector() && Subtarget.hasAVX512());
	  if (UseSingleInReg)
	    return ExtendInReg(PadWithUndef(DL, Src, VT.getSizeInBits()), VT);

	  // Split the result into SplitSize-bit pieces, extend each piece in
	  // register, and concatenate the pieces back together.
	  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
	    unsigned NumPieces = VT.getSizeInBits() / SplitSize;
	    unsigned EltsPerPiece = SplitSize / EltVT.getSizeInBits();
	    EVT PieceVT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltsPerPiece);
	    EVT SrcPieceVT =
	        EVT::getVectorVT(*DAG.getContext(), SrcEltVT, EltsPerPiece);

	    SmallVector<SDValue, 8> Pieces;
	    unsigned Offset = 0;
	    for (unsigned Piece = 0; Piece != NumPieces; ++Piece) {
	      SDValue SrcPiece = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcPieceVT,
	                                     Src, DAG.getIntPtrConstant(Offset, DL));
	      SrcPiece = PadWithUndef(DL, SrcPiece, SplitSize);
	      Pieces.push_back(ExtendInReg(SrcPiece, PieceVT));
	      Offset += EltsPerPiece;
	    }
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Pieces);
	  };

	  // On pre-AVX2 targets, split into 128-bit nodes of
	  // ISD::*_EXTEND_VECTOR_INREG.
	  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
	    return SplitAndExtendInReg(128);

	  // On pre-AVX512 targets, split into 256-bit nodes of
	  // ISD::*_EXTEND_VECTOR_INREG.
	  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
	    return SplitAndExtendInReg(256);

	  return SDValue();
	}

	/// Do target-specific dag combines on sign-extend nodes.
	///
	/// The divrem8 and CMOV promotions are tried at any stage. The remaining
	/// folds run only before operation legalization.
	static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  SDLoc DL(N);

	  if (SDValue DivRem8 = getDivRem8(N, DAG))
	    return DivRem8;

	  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
	    return NewCMov;

	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Invert and sign-extend a boolean is the same as zero-extend and subtract
	  // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
	  // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
	  // sext (xor Bool, -1) --> sub (zext Bool), 1
	  bool IsInvertedBool = SrcVT == MVT::i1 && Src.getOpcode() == ISD::XOR &&
	                        isAllOnesConstant(Src.getOperand(1)) &&
	                        Src.hasOneUse();
	  if (IsInvertedBool) {
	    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src.getOperand(0));
	    SDValue One = DAG.getConstant(1, DL, VT);
	    return DAG.getNode(ISD::SUB, DL, VT, Zext, One);
	  }

	  if (SDValue InReg = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return InReg;

	  if (SDValue BoolInReg =
	          combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return BoolInReg;

	  if (VT.isVector()) {
	    SDValue Widened = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
	    if (Widened)
	      return Widened;
	  }

	  // Last attempt; an empty result simply falls through to the caller.
	  return promoteExtBeforeAdd(N, DAG, Subtarget);
	}

	/// Fold negated operands of an FMA-family node into the opcode.
	///
	/// Each operand recognized by isFNEG() is replaced by the value it negates,
	/// and the opcode is switched between the FMADD / FMSUB / FNMADD / FNMSUB
	/// member of the same family (plain, _RND, S1, S3, S1_RND, S3_RND, 4S).
	/// Returns an empty SDValue when nothing changed, when the type is not yet
	/// legal, or when the subtarget has no FMA support for f32/f64.
	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  EVT ScalarVT = VT.getScalarType();
	  bool IsF32OrF64 = ScalarVT == MVT::f32 || ScalarVT == MVT::f64;
	  if (!IsF32OrF64 || !Subtarget.hasAnyFMA())
	    return SDValue();

	  SDValue A = N->getOperand(0);
	  SDValue B = N->getOperand(1);
	  SDValue C = N->getOperand(2);
	  const unsigned Opc = N->getOpcode();

	  // If V is a negation, replace it with the negated value and report true.
	  auto StripNeg = [](SDValue &V) {
	    SDValue Inner = isFNEG(V.getNode());
	    if (!Inner)
	      return false;
	    V = Inner;
	    return true;
	  };

	  // Do not convert the passthru input of scalar intrinsics.
	  // FIXME: We could allow negations of the lower element only.
	  bool IsPassthruA = Opc == X86ISD::FMADDS1 || Opc == X86ISD::FMADDS1_RND;
	  bool IsPassthruC = Opc == X86ISD::FMADDS3 || Opc == X86ISD::FMADDS3_RND;
	  bool NegA = !IsPassthruA && StripNeg(A);
	  bool NegB = StripNeg(B);
	  bool NegC = !IsPassthruC && StripNeg(C);

	  // Negative multiplication when NegA xor NegB
	  bool NegMul = (NegA != NegB);
	  bool HasNeg = NegA || NegB || NegC;

	  // Pick the family member matching the multiply/addend negation state.
	  auto Pick = [&](unsigned Madd, unsigned Msub, unsigned NMadd,
	                  unsigned NMsub) {
	    if (NegMul)
	      return NegC ? NMsub : NMadd;
	    return NegC ? Msub : Madd;
	  };

	  unsigned NewOpcode;
	  switch (Opc) {
	  case ISD::FMA:
	    NewOpcode =
	        Pick(ISD::FMA, X86ISD::FMSUB, X86ISD::FNMADD, X86ISD::FNMSUB);
	    break;
	  case X86ISD::FMADD_RND:
	    NewOpcode = Pick(X86ISD::FMADD_RND, X86ISD::FMSUB_RND,
	                     X86ISD::FNMADD_RND, X86ISD::FNMSUB_RND);
	    break;
	  case X86ISD::FMADDS1:
	    NewOpcode = Pick(X86ISD::FMADDS1, X86ISD::FMSUBS1, X86ISD::FNMADDS1,
	                     X86ISD::FNMSUBS1);
	    break;
	  case X86ISD::FMADDS3:
	    NewOpcode = Pick(X86ISD::FMADDS3, X86ISD::FMSUBS3, X86ISD::FNMADDS3,
	                     X86ISD::FNMSUBS3);
	    break;
	  case X86ISD::FMADDS1_RND:
	    NewOpcode = Pick(X86ISD::FMADDS1_RND, X86ISD::FMSUBS1_RND,
	                     X86ISD::FNMADDS1_RND, X86ISD::FNMSUBS1_RND);
	    break;
	  case X86ISD::FMADDS3_RND:
	    NewOpcode = Pick(X86ISD::FMADDS3_RND, X86ISD::FMSUBS3_RND,
	                     X86ISD::FNMADDS3_RND, X86ISD::FNMSUBS3_RND);
	    break;
	  case X86ISD::FMADD4S:
	    NewOpcode = Pick(X86ISD::FMADD4S, X86ISD::FMSUB4S, X86ISD::FNMADD4S,
	                     X86ISD::FNMSUB4S);
	    break;
	  default:
	    llvm_unreachable("Unexpected opcode!");
	  }

	  // Only return the node if the opcode was changed or one of the operands
	  // was negated. If not, we'd just recreate the node we started with.
	  if (!HasNeg && NewOpcode == Opc)
	    return SDValue();

	  // Keep the fourth operand when the original node has one.
	  if (N->getNumOperands() == 4)
	    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
	  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
	}

	// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C), and likewise
	// FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C), including the _RND forms.
	// A fourth operand, when present, is passed through unchanged.
	static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  // Nothing to do unless the third operand is a negation.
	  SDValue Addend = isFNEG(N->getOperand(2).getNode());
	  if (!Addend)
	    return SDValue();

	  // Swap to the opposite opcode of the pair.
	  unsigned SwappedOpc;
	  switch (N->getOpcode()) {
	  case X86ISD::FMADDSUB:
	    SwappedOpc = X86ISD::FMSUBADD;
	    break;
	  case X86ISD::FMADDSUB_RND:
	    SwappedOpc = X86ISD::FMSUBADD_RND;
	    break;
	  case X86ISD::FMSUBADD:
	    SwappedOpc = X86ISD::FMADDSUB;
	    break;
	  case X86ISD::FMSUBADD_RND:
	    SwappedOpc = X86ISD::FMADDSUB_RND;
	    break;
	  default:
	    llvm_unreachable("Unexpected opcode!");
	  }

	  SDValue Mul0 = N->getOperand(0);
	  SDValue Mul1 = N->getOperand(1);
	  if (N->getNumOperands() == 4)
	    return DAG.getNode(SwappedOpc, dl, VT, Mul0, Mul1, Addend,
	                       N->getOperand(3));
	  return DAG.getNode(SwappedOpc, dl, VT, Mul0, Mul1, Addend);
	}

	/// Do target-specific dag combines on zero-extend nodes.
	///
	/// First tries to widen an X86ISD::SETCC_CARRY reached through a single-use
	/// 'and' or 'truncate', then runs the shared extend combines in a fixed
	/// order.
	static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
	  //     (and (i32 x86isd::setcc_carry), 1)
	  // This eliminates the zext. This transformation is necessary because
	  // ISD::SETCC is always legalized to i8. The same rewrite applies when the
	  // setcc_carry is reached through a truncate instead of an 'and'.
	  bool IsAnd = Src.getOpcode() == ISD::AND;
	  bool IsTrunc = Src.getOpcode() == ISD::TRUNCATE;
	  if ((IsAnd || IsTrunc) && Src.hasOneUse() &&
	      Src.getOperand(0).hasOneUse()) {
	    SDValue Carry = Src.getOperand(0);
	    if (Carry.getOpcode() == X86ISD::SETCC_CARRY) {
	      // For the 'and' form the mask must be exactly 1; otherwise give up on
	      // this node entirely.
	      if (IsAnd && !isOneConstant(Src.getOperand(1)))
	        return SDValue();

	      SDValue WideCarry =
	          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, Carry.getOperand(0),
	                      Carry.getOperand(1));
	      return DAG.getNode(ISD::AND, dl, VT, WideCarry,
	                         DAG.getConstant(1, dl, VT));
	    }
	  }

	  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
	    return NewCMov;

	  if (SDValue InReg = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return InReg;

	  if (SDValue BoolInReg =
	          combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return BoolInReg;

	  if (VT.isVector()) {
	    SDValue Widened = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
	    if (Widened)
	      return Widened;
	  }

	  if (SDValue DivRem8 = getDivRem8(N, DAG))
	    return DivRem8;

	  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return NewAdd;

	  // Last attempt; an empty result simply falls through to the caller.
	  return combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget);
	}

	/// Try to map a 128-bit or larger integer comparison to vector instructions
	/// before type legalization splits it up into chunks.
	///
	/// \p SetCC must be an equality comparison (SETEQ or SETNE). Handles i128
	/// with SSE2 and i256 with AVX2; returns an empty SDValue otherwise, and
	/// also when vector instructions must not be introduced (soft-float
	/// subtarget or a function marked noimplicitfloat).
	static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

	  // We're looking for an oversized integer equality comparison.
	  SDValue X = SetCC->getOperand(0);
	  SDValue Y = SetCC->getOperand(1);
	  EVT OpVT = X.getValueType();
	  unsigned OpSize = OpVT.getSizeInBits();
	  if (!OpVT.isScalarInteger() || OpSize < 128)
	    return SDValue();

	  // Ignore a comparison with zero because that gets special treatment in
	  // EmitTest(). But make an exception for the special case of a pair of
	  // logically-combined vector-sized operands compared to zero. This pattern may
	  // be generated by the memcmp expansion pass with oversized integer compares
	  // (see PR33325).
	  bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
	                          X.getOperand(0).getOpcode() == ISD::XOR &&
	                          X.getOperand(1).getOpcode() == ISD::XOR;
	  if (isNullConstant(Y) && !IsOrXorXorCCZero)
	    return SDValue();

	  // Bail out if we know that this is not really just an oversized integer.
	  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
	      peekThroughBitcasts(Y).getValueType() == MVT::f128)
	    return SDValue();

	  // The rewrite below emits PCMPEQ/MOVMSK vector nodes. Those must not be
	  // introduced for soft-float subtargets or for functions that forbid
	  // implicit floating-point/vector instructions.
	  if (Subtarget.useSoftFloat() ||
	      DAG.getMachineFunction().getFunction().hasFnAttribute(
	          Attribute::NoImplicitFloat))
	    return SDValue();

	  // TODO: Use PXOR + PTEST for SSE4.1 or later?
	  // TODO: Add support for AVX-512.
	  EVT VT = SetCC->getValueType(0);
	  SDLoc DL(SetCC);
	  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
	      (OpSize == 256 && Subtarget.hasAVX2())) {
	    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
	    SDValue Cmp;
	    if (IsOrXorXorCCZero) {
	      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
	      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
	      // Use 2 vector equality compares and 'and' the results before doing a
	      // MOVMSK.
	      SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
	      SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
	      SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
	      SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
	      SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
	      SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
	      Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
	    } else {
	      SDValue VecX = DAG.getBitcast(VecVT, X);
	      SDValue VecY = DAG.getBitcast(VecVT, Y);
	      Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
	    }
	    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
	    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
	    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
	    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
	    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
	    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
	    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
	                                    MVT::i32);
	    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
	  }

	  return SDValue();
	}

	/// Do target-specific dag combines on SETCC nodes.
	///
	/// Handles three cases: equality compares against a negated value (and
	/// oversized integer equality), compares of a sign-extended i1 vector with
	/// zero, and v4f32 compares on SSE1-only targets.
	static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const bool IsEquality = CC == ISD::SETNE || CC == ISD::SETEQ;

	  if (IsEquality) {
	    EVT OpVT = LHS.getValueType();

	    // If Neg is a single-use (0 - y), compare (Other + y) against zero:
	    //   0-x == y --> x+y == 0        x == 0-y --> x+y == 0
	    //   0-x != y --> x+y != 0        x != 0-y --> x+y != 0
	    auto FoldNegation = [&](SDValue Neg, SDValue Other) -> SDValue {
	      if (Neg.getOpcode() != ISD::SUB || !isNullConstant(Neg.getOperand(0)) ||
	          !Neg.hasOneUse())
	        return SDValue();
	      SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, Other, Neg.getOperand(1));
	      return DAG.getSetCC(DL, VT, Sum, DAG.getConstant(0, DL, OpVT), CC);
	    };

	    if (SDValue Folded = FoldNegation(LHS, RHS))
	      return Folded;
	    if (SDValue Folded = FoldNegation(RHS, LHS))
	      return Folded;

	    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
	      return V;
	  }

	  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	      (IsEquality || ISD::isSignedIntSetCC(CC))) {
	    // Put build_vectors on the right.
	    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
	      std::swap(LHS, RHS);
	      CC = ISD::getSetCCSwappedOperands(CC);
	    }

	    bool IsSEXT0 =
	        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
	        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
	    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

	    // Comparing a sign-extended i1 vector with zero depends only on the
	    // original i1 mask.
	    if (IsSEXT0 && IsVZero1) {
	      SDValue Mask = LHS.getOperand(0);
	      assert(VT == LHS.getOperand(0).getValueType() &&
	             "Uexpected operand type");
	      switch (CC) {
	      case ISD::SETGT:
	        return DAG.getConstant(0, DL, VT);
	      case ISD::SETLE:
	        return DAG.getConstant(1, DL, VT);
	      case ISD::SETEQ:
	      case ISD::SETGE:
	        return DAG.getNOT(DL, Mask, VT);
	      default:
	        assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
	               "Unexpected condition code!");
	        return Mask;
	      }
	    }
	  }

	  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
	  // to avoid scalarization via legalization because v4i32 is not a legal type.
	  bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (IsSSE1Only && VT == MVT::v4i32 && LHS.getValueType() == MVT::v4f32)
	    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

	  return SDValue();
	}

	/// Combine for X86ISD::MOVMSK: only the most significant bit of each source
	/// element is demanded, so ask SimplifyDemandedBits to simplify the operand
	/// under that mask.
	///
	/// \returns SDValue(N, 0) when the operand was simplified in place, otherwise
	///          an empty SDValue.
	static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Input = N->getOperand(0);
	  MVT InputVT = Input.getSimpleValueType();

	  // MOVMSK only uses the MSB from each vector element.
	  APInt SignBitOnly(APInt::getSignMask(InputVT.getScalarSizeInBits()));
	  KnownBits KnownInput;

	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedBits(Input, SignBitOnly, KnownInput, TLO))
	    return SDValue();

	  DCI.AddToWorklist(Input.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	/// Combine for gather/scatter nodes.
	///
	/// Before operation legalization the index (operand 4) is canonicalized:
	/// widening sign extends from <=32 bits are stripped, odd element widths are
	/// converted to i32/i64, and 32->64 zero extends are stripped when the sign
	/// bit of the source is known zero. Afterwards the mask (operand 2) is
	/// simplified: with AVX512 a SIGN_EXTEND_INREG is dropped, otherwise only the
	/// sign bit of each mask element is demanded.
	///
	/// \returns SDValue(N, 0) when N was updated in place, otherwise an empty
	///          SDValue.
	static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const X86Subtarget &Subtarget) {
	  SDLoc DL(N);

	  // Replace operand OpNo of N in place, keeping all other operands.
	  auto ReplaceOperand = [&](unsigned OpNo, SDValue NewOp) {
	    SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
	    Ops[OpNo] = NewOp;
	    DAG.UpdateNodeOperands(N, Ops);
	  };

	  if (DCI.isBeforeLegalizeOps()) {
	    SDValue Index = N->getOperand(4);

	    // Remove any sign extends from 32 or smaller to larger than 32.
	    // Only do this before LegalizeOps in case we need the sign extend for
	    // legalization.
	    if (Index.getOpcode() == ISD::SIGN_EXTEND &&
	        Index.getScalarValueSizeInBits() > 32 &&
	        Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
	      ReplaceOperand(4, Index.getOperand(0));
	      // The original sign extend has fewer users now; put it back on the
	      // worklist in case it can be removed.
	      DCI.AddToWorklist(Index.getNode());
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }

	    // Make sure the index is either i32 or i64.
	    unsigned IndexBits = Index.getScalarValueSizeInBits();
	    if (IndexBits != 32 && IndexBits != 64) {
	      MVT NewEltVT = IndexBits > 32 ? MVT::i64 : MVT::i32;
	      EVT NewIndexVT =
	          EVT::getVectorVT(*DAG.getContext(), NewEltVT,
	                           Index.getValueType().getVectorNumElements());
	      Index = DAG.getSExtOrTrunc(Index, DL, NewIndexVT);
	      ReplaceOperand(4, Index);
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }

	    // Try to remove zero extends from 32->64 if we know the sign bit of
	    // the input is zero.
	    if (Index.getOpcode() == ISD::ZERO_EXTEND &&
	        Index.getScalarValueSizeInBits() == 64 &&
	        Index.getOperand(0).getScalarValueSizeInBits() == 32 &&
	        DAG.SignBitIsZero(Index.getOperand(0))) {
	      ReplaceOperand(4, Index.getOperand(0));
	      // The original zero extend has fewer users now; put it back on the
	      // worklist in case it can be removed.
	      DCI.AddToWorklist(Index.getNode());
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }
	  }

	  SDValue Mask = N->getOperand(2);

	  // Gather and Scatter instructions use k-registers for masks. The type of
	  // the masks is v*i1. So the mask will be truncated anyway.
	  // The SIGN_EXTEND_INREG may be dropped.
	  if (Subtarget.hasAVX512()) {
	    if (Mask.getOpcode() != ISD::SIGN_EXTEND_INREG)
	      return SDValue();
	    ReplaceOperand(2, Mask.getOperand(0));
	    return SDValue(N, 0);
	  }

	  // With AVX2 we only demand the upper bit of the mask.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());
	  KnownBits KnownMask;
	  APInt SignBitOnly(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
	  if (!TLI.SimplifyDemandedBits(Mask, SignBitOnly, KnownMask, TLO))
	    return SDValue();

	  DCI.AddToWorklist(Mask.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
	//
	// Hands the condition code (operand 0) and the flags producer (operand 1) to
	// combineSetCCEFLAGS(); if that yields new flags, a fresh SETCC is built from
	// the resulting condition code and flags.
	static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
	  SDValue EFLAGS = N->getOperand(1);

	  // Try to simplify the EFLAGS and condition code operands.
	  SDValue NewFlags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget);
	  if (!NewFlags)
	    return SDValue();

	  return getSETCC(CC, NewFlags, DL, DAG);
	}

	/// Optimize branch condition evaluation.
	///
	/// Feeds the condition code (operand 2) and flags (operand 3) of the branch to
	/// combineSetCCEFLAGS() and, when that produces new flags, rebuilds the
	/// X86ISD::BRCOND with them.
	static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue EFLAGS = N->getOperand(3);
	  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

	  // Try to simplify the EFLAGS and condition code operands.
	  SDValue NewFlags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget);
	  if (!NewFlags)
	    return SDValue();

	  // Operands 0 and 1 are deliberately read from N only now:
	  // combineSetCCEFLAGS can RAUW them under us, so references taken before
	  // the call could be stale.
	  SDValue CondCode = DAG.getConstant(CC, DL, MVT::i8);
	  return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
	                     N->getOperand(1), CondCode, NewFlags);
	}

	/// Take advantage of vector comparisons producing 0 or -1 in each lane to
	/// apply a unary operation to a constant ahead of time.
	///
	/// The general transformation is:
	///    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	///       AND(VECTOR_CMP(x,y), constant2)
	///    constant2 = UNARYOP(constant)
	///
	/// \returns the rewritten value, or an empty SDValue if N does not match.
	static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
	                                                  SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  SDValue AndOp = N->getOperand(0);

	  // Bail out unless this is a vector operation whose operand is a bitwise
	  // AND of a SETCC, and the operation does not change the overall size.
	  if (!VT.isVector())
	    return SDValue();
	  if (AndOp.getOpcode() != ISD::AND)
	    return SDValue();
	  if (AndOp.getOperand(0).getOpcode() != ISD::SETCC)
	    return SDValue();
	  if (VT.getSizeInBits() != AndOp.getValueSizeInBits())
	    return SDValue();

	  // Now check that the other operand of the AND is a constant. We could
	  // make the transformation for non-constant splats as well, but it's unclear
	  // that would be a benefit as it would not eliminate any operations, just
	  // perform one more step in scalar code before moving to the vector unit.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(AndOp.getOperand(1));
	  if (!MaskBV || !MaskBV->isConstant())
	    return SDValue();

	  // Everything checks out. Build up the new and improved node.
	  SDLoc DL(N);
	  EVT IntVT = MaskBV->getValueType(0);

	  // Apply the unary operation to the constant mask itself.
	  SDValue ConvertedConst =
	      DAG.getNode(N->getOpcode(), DL, VT, SDValue(MaskBV, 0));

	  // The AND node needs bitcasts to/from an integer vector type around it.
	  SDValue IntMask = DAG.getBitcast(IntVT, ConvertedConst);
	  SDValue MaskedCmp =
	      DAG.getNode(ISD::AND, DL, IntVT, AndOp.getOperand(0), IntMask);
	  return DAG.getBitcast(VT, MaskedCmp);
	}

	/// Combine for ISD::UINT_TO_FP: rewrite to SINT_TO_FP where that gives the
	/// same result, i.e. for vXi8/vXi16 inputs (after zero extension to vXi32)
	/// and for inputs whose sign bit is known to be zero.
	static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
	  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
	  const bool IsNarrowElt = SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16;
	  if (SrcVT.isVector() && IsNarrowElt) {
	    SDLoc dl(N);
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Src);

	    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Widened);
	  }

	  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
	  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
	  // the optimization here.
	  if (!DAG.SignBitIsZero(Src))
	    return SDValue();

	  return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Src);
	}

	/// Combine for ISD::SINT_TO_FP.
	///
	/// In order: fold the conversion into a masked-compare constant, widen narrow
	/// vector inputs to vXi32, truncate >32-bit inputs with enough sign bits to
	/// i32 (without AVX512DQ), and finally turn a conversion of a loaded i64 on a
	/// 32-bit target into a FILD built by BuildFILD().
	static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // First try to optimize away the conversion entirely when it's
	  // conditionally from a constant. Vectors only.
	  if (SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG))
	    return Folded;

	  // Now move on to more general possibilities.
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
	  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
	  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
	  if (SrcVT.isVector() &&
	      (SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16 ||
	       (SrcEltVT == MVT::i1 &&
	        !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)))) {
	    SDLoc dl(N);
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Src);
	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Widened);
	  }

	  // Without AVX512DQ we only support i64 to float scalar conversion. For both
	  // vectors and scalars, see if we know that the upper bits are all the sign
	  // bit, in which case we can truncate the input to i32 and convert from that.
	  if (SrcVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
	    unsigned SrcBits = SrcVT.getScalarSizeInBits();
	    unsigned SignBits = DAG.ComputeNumSignBits(Src);
	    if (SignBits >= (SrcBits - 31)) {
	      EVT I32VT = EVT::getIntegerVT(*DAG.getContext(), 32);
	      EVT NarrowVT =
	          SrcVT.isVector()
	              ? EVT::getVectorVT(*DAG.getContext(), I32VT,
	                                 SrcVT.getVectorNumElements())
	              : I32VT;
	      SDLoc dl(N);
	      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Src);
	      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Narrowed);
	    }
	  }

	  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
	  // a 32-bit target where SSE doesn't support i64->FP operations.
	  if (Subtarget.useSoftFloat() || Src.getOpcode() != ISD::LOAD)
	    return SDValue();

	  LoadSDNode *Load = cast<LoadSDNode>(Src.getNode());
	  EVT LoadVT = Load->getValueType(0);

	  // This transformation is not supported if the result type is f16 or f128.
	  if (VT == MVT::f16 || VT == MVT::f128)
	    return SDValue();

	  const bool IsPlainScalarI64Load =
	      !Load->isVolatile() && !VT.isVector() &&
	      ISD::isNON_EXTLoad(Src.getNode()) && Src.hasOneUse() &&
	      !Subtarget.is64Bit() && LoadVT == MVT::i64;
	  if (!IsPlainScalarI64Load)
	    return SDValue();

	  SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
	      SDValue(N, 0), LoadVT, Load->getChain(), Src, DAG);
	  // Redirect users of the load's chain result to the FILD's chain result.
	  DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILDChain.getValue(1));
	  return FILDChain;
	}

	/// Combine for X86ISD::SBB: if combineCarryThroughADD() can supply a
	/// replacement for the carry-in (operand 2), rebuild the SBB with the same
	/// first two operands and the new flags.
	static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
	  SDValue NewCarry = combineCarryThroughADD(N->getOperand(2));
	  if (!NewCarry)
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  // Result list: the value plus an i32 flags result.
	  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
	  return DAG.getNode(X86ISD::SBB, SDLoc(N), ResultVTs, N->getOperand(0),
	                     N->getOperand(1), NewCarry);
	}

	// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
	static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI) {
	  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
	  // the result is either zero or one (depending on the input carry bit).
	  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
	  const bool AddsZeroToZero = X86::isZeroNode(N->getOperand(0)) &&
	                              X86::isZeroNode(N->getOperand(1));
	  // We don't have a good way to replace an EFLAGS use, so only do this when
	  // the flags result is dead right now.
	  if (AddsZeroToZero && SDValue(N, 1).use_empty()) {
	    SDLoc DL(N);
	    EVT VT = N->getValueType(0);
	    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
	    SDValue CondB = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	    SDValue SetCarry =
	        DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, CondB, N->getOperand(2));
	    SDValue LowBit =
	        DAG.getNode(ISD::AND, DL, VT, SetCarry, DAG.getConstant(1, DL, VT));
	    return DCI.CombineTo(N, LowBit, CarryOut);
	  }

	  // Otherwise see whether the carry-in itself can be replaced.
	  SDValue NewCarry = combineCarryThroughADD(N->getOperand(2));
	  if (!NewCarry)
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
	  return DAG.getNode(X86ISD::ADC, SDLoc(N), ResultVTs, N->getOperand(0),
	                     N->getOperand(1), NewCarry);
	}

	/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
	/// which is more useful than 0/1 in some cases.
	///
	/// The SETCC_CARRY is created as i8; it is masked with 1 when N produces i8
	/// and truncated when N produces i1 (no other result type is expected).
	static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  MVT VT = N->getSimpleValueType(0);

	  // "Condition code B" is also known as "the carry flag" (CF).
	  SDValue CarryCond = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	  SDValue CarryMask =
	      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CarryCond, EFLAGS);

	  if (VT != MVT::i8) {
	    assert(VT == MVT::i1 && "Unexpected type for SETCC node");
	    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryMask);
	  }

	  return DAG.getNode(ISD::AND, DL, VT, CarryMask, DAG.getConstant(1, DL, VT));
	}

	/// If this is an add or subtract where one operand is produced by a cmp+setcc,
	/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
	/// with CMP+{ADC, SBB}.
	///
	/// \param N   the add/sub node being combined; any opcode other than ISD::SUB
	///            is handled as an add.
	/// \param DAG the DAG used to create replacement nodes.
	/// \returns the replacement value, or an empty SDValue when the operands do
	///          not match one of the X86ISD::SETCC patterns handled below.
	static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
	bool IsSub = N->getOpcode() == ISD::SUB;
	SDValue X = N->getOperand(0);
	SDValue Y = N->getOperand(1);

	// If this is an add, canonicalize a zext operand to the RHS.
	// TODO: Incomplete? What if both sides are zexts?
	if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
	Y.getOpcode() != ISD::ZERO_EXTEND)
	std::swap(X, Y);

	// Look through a one-use zext.
	bool PeekedThroughZext = false;
	if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
	Y = Y.getOperand(0);
	PeekedThroughZext = true;
	}

	// If this is an add, canonicalize a setcc operand to the RHS.
	// TODO: Incomplete? What if both sides are setcc?
	// TODO: Should we allow peeking through a zext of the other operand?
	if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
	Y.getOpcode() != X86ISD::SETCC)
	std::swap(X, Y);

	// From here on Y must be a single-use X86ISD::SETCC; X is the other operand.
	if (Y.getOpcode() != X86ISD::SETCC \|\| !Y.hasOneUse())
	return SDValue();

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	// Operand 0 of the SETCC is its condition code, operand 1 its flags input.
	X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	auto *ConstantX = dyn_cast<ConstantSDNode>(X);
	if (ConstantX) {
	if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
	// This is a complicated way to get -1 or 0 from the carry flag:
	// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Y.getOperand(1));
	}

	if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
	SDValue EFLAGS = Y->getOperand(1);
	// Only a single-use integer X86ISD::SUB with a non-constant second operand
	// is rebuilt with its operands swapped.
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	// Swap the operands of a SUB, and we have the same pattern as above.
	// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
	// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
	SDValue NewSub = DAG.getNode(
	X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	// Use the same result number of the new SUB that the SETCC used of the old.
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	NewEFLAGS);
	}
	}
	}

	if (CC == X86::COND_B) {
	// X + SETB Z --> X + (mask SBB Z, Z)
	// X - SETB Z --> X - (mask SBB Z, Z)
	// TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
	SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
	// Match the width of the add/sub if the materialized value differs from it.
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}

	if (CC == X86::COND_A) {
	SDValue EFLAGS = Y->getOperand(1);
	// Try to convert COND_A into COND_B in an attempt to facilitate
	// materializing "setb reg".
	//
	// Do not flip "e > c", where "c" is a constant, because Cmp instruction
	// cannot take an immediate as its first operand.
	//
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
	EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}
	}

	// Everything below handles only equality conditions.
	if (CC != X86::COND_E && CC != X86::COND_NE)
	return SDValue();

	// The flags must come from a single-use integer compare against zero.
	SDValue Cmp = Y.getOperand(1);
	if (Cmp.getOpcode() != X86ISD::CMP \|\| !Cmp.hasOneUse() \|\|
	!X86::isZeroNode(Cmp.getOperand(1)) \|\|
	!Cmp.getOperand(0).getValueType().isInteger())
	return SDValue();

	// Z is the value that was compared against zero.
	SDValue Z = Cmp.getOperand(0);
	EVT ZVT = Z.getValueType();

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	if (ConstantX) {
	// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
	// fake operands:
	// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
	// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
	if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
	SDValue Zero = DAG.getConstant(0, DL, ZVT);
	SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
	// Result 1 of the X86ISD::SUB is its i32 flags value.
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	}

	// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
	// with fake operands:
	// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
	// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
	if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
	}
	}

	// General case: X is arbitrary.
	// (cmp Z, 1) sets the carry flag if Z is 0.
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

	// Add the flags type for ADC/SBB nodes.
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
	// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
	if (CC == X86::COND_NE)
	return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
	DAG.getConstant(-1ULL, DL, VT), Cmp1);

	// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
	// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
	DAG.getConstant(0, DL, VT), Cmp1);
	}

	/// Try to turn a reduction add of a narrowable vector multiply into
	/// X86ISD::VPMADDWD: the multiply operands are truncated to i16, multiplied
	/// and pairwise added by VPMADDWD, padded with zeros back to the original
	/// width and added to the other add operand.
	///
	/// \returns the new add, or an empty SDValue if the pattern does not apply.
	static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // One operand has to be the multiply; the other is the running value.
	  SDValue Mul = N->getOperand(0);
	  SDValue Accum = N->getOperand(1);
	  if (Mul.getOpcode() != ISD::MUL)
	    std::swap(Mul, Accum);
	  if (Mul.getOpcode() != ISD::MUL)
	    return SDValue();

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();

	  unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);
	  // Width of the operands once each element is narrowed to 16 bits.
	  unsigned NarrowBits = NumElts * 16;
	  // If the vector size is less than 128, or greater than the supported RegSize,
	  // do not use PMADD.
	  if (NarrowBits < 128 || NarrowBits > RegSize)
	    return SDValue();

	  SDLoc DL(N);
	  EVT NarrowVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
	  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);

	  // Shrink the operands of mul.
	  SDValue NarrowLHS =
	      DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Mul->getOperand(0));
	  SDValue NarrowRHS =
	      DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Mul->getOperand(1));

	  // Madd vector size is half of the original vector size
	  SDValue Madd =
	      DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, NarrowLHS, NarrowRHS);
	  // Fill the rest of the output with 0
	  SDValue ZeroHalf =
	      getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
	  SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, ZeroHalf);
	  return DAG.getNode(ISD::ADD, DL, VT, Padded, Accum);
	}

	/// Try to turn a reduction add whose non-phi operand is a vector select
	/// matching an absolute-difference pattern (detectZextAbsDiff) into a PSADBW
	/// built by createPSADBW(), followed by an add with the phi.
	///
	/// \returns the new add, or an empty SDValue if the pattern does not apply.
	static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // TODO: There's nothing special about i32, any integer type above i16 should
	  // work just as well.
	  if (!VT.isVector() || !VT.isSimple() ||
	      VT.getVectorElementType() != MVT::i32)
	    return SDValue();

	  unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
	  // TODO: We should be able to handle larger vectors by splitting them before
	  // feeding them into several SADs, and then reducing over those.
	  if (VT.getSizeInBits() / 4 > RegSize)
	    return SDValue();

	  // We know N is a reduction add, which means one of its operands is a phi.
	  // To match SAD, we need the other operand to be a vector select.
	  SDValue SelectOp, Phi;
	  if (Op0.getOpcode() == ISD::VSELECT) {
	    SelectOp = Op0;
	    Phi = Op1;
	  } else if (Op1.getOpcode() == ISD::VSELECT) {
	    SelectOp = Op1;
	    Phi = Op0;
	  } else {
	    return SDValue();
	  }

	  // Check whether we have an abs-diff pattern feeding into the select.
	  // Op0/Op1 are passed on to createPSADBW() afterwards.
	  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
	    return SDValue();

	  // SAD pattern detected. Now build a SAD instruction and an addition for
	  // reduction. Note that the number of elements of the result of SAD is less
	  // than the number of elements of its input. Therefore, we could only update
	  // part of elements in the reduction vector.
	  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

	  // The output of PSADBW is a vector of i64.
	  // We need to turn the vector of i64 into a vector of i32.
	  // If the reduction vector is at least as wide as the psadbw result, just
	  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
	  // anyway.
	  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
	  const unsigned ReductionBits = VT.getSizeInBits();
	  const unsigned SadBits = ResVT.getSizeInBits();

	  if (ReductionBits < SadBits) {
	    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
	  } else {
	    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
	    if (ReductionBits > SadBits) {
	      // Fill the upper elements with zero to match the add width.
	      SDValue ZeroVec = DAG.getConstant(0, DL, VT);
	      Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, ZeroVec, Sad,
	                        DAG.getIntPtrConstant(0, DL));
	    }
	  }

	  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
	}

	/// Convert vector increment or decrement to sub/add with an all-ones constant:
	/// add X, <1, 1...> --> sub X, <-1, -1...>
	/// sub X, <1, 1...> --> add X, <-1, -1...>
	/// The all-ones vector constant can be materialized using a pcmpeq instruction
	/// that is commonly recognized as an idiom (has no register dependency), so
	/// that's better/smaller than loading a splat 1 constant.
	static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opcode = N->getOpcode();
	  assert((Opcode == ISD::ADD || Opcode == ISD::SUB) &&
	         "Unexpected opcode for increment/decrement transform");

	  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
	  // out and wait for legalization if we have an unsupported vector length.
	  EVT VT = N->getValueType(0);
	  const bool IsSupportedWidth =
	      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
	  if (!IsSupportedWidth)
	    return SDValue();

	  // The second operand must be a constant splat of 1.
	  APInt Splat;
	  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), Splat))
	    return SDValue();
	  if (!Splat.isOneValue())
	    return SDValue();

	  SDValue MinusOnes = getOnesVector(VT, DAG, SDLoc(N));
	  unsigned FlippedOpcode = (Opcode == ISD::ADD) ? ISD::SUB : ISD::ADD;
	  return DAG.getNode(FlippedOpcode, SDLoc(N), VT, N->getOperand(0), MinusOnes);
	}

	/// Combine for ISD::ADD. Tries the reduction patterns (SAD, then MADD) when
	/// the node carries the vector-reduction flag, then horizontal add formation,
	/// then the vector increment rewrite, and finally the ADC/SBB conversion.
	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  if (N->getFlags().hasVectorReduction()) {
	    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	      return Sad;
	    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	      return MAdd;
	  }

	  EVT VT = N->getValueType(0);
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // Try to synthesize horizontal adds from adds of shuffles.
	  const bool Is128BitHAdd =
	      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
	  const bool Is256BitHAdd =
	      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
	  // isHorizontalBinOp() is only consulted for the supported types and is
	  // handed Op0/Op1 themselves, which then feed the HADD node.
	  if ((Is128BitHAdd || Is256BitHAdd) && isHorizontalBinOp(Op0, Op1, true))
	    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Try to turn `umax(a,b) - b` or `a - umin(a,b)` into X86ISD::SUBUS(a, b).
	///
	/// For v8i32/v16i32/v8i64 the operation is performed on a narrower element
	/// type, which requires the SUBUS left-hand side to have enough known leading
	/// zero bits; the result is extended back to the original type.
	///
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // PSUBUS is supported, starting from SSE2, but special preprocessing
	  // for v8i32 requires umin, which appears in SSE41.
	  const bool OkSSE2 =
	      Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16);
	  const bool OkSSE41 = Subtarget.hasSSE41() && (VT == MVT::v8i32);
	  const bool OkAVX2 =
	      Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16);
	  const bool OkAVX512BW =
	      Subtarget.hasAVX512() && Subtarget.hasBWI() &&
	      (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
	       VT == MVT::v8i64);
	  if (!OkSSE2 && !OkSSE41 && !OkAVX2 && !OkAVX512BW)
	    return SDValue();

	  SDValue SubusLHS, SubusRHS;
	  // Try to find umax(a,b) - b or a - umin(a,b) patterns
	  // they may be converted to subus(a,b).
	  // TODO: Need to add IR canonicalization for this code.
	  if (Op0.getOpcode() == ISD::UMAX) {
	    SDValue MaxA = Op0.getOperand(0);
	    SDValue MaxB = Op0.getOperand(1);
	    if (MaxA != Op1 && MaxB != Op1)
	      return SDValue();
	    // Whichever umax operand is not the subtrahend becomes the LHS.
	    SubusLHS = (MaxA == Op1) ? MaxB : MaxA;
	    SubusRHS = Op1;
	  } else if (Op1.getOpcode() == ISD::UMIN) {
	    SDValue MinA = Op1.getOperand(0);
	    SDValue MinB = Op1.getOperand(1);
	    if (MinA != Op0 && MinB != Op0)
	      return SDValue();
	    // Whichever umin operand is not the minuend becomes the RHS.
	    SubusLHS = Op0;
	    SubusRHS = (MinA == Op0) ? MinB : MinA;
	  } else {
	    return SDValue();
	  }

	  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
	  // special preprocessing in some cases.
	  const bool NeedsShrink =
	      VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64;
	  if (!NeedsShrink)
	    return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

	  // Special preprocessing case can be only applied
	  // if the value was zero extended from 16 bit,
	  // so we require first 16 bits to be zeros for 32 bit
	  // values, or first 48 bits for 64 bit values.
	  KnownBits LHSKnown;
	  DAG.computeKnownBits(SubusLHS, LHSKnown);
	  unsigned LeadingZeros = LHSKnown.countMinLeadingZeros();
	  if ((VT == MVT::v8i64 && LeadingZeros < 48) || LeadingZeros < 16)
	    return SDValue();

	  EVT WideVT = SubusLHS.getValueType();
	  EVT NarrowVT = MVT::v8i16;
	  if (VT != MVT::v8i32 && VT != MVT::v8i64)
	    NarrowVT = LeadingZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

	  // If SubusLHS is zero extended - truncate SubusRHS to its
	  // size SubusRHS = umin(0xFFF.., SubusRHS).
	  SDLoc LHSLoc(SubusLHS);
	  APInt NarrowMax = APInt::getLowBitsSet(WideVT.getScalarSizeInBits(),
	                                         NarrowVT.getScalarSizeInBits());
	  SDValue SaturationConst = DAG.getConstant(NarrowMax, LHSLoc, WideVT);
	  SDValue ClampedRHS =
	      DAG.getNode(ISD::UMIN, LHSLoc, WideVT, SubusRHS, SaturationConst);
	  SDValue NarrowLHS = DAG.getZExtOrTrunc(SubusLHS, LHSLoc, NarrowVT);
	  SDValue NarrowRHS =
	      DAG.getZExtOrTrunc(ClampedRHS, SDLoc(SubusRHS), NarrowVT);
	  SDValue Psubus =
	      DAG.getNode(X86ISD::SUBUS, SDLoc(N), NarrowVT, NarrowLHS, NarrowRHS);
	  // Zero extend the result, it may be used somewhere as 32 bit,
	  // if not zext and following trunc will shrink.
	  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), WideVT);
	}

	/// Combine for ISD::SUB. Tries to fold a constant LHS into a preceding XOR,
	/// then horizontal sub formation, the vector decrement rewrite, PSUBUS
	/// formation, and finally the ADC/SBB conversion.
	static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // X86 can't encode an immediate LHS of a sub. See if we can push the
	  // negation into a preceding instruction.
	  auto *LHSConst = dyn_cast<ConstantSDNode>(Op0);
	  if (LHSConst && Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR) {
	    // If the RHS of the sub is a XOR with one use and a constant, invert the
	    // immediate. Then add one to the LHS of the sub so we can turn
	    // X-Y -> X+~Y+1, saving one register.
	    if (auto *XorConst = dyn_cast<ConstantSDNode>(Op1.getOperand(1))) {
	      APInt XorImm = XorConst->getAPIntValue();
	      EVT VT = Op0.getValueType();
	      SDValue InvertedImm = DAG.getConstant(~XorImm, SDLoc(Op1), VT);
	      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
	                                   Op1.getOperand(0), InvertedImm);
	      SDValue LHSPlusOne =
	          DAG.getConstant(LHSConst->getAPIntValue() + 1, SDLoc(N), VT);
	      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, LHSPlusOne);
	    }
	  }

	  // Try to synthesize horizontal subs from subs of shuffles.
	  EVT VT = N->getValueType(0);
	  const bool Is128BitHSub =
	      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
	  const bool Is256BitHSub =
	      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
	  // isHorizontalBinOp() is only consulted for the supported types and is
	  // handed Op0/Op1 themselves, which then feed the HSUB node.
	  if ((Is128BitHSub || Is256BitHSub) && isHorizontalBinOp(Op0, Op1, false))
	    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  // Try to create PSUBUS if SUB's argument is max/min
	  if (SDValue Subus = combineSubToSubus(N, DAG, Subtarget))
	    return Subus;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Combine handler for the vector sign/zero extension nodes that
	/// PerformDAGCombine routes here (X86ISD::VZEXT, X86ISD::VSEXT and the
	/// ISD::*_EXTEND_VECTOR_INREG opcodes).
	///
	/// Nothing is done while DCI.isBeforeLegalize() is true. Afterwards the
	/// following are tried in order:
	///  1. constant-fold the extension when the operand bits are known;
	///  2. (VZEXT only) collapse a vzext of a bitcast of another vzext;
	///  3. (VZEXT only) bypass a scalar_to_vector(extract_vector_elt(x, 0)).
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize())
	return SDValue();

	SDLoc DL(N);
	unsigned Opcode = N->getOpcode();
	MVT VT = N->getSimpleValueType(0);
	MVT SVT = VT.getVectorElementType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned EltSizeInBits = SVT.getSizeInBits();

	SDValue Op = N->getOperand(0);
	MVT OpVT = Op.getSimpleValueType();
	MVT OpEltVT = OpVT.getVectorElementType();
	unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
	// One operand element is consumed per result element, so this is the number
	// of operand bits the extension actually reads.
	unsigned InputBits = OpEltSizeInBits * NumElts;

	// Perform any constant folding.
	// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
	APInt Undefs(NumElts, 0);
	SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
	bool IsZEXT =
	(Opcode == X86ISD::VZEXT) \|\| (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
	for (unsigned i = 0; i != NumElts; ++i) {
	// Undef source elements stay undef in the result; their Vals entry keeps
	// its zero initializer.
	if (UndefElts[i]) {
	Undefs.setBit(i);
	continue;
	}
	Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
	: EltBits[i].sextOrTrunc(EltSizeInBits);
	}
	return getConstVector(Vals, Undefs, VT, DAG, DL);
	}

	// (vzext (bitcast (vzext (x)) -> (vzext x)
	// TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
	SDValue V = peekThroughBitcasts(Op);
	// V != Op means at least one bitcast was actually looked through.
	if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
	MVT InnerVT = V.getSimpleValueType();
	MVT InnerEltVT = InnerVT.getVectorElementType();

	// If the element sizes match exactly, we can just do one larger vzext. This
	// is always an exact type match as vzext operates on integer types.
	if (OpEltVT == InnerEltVT) {
	assert(OpVT == InnerVT && "Types must match for vzext!");
	return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
	}

	// The only other way we can combine them is if only a single element of the
	// inner vzext is used in the input to the outer vzext.
	if (InnerEltVT.getSizeInBits() < InputBits)
	return SDValue();

	// In this case, the inner vzext is completely dead because we're going to
	// only look at bits inside of the low element. Just do the outer vzext on
	// a bitcast of the input to the inner.
	// NOTE(review): the code below bitcasts V (the inner vzext itself), not
	// V.getOperand(0) as the comment above describes - confirm which is intended.
	return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
	}

	// Check if we can bypass extracting and re-inserting an element of an input
	// vector. Essentially:
	// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
	// TODO: Add X86ISD::VSEXT support
	if (Opcode == X86ISD::VZEXT &&
	V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
	SDValue ExtractedV = V.getOperand(0);
	SDValue OrigV = ExtractedV.getOperand(0);
	// Only element 0 of the original vector can be bypassed this way.
	if (isNullConstant(ExtractedV.getOperand(1))) {
	MVT OrigVT = OrigV.getSimpleValueType();
	// Extract a subvector if necessary...
	if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
	int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
	OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
	OrigVT.getVectorNumElements() / Ratio);
	OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
	DAG.getIntPtrConstant(0, DL));
	}
	Op = DAG.getBitcast(OpVT, OrigV);
	return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
	}
	}

	return SDValue();
	}

	/// Simplify an X86ISD::TESTM node.
	///
	/// Two folds are attempted:
	///  * when both operands are the very same AND value, test the AND's two
	///    inputs directly;
	///  * when either operand is an all-zeros build vector, the result is a
	///    zero vector.
	///
	/// \returns the simplified value, or an empty SDValue if neither fold fits.
	static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  MVT VT = N->getSimpleValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // TEST (AND a, b), (AND a, b) -> TEST a, b
	  const bool SameAndOnBothSides = LHS == RHS && RHS->getOpcode() == ISD::AND;
	  if (SameAndOnBothSides) {
	    SDValue A = LHS->getOperand(0);
	    SDValue B = LHS->getOperand(1);
	    return DAG.getNode(X86ISD::TESTM, DL, VT, A, B);
	  }

	  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
	  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
	  if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
	      ISD::isBuildVectorAllZeros(RHS.getNode()))
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  return SDValue();
	}

	/// Fold a vector compare whose two operands are the same value:
	/// PCMPEQ(x, x) becomes an all-ones vector and PCMPGT(x, x) becomes a zero
	/// vector.
	///
	/// \returns the folded constant vector, or an empty SDValue when the
	/// operands differ or the opcode is neither of the two above.
	static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  // Nothing to fold unless a value is compared against itself.
	  if (N->getOperand(0) != N->getOperand(1))
	    return SDValue();

	  SDLoc DL(N);
	  MVT VT = N->getSimpleValueType(0);

	  switch (N->getOpcode()) {
	  case X86ISD::PCMPEQ:
	    return getOnesVector(VT, DAG, DL);
	  case X86ISD::PCMPGT:
	    return getZeroVector(VT, Subtarget, DAG, DL);
	  default:
	    return SDValue();
	  }
	}

	/// Combine handler for ISD::INSERT_SUBVECTOR nodes.
	///
	/// Nothing is done while DCI.isBeforeLegalizeOps() is true, nor for vectors
	/// with i1 elements. Otherwise the following are tried in order:
	///  * simplifications of inserts into an all-zeros vector;
	///  * turning insert(extract_subvector) of a same-typed source into a shuffle;
	///  * for an insert into the upper half on top of an insert at index 0:
	///    merging two loads, forming X86ISD::SUBV_BROADCAST, rewriting onto a
	///    zero vector, or forcing the base vector to undef.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applied.
	static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	MVT OpVT = N->getSimpleValueType(0);

	// Early out for mask vectors.
	if (OpVT.getVectorElementType() == MVT::i1)
	return SDValue();

	SDLoc dl(N);
	SDValue Vec = N->getOperand(0);
	SDValue SubVec = N->getOperand(1);

	// Operand 2 is the insertion index, read here as a constant.
	unsigned IdxVal = N->getConstantOperandVal(2);
	MVT SubVecVT = SubVec.getSimpleValueType();

	if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	// Inserting zeros into zeros is a nop.
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	return Vec;

	// If we're inserting into a zero vector and then into a larger zero vector,
	// just insert into the larger zero vector directly.
	if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
	unsigned Idx2Val = SubVec.getConstantOperandVal(2);
	// The two insertion indices are summed to form the index into the outer
	// vector.
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
	SubVec.getOperand(1),
	DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
	}

	// If we're inserting a bitcast into zeros, rewrite the insert and move the
	// bitcast to the other side. This helps with detecting zero extending
	// during isel.
	// TODO: Is this useful for other indices than 0?
	if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
	MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
	// NewVT has the same total width as OpVT but uses the pre-bitcast
	// element type.
	unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
	MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
	SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
	DAG.getBitcast(NewVT, Vec),
	SubVec.getOperand(0), N->getOperand(2));
	return DAG.getBitcast(OpVT, Insert);
	}
	}

	// If this is an insert of an extract, combine to a shuffle. Don't do this
	// if the insert or extract can be represented with a subregister operation.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	SubVec.getOperand(0).getSimpleValueType() == OpVT &&
	(IdxVal != 0 \|\| !Vec.isUndef())) {
	int ExtIdxVal = SubVec.getConstantOperandVal(1);
	if (ExtIdxVal != 0) {
	int VecNumElts = OpVT.getVectorNumElements();
	int SubVecNumElts = SubVecVT.getVectorNumElements();
	SmallVector<int, 64> Mask(VecNumElts);
	// First create an identity shuffle mask.
	for (int i = 0; i != VecNumElts; ++i)
	Mask[i] = i;
	// Now insert the extracted portion.
	// Adding VecNumElts makes the index refer to the second shuffle input.
	for (int i = 0; i != SubVecNumElts; ++i)
	Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

	return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
	}
	}

	// Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
	// load:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr + 16), Elts/2)
	// --> load32 addr
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr + 32), Elts/2)
	// --> load64 addr
	// or a 16-byte or 32-byte broadcast:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr), Elts/2)
	// --> X86SubVBroadcast(load16 addr)
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr), Elts/2)
	// --> X86SubVBroadcast(load32 addr)
	if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
	Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
	auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
	if (Idx2 && Idx2->getZExtValue() == 0) {
	// SubVec2 is what the inner insert placed at index 0; SubVec is what this
	// node places in the upper half.
	SDValue SubVec2 = Vec.getOperand(1);
	// If needed, look through bitcasts to get to the load.
	if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
	bool Fast;
	unsigned Alignment = FirstLd->getAlignment();
	unsigned AS = FirstLd->getAddressSpace();
	const X86TargetLowering *TLI = Subtarget.getTargetLowering();
	// Only merge when the target reports the full-width access as both
	// allowed and fast.
	if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
	OpVT, AS, Alignment, &Fast) && Fast) {
	SDValue Ops[] = {SubVec2, SubVec};
	if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
	Subtarget, false))
	return Ld;
	}
	}
	// If lower/upper loads are the same and the only users of the load, then
	// lower to a VBROADCASTF128/VBROADCASTI128/etc.
	if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
	if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
	SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);

	// If this is subv_broadcast insert into both halves, use a larger
	// subv_broadcast.
	if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
	SubVec.getOperand(0));

	// If we're inserting all zeros into the upper half, change this to
	// an insert into an all zeros vector. We will match this to a move
	// with implicit upper bit zeroing during isel.
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
	Vec.getOperand(2));

	// If we are inserting into both halves of the vector, the starting
	// vector should be undef. If it isn't, make it so. Only do this if
	// the early insert has no other uses.
	// TODO: Should this be a generic DAG combine?
	if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
	SubVec2, Vec.getOperand(2));
	// Queue the rebuilt inner insert so the combiner visits it as well.
	DCI.AddToWorklist(Vec.getNode());
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
	N->getOperand(2));

	}
	}
	}

	return SDValue();
	}

	/// Combine handler for ISD::EXTRACT_SUBVECTOR nodes.
	///
	/// Once DCI.isBeforeLegalizeOps() is false, an extract from an all-zeros
	/// build vector, an all-ones build vector, or any other BUILD_VECTOR is
	/// replaced by a directly constructed vector of the result type.
	///
	/// \returns the replacement value, or an empty SDValue if the source is not
	/// one of the build-vector forms above.
	static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDLoc DL(N);
	  MVT ResultVT = N->getSimpleValueType(0);
	  SDValue Source = N->getOperand(0);
	  // Operand 1 is the (constant) index of the first extracted element.
	  unsigned FirstElt = N->getConstantOperandVal(1);
	  SDNode *SourceNode = Source.getNode();

	  // Any slice of zeros is zeros.
	  if (ISD::isBuildVectorAllZeros(SourceNode))
	    return getZeroVector(ResultVT, Subtarget, DAG, DL);

	  // Any slice of all-ones is all-ones; i1 element vectors are built through
	  // getConstant instead of getOnesVector.
	  if (ISD::isBuildVectorAllOnes(SourceNode)) {
	    const bool IsMaskVector = ResultVT.getScalarType() == MVT::i1;
	    if (IsMaskVector)
	      return DAG.getConstant(1, DL, ResultVT);
	    return getOnesVector(ResultVT, DAG, DL);
	  }

	  if (Source.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // Rebuild a smaller BUILD_VECTOR from the selected run of operands.
	  unsigned NumResultElts = ResultVT.getVectorNumElements();
	  return DAG.getBuildVector(ResultVT, DL,
	                            SourceNode->ops().slice(FirstElt, NumResultElts));
	}

	/// Dispatch \p N to the X86 combine routine registered for its opcode.
	///
	/// Each case forwards to one combine* helper and returns its result directly.
	/// Opcodes without a case fall out of the switch and yield an empty SDValue.
	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default: break;
	// Element and subvector extraction / insertion.
	case ISD::EXTRACT_VECTOR_ELT:
	case X86ISD::PEXTRW:
	case X86ISD::PEXTRB:
	return combineExtractVectorElt(N, DAG, DCI, Subtarget);
	case ISD::INSERT_SUBVECTOR:
	return combineInsertSubvector(N, DAG, DCI, Subtarget);
	case ISD::EXTRACT_SUBVECTOR:
	return combineExtractSubvector(N, DAG, DCI, Subtarget);
	case ISD::VSELECT:
	case ISD::SELECT:
	case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
	case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
	case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
	// Integer arithmetic, shifts and bitwise logic.
	case ISD::ADD: return combineAdd(N, DAG, Subtarget);
	case ISD::SUB: return combineSub(N, DAG, Subtarget);
	case X86ISD::SBB: return combineSBB(N, DAG);
	case X86ISD::ADC: return combineADC(N, DAG, DCI);
	case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
	case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
	case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
	case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
	// Loads and stores, including the masked forms.
	case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
	case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
	case ISD::STORE: return combineStore(N, DAG, Subtarget);
	case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
	// Conversions and floating-point operations.
	case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
	case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
	case ISD::FADD:
	case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
	case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
	case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
	case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
	case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
	case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
	case X86ISD::FXOR:
	case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
	case X86ISD::FMIN:
	case X86ISD::FMAX: return combineFMinFMax(N, DAG);
	case ISD::FMINNUM:
	case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
	case X86ISD::BT: return combineBT(N, DAG, DCI);
	// Scalar extensions, comparisons and branches.
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
	case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
	case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
	case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
	// Vector packs, immediate shifts, extensions and element inserts.
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	return combineVectorShiftImm(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case X86ISD::VSEXT:
	case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
	case X86ISD::SHUFP: // Handle all target specific shuffles
	case X86ISD::INSERTPS:
	case X86ISD::EXTRQI:
	case X86ISD::INSERTQI:
	case X86ISD::PALIGNR:
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	case X86ISD::BLENDI:
	case X86ISD::UNPCKH:
	case X86ISD::UNPCKL:
	case X86ISD::MOVHLPS:
	case X86ISD::MOVLHPS:
	case X86ISD::PSHUFB:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::MOVSHDUP:
	case X86ISD::MOVSLDUP:
	case X86ISD::MOVDDUP:
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	case X86ISD::VBROADCAST:
	case X86ISD::VPPERM:
	case X86ISD::VPERMI:
	case X86ISD::VPERMV:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMIV3:
	case X86ISD::VPERMIL2:
	case X86ISD::VPERMILPI:
	case X86ISD::VPERMILPV:
	case X86ISD::VPERM2X128:
	case X86ISD::VZEXT_MOVL:
	case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
	// Fused multiply-add families.
	case X86ISD::FMADD_RND:
	case X86ISD::FMADDS1_RND:
	case X86ISD::FMADDS3_RND:
	case X86ISD::FMADDS1:
	case X86ISD::FMADDS3:
	case X86ISD::FMADD4S:
	case ISD::FMA: return combineFMA(N, DAG, Subtarget);
	case X86ISD::FMADDSUB_RND:
	case X86ISD::FMSUBADD_RND:
	case X86ISD::FMADDSUB:
	case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
	case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
	// Gather/scatter, both the generic and the X86-specific opcodes.
	case X86ISD::MGATHER:
	case X86ISD::MSCATTER:
	case ISD::MGATHER:
	case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
	case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
	case X86ISD::PCMPEQ:
	case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
	}

	return SDValue();
	}

	/// Report whether \p VT is natively supported by the target and also a
	/// 'desirable' type for a node with opcode \p Opc. For example, i16 is legal
	/// on x86 but undesirable, because i16 instruction encodings are longer and
	/// some i16 instructions are slow.
	///
	/// \returns false for illegal types and for i16 with any of the opcodes
	/// listed below; true in every other case.
	bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
	  // A type the target cannot hold is never desirable.
	  if (!isTypeLegal(VT))
	    return false;

	  // i16 is the only legal type that gets special treatment.
	  const bool IsI16 = VT == MVT::i16;
	  if (!IsI16)
	    return true;

	  switch (Opc) {
	  // Opcodes for which i16 is considered undesirable.
	  case ISD::LOAD:
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SUB:
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    return false;
	  default:
	    return true;
	  }
	}

	/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
	/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
	/// we don't adjust the stack we clobber the first frame index.
	/// See X86InstrInfo::copyPhysReg.
	static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	return any_of(MRI.reg_instructions(X86::EFLAGS),
	[](const MachineInstr &RI) { return RI.isCopy(); });
	}

	/// Record on the frame info whether \p MF contains a copy of EFLAGS (see
	/// hasCopyImplyingStackAdjustment), then run the base-class finalization.
	void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
	  // The flag is only ever set here, never cleared.
	  if (hasCopyImplyingStackAdjustment(MF))
	    MF.getFrameInfo().setHasCopyImplyingStackAdjustment(true);

	  TargetLoweringBase::finalizeLowering(MF);
	}

	/// Query whether it is beneficial for the DAG combiner to promote the node
	/// \p Op. Only i16 operations are ever promoted, and promotion is declined
	/// when it could block folding a load or store into the operation.
	///
	/// \param Op   the node under consideration.
	/// \param PVT  set to MVT::i32 whenever the opcode switch below is left
	///             without an early return, even if the result is false.
	/// \returns true when promotion is desirable.
	bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
	  if (Op.getValueType() != MVT::i16)
	    return false;

	  bool ShouldPromote = false;
	  const unsigned Opc = Op.getOpcode();
	  switch (Opc) {
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    ShouldPromote = true;
	    break;

	  case ISD::SHL:
	  case ISD::SRL:
	    // Look out for (store (shl (load), x)).
	    if (MayFoldLoad(Op.getOperand(0)) && MayFoldIntoStore(Op))
	      return false;
	    ShouldPromote = true;
	    break;

	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  case ISD::SUB: {
	    // SUB is the only opcode in this group that is not commutative.
	    const bool IsCommutative = Opc != ISD::SUB;
	    SDValue LHS = Op.getOperand(0);
	    SDValue RHS = Op.getOperand(1);

	    if (!IsCommutative && MayFoldLoad(RHS))
	      return false;

	    // Avoid disabling potential load folding opportunities.
	    if (MayFoldLoad(LHS) &&
	        (!isa<ConstantSDNode>(RHS) || MayFoldIntoStore(Op)))
	      return false;
	    if (MayFoldLoad(RHS) &&
	        (!isa<ConstantSDNode>(LHS) || MayFoldIntoStore(Op)))
	      return false;

	    ShouldPromote = true;
	    break;
	  }

	  default:
	    break;
	  }

	  PVT = MVT::i32;
	  return ShouldPromote;
	}

	/// Decide whether a build vector should be combined into a shuffle followed
	/// by a truncate.
	///
	/// \param ShuffleMask  the shuffle mask; asserted to have one entry per
	///                     element of \p SrcVT and to be legal for it.
	/// \param SrcVT        the source vector type of the shuffle.
	/// \param TruncVT      not consulted by this implementation.
	/// \returns true only with AVX2, for element sizes other than 32 bits, and
	/// when the mask does not cross 128-bit lanes.
	bool X86TargetLowering::isDesirableToCombineBuildVectorToShuffleTruncate(
	    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
	  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
	         "Element count mismatch");
	  assert(
	      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
	      "Shuffle Mask expected to be legal");

	  // For 32-bit elements VPERMD is better than shuffle+truncate.
	  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
	  const bool Has32BitElts = SrcVT.getScalarSizeInBits() == 32;
	  if (Has32BitElts || !Subtarget.hasAVX2())
	    return false;

	  // Desirable exactly when the mask stays within 128-bit lanes.
	  return !is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask);
	}

	//===----------------------------------------------------------------------===//
	// X86 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Match S against a sequence of tokens separated by spaces and/or tabs.
	//
	// Leading blanks in S are skipped. Every entry of Pieces must then appear in
	// order, each followed either by at least one blank or by the end of the
	// string; a piece that is merely a prefix of a longer word is rejected.
	// Returns true only when nothing is left of S after the last piece.
	static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
	  const char *const Blanks = " \t";

	  S = S.substr(S.find_first_not_of(Blanks)); // Skip leading whitespace.

	  for (StringRef Piece : Pieces) {
	    // The remaining text has to begin with this piece; strip it when it does.
	    if (!S.consume_front(Piece))
	      return false;

	    // A non-blank character right after the piece means only a prefix of a
	    // longer word matched.
	    StringRef::size_type FirstNonBlank = S.find_first_not_of(Blanks);
	    if (FirstNonBlank == 0)
	      return false;

	    S = S.substr(FirstNonBlank);
	  }

	  return S.empty();
	}

	static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

	if (AsmPieces.size() == 3 \|\| AsmPieces.size() == 4) {
	if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

	if (AsmPieces.size() == 3)
	return true;
	else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
	return true;
	}
	}
	return false;
	}

	/// Try to recognize an inline-asm byte-swap idiom in the call \p CI and hand
	/// it to IntrinsicLowering::LowerToByteSwap.
	///
	/// Only calls whose result is an integer type with a bit width that is a
	/// multiple of 16 are considered. The asm string is split on ';' and newline
	/// and matched against one-statement and three-statement patterns.
	///
	/// \returns the result of LowerToByteSwap when a pattern matched, otherwise
	/// false.
	bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
	InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

	const std::string &AsmStr = IA->getAsmString();

	IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
	if (!Ty \|\| Ty->getBitWidth() % 16 != 0)
	return false;

	// TODO: should remove alternatives from the asmstring: "foo {a\|b}" -> "foo a"
	SmallVector<StringRef, 4> AsmPieces;
	SplitString(AsmStr, AsmPieces, ";\n");

	switch (AsmPieces.size()) {
	default: return false;
	case 1:
	// FIXME: this should verify that we are targeting a 486 or better. If not,
	// we will turn this bswap into something that will be lowered to logical
	// ops instead of emitting the bswap asm. For now, we don't support 486 or
	// lower so don't worry about this.
	// bswap $0
	if (matchAsm(AsmPieces[0], {"bswap", "$0"}) \|\|
	matchAsm(AsmPieces[0], {"bswapl", "$0"}) \|\|
	matchAsm(AsmPieces[0], {"bswapq", "$0"}) \|\|
	matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) \|\|
	matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) \|\|
	matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
	// No need to check constraints, nothing other than the equivalent of
	// "=r,0" would be valid here.
	return IntrinsicLowering::LowerToByteSwap(CI);
	}

	// rorw $$8, ${0:w} --> llvm.bswap.i16
	if (CI->getType()->isIntegerTy(16) &&
	IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) \|\|
	matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
	// AsmPieces is reused from here on: it now holds the comma-separated
	// constraints that follow the leading "=r,0,", sorted before the check.
	AsmPieces.clear();
	StringRef ConstraintsStr = IA->getConstraintString();
	SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
	array_pod_sort(AsmPieces.begin(), AsmPieces.end());
	if (clobbersFlagRegisters(AsmPieces))
	return IntrinsicLowering::LowerToByteSwap(CI);
	}
	break;
	case 3:
	// rorw $$8, ${0:w} ; rorl $$16, $0 ; rorw $$8, ${0:w} on a 32-bit result.
	if (CI->getType()->isIntegerTy(32) &&
	IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
	matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
	matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
	// As above, AsmPieces is reused for the trailing constraint list.
	AsmPieces.clear();
	StringRef ConstraintsStr = IA->getConstraintString();
	SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
	array_pod_sort(AsmPieces.begin(), AsmPieces.end());
	if (clobbersFlagRegisters(AsmPieces))
	return IntrinsicLowering::LowerToByteSwap(CI);
	}

	if (CI->getType()->isIntegerTy(64)) {
	InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
	// The first two constraints must be exactly "A" and "0".
	if (Constraints.size() >= 2 &&
	Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
	Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
	// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
	if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
	matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
	matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
	return IntrinsicLowering::LowerToByteSwap(CI);
	}
	}
	break;
	}
	return false;
	}

	/// Classify an inline-asm constraint string for this target.
	///
	/// One-letter constraints and the two-letter 'Y?' family known here are
	/// mapped to C_RegisterClass, C_Register or C_Other; anything else is left
	/// to TargetLowering::getConstraintType.
	X86TargetLowering::ConstraintType
	X86TargetLowering::getConstraintType(StringRef Constraint) const {
	  const size_t Len = Constraint.size();

	  if (Len == 1) {
	    switch (Constraint[0]) {
	    // Constraints naming a class of registers.
	    case 'R':
	    case 'q':
	    case 'Q':
	    case 'f':
	    case 't':
	    case 'u':
	    case 'y':
	    case 'x':
	    case 'v':
	    case 'Y':
	    case 'l':
	    case 'k': // AVX512 masking registers.
	      return C_RegisterClass;

	    // Constraints classified as a single register.
	    case 'a':
	    case 'b':
	    case 'c':
	    case 'd':
	    case 'S':
	    case 'D':
	    case 'A':
	      return C_Register;

	    // Constraints classified as C_Other.
	    case 'I':
	    case 'J':
	    case 'K':
	    case 'L':
	    case 'M':
	    case 'N':
	    case 'G':
	    case 'C':
	    case 'e':
	    case 'Z':
	      return C_Other;

	    default:
	      break;
	    }
	  } else if (Len == 2 && Constraint[0] == 'Y') {
	    switch (Constraint[1]) {
	    case 'z':
	    case '0':
	      return C_Register;
	    case 'i':
	    case 'm':
	    case 'k':
	    case 't':
	    case '2':
	      return C_RegisterClass;
	    default:
	      break;
	    }
	  }

	  // Not recognized here: defer to the generic implementation.
	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// \param info        operand description; only info.CallOperandVal is read
	///                    directly in this function.
	/// \param constraint  NUL-terminated constraint string; the first character
	///                    selects the case, and 'Y' also inspects the second.
	/// \returns CW_Default when there is no operand value; otherwise the weight
	/// chosen by the matching case, which stays CW_Invalid when the operand does
	/// not satisfy that case's check.
	TargetLowering::ConstraintWeight
	X86TargetLowering::getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const {
	ConstraintWeight weight = CW_Invalid;
	Value *CallOperandVal = info.CallOperandVal;
	// If we don't have a value, we can't do a match,
	// but allow it at the lowest weight.
	if (!CallOperandVal)
	return CW_Default;
	Type *type = CallOperandVal->getType();
	// Look at the constraint type.
	switch (*constraint) {
	default:
	weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	// Deliberate fallthrough: for an integer operand the base-class weight is
	// then overwritten with CW_SpecificReg by the shared code below.
	LLVM_FALLTHROUGH;
	case 'R':
	case 'q':
	case 'Q':
	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'S':
	case 'D':
	case 'A':
	if (CallOperandVal->getType()->isIntegerTy())
	weight = CW_SpecificReg;
	break;
	case 'f':
	case 't':
	case 'u':
	if (type->isFloatingPointTy())
	weight = CW_SpecificReg;
	break;
	case 'y':
	if (type->isX86_MMXTy() && Subtarget.hasMMX())
	weight = CW_SpecificReg;
	break;
	case 'Y': {
	unsigned Size = StringRef(constraint).size();
	// Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
	char NextChar = Size == 2 ? constraint[1] : 'i';
	// Constraints longer than two characters are not handled: the weight stays
	// CW_Invalid.
	if (Size > 2)
	break;
	switch (NextChar) {
	default:
	return CW_Invalid;
	// XMM0
	case 'z':
	case '0':
	if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
	return CW_SpecificReg;
	return CW_Invalid;
	// Conditional OpMask regs (AVX512)
	case 'k':
	if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	return CW_Register;
	return CW_Invalid;
	// Any MMX reg
	case 'm':
	// NOTE(review): nothing assigns `weight` on the path into this case, so it
	// is still CW_Invalid and both outcomes of this check return CW_Invalid -
	// confirm whether a register weight was intended here.
	if (type->isX86_MMXTy() && Subtarget.hasMMX())
	return weight;
	return CW_Invalid;
	// Any SSE reg when ISA >= SSE2, same as 'Y'
	case 'i':
	case 't':
	case '2':
	if (!Subtarget.hasSSE2())
	return CW_Invalid;
	break;
	}
	// Fall through (handle "Y" constraint).
	LLVM_FALLTHROUGH;
	}
	case 'v':
	if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
	weight = CW_Register;
	// 'v' (and 'Y' via the fallthrough above) also accepts everything 'x' does.
	LLVM_FALLTHROUGH;
	case 'x':
	if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) \|\|
	((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
	weight = CW_Register;
	break;
	case 'k':
	// Enable conditional vector operations using %k<#> registers.
	if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	weight = CW_Register;
	break;
	// The remaining cases accept constant operands only; each checks the value
	// against the range or set shown in its condition.
	case 'I':
	if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
	if (C->getZExtValue() <= 31)
	weight = CW_Constant;
	}
	break;
	case 'J':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 63)
	weight = CW_Constant;
	}
	break;
	case 'K':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
	weight = CW_Constant;
	}
	break;
	case 'L':
	// NOTE(review): LowerAsmOperandForConstraint additionally accepts
	// 0xffffffff for 'L' on 64-bit targets; this check does not - confirm
	// whether the two should agree.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getZExtValue() == 0xff) \|\| (C->getZExtValue() == 0xffff))
	weight = CW_Constant;
	}
	break;
	case 'M':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 3)
	weight = CW_Constant;
	}
	break;
	case 'N':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 0xff)
	weight = CW_Constant;
	}
	break;
	case 'G':
	case 'C':
	if (isa<ConstantFP>(CallOperandVal)) {
	weight = CW_Constant;
	}
	break;
	case 'e':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getSExtValue() >= -0x80000000LL) &&
	(C->getSExtValue() <= 0x7fffffffLL))
	weight = CW_Constant;
	}
	break;
	case 'Z':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 0xffffffff)
	weight = CW_Constant;
	}
	break;
	}
	return weight;
	}

	/// Replace an 'X' constraint (which matches anything) with a more specific
	/// one chosen from the type of the corresponding operand.
	///
	/// \returns "Y" for floating-point types when SSE2 is available, "x" when
	/// only SSE1 is; otherwise whatever TargetLowering::LowerXConstraint picks.
	const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
	  // 'f' like normal targets.
	  const bool IsFP = ConstraintVT.isFloatingPoint();
	  if (IsFP && Subtarget.hasSSE2())
	    return "Y";
	  if (IsFP && Subtarget.hasSSE1())
	    return "x";

	  return TargetLowering::LowerXConstraint(ConstraintVT);
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	///
	/// Only single-character constraints are handled. Within the switch, `break`
	/// means "Result is ready (or the letter is not handled here)" and `return`
	/// means "operand rejected, leave Ops untouched". Letters without a case are
	/// forwarded to TargetLowering::LowerAsmOperandForConstraint.
	void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue>&Ops,
	SelectionDAG &DAG) const {
	SDValue Result;

	// Only support length 1 constraints for now.
	if (Constraint.length() > 1) return;

	char ConstraintLetter = Constraint[0];
	switch (ConstraintLetter) {
	default: break;
	// 'I': constant in the range 0..31.
	case 'I':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 31) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'J': constant in the range 0..63.
	case 'J':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 63) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'K': constant that fits in a signed 8-bit integer.
	case 'K':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (isInt<8>(C->getSExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'L': 0xff or 0xffff, plus 0xffffffff on 64-bit subtargets.
	case 'L':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() == 0xff \|\| C->getZExtValue() == 0xffff \|\|
	(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'M': constant in the range 0..3.
	case 'M':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 3) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'N': constant in the range 0..255.
	case 'N':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 255) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'O': constant in the range 0..127.
	// NOTE(review): 'O' is lowered here but has no case in getConstraintType's
	// one-letter switch - confirm whether that is intentional.
	case 'O':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 127) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'e': {
	// 32-bit signed value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getSExtValue())) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	}
	return;
	}
	case 'Z': {
	// 32-bit unsigned value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getZExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	return;
	}
	case 'i': {
	// Literal immediates are always ok.
	if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}

	// In any sort of PIC mode addresses need to be computed at runtime by
	// adding in a register or some sort of table lookup. These can't
	// be used as immediates.
	if (Subtarget.isPICStyleGOT() \|\| Subtarget.isPICStyleStubPIC())
	return;

	// If we are in non-pic codegen mode, we allow the address of a global (with
	// an optional displacement) to be used with 'i'.
	GlobalAddressSDNode *GA = nullptr;
	int64_t Offset = 0;

	// Match either (GA), (GA+C), (GA+C1+C2), etc.
	// Each iteration peels one constant ADD/SUB off Op and folds it into Offset;
	// the loop ends with `break` on reaching the global address, or the whole
	// function returns when the expression has any other shape.
	while (1) {
	if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
	Offset += GA->getOffset();
	break;
	} else if (Op.getOpcode() == ISD::ADD) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	} else if (Op.getOpcode() == ISD::SUB) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += -C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	}

	// Otherwise, this isn't something we can handle, reject it.
	return;
	}

	const GlobalValue *GV = GA->getGlobal();
	// If we require an extra load to get this address, as in PIC mode, we
	// can't accept it.
	if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
	return;

	Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
	GA->getValueType(0), Offset);
	break;
	}
	}

	// A case above produced a lowered operand: record it and stop.
	if (Result.getNode()) {
	Ops.push_back(Result);
	return;
	}
	return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Check if \p RC is a general purpose register class.
	/// I.e., GR* or one of their variant.
	static bool isGRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::GR8RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR16RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR32RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR64RegClass) \|\|
	RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
	}

	/// Check if \p RC is a vector register class.
	/// I.e., FR* / VR* or one of their variant.
	static bool isFRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::FR32XRegClass) \|\|
	RC.hasSuperClassEq(&X86::FR64XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR128XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR256XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR512RegClass);
	}

	/// Resolve the inline-asm register constraint \p Constraint for a value of
	/// type \p VT into a (register, register class) pair.
	///
	/// The lookup proceeds in three stages:
	///  1. Single-letter GCC constraints and the two-letter "Y?" constraints are
	///     mapped straight to an X86 register class (paired with register number
	///     0, except "Yz"/"Y0" which name XMM0).
	///  2. Everything else is handed to
	///     TargetLowering::getRegForInlineAsmConstraint. If that finds nothing,
	///     "{st(N)}", "{st}", "{flags}" and "A" are recognised by hand.
	///  3. If the generic lookup found a class that is not legal for \p VT, GR*
	///     and FR*/VR* classes are replaced by a class matching \p VT.
	///
	/// \param TRI        Register info used for the type-legality queries.
	/// \param Constraint Constraint string, e.g. "r", "Yk" or "{ax}".
	/// \param VT         Type of the operand; MVT::Other denotes a clobber.
	/// \returns The chosen pair; a null register class means no register was
	///          found or the type did not match.
	std::pair<unsigned, const TargetRegisterClass *>
	X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to an LLVM
	// register class.
	if (Constraint.size() == 1) {
	// GCC Constraint Letters
	switch (Constraint[0]) {
	default: break;
	// TODO: Slight differences here in allocation order and leaving
	// RIP in the class. Do they matter any more here than they do
	// in the normal allocation?
	case 'k':
	// VK* classes, selected by the width of VT.
	if (Subtarget.hasAVX512()) {
	// Only supported in AVX512 or later.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32RegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16RegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8RegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1RegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64RegClass);
	}
	}
	break;
	case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
	if (Subtarget.is64Bit()) {
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i64 \|\| VT == MVT::f64)
	return std::make_pair(0U, &X86::GR64RegClass);
	// Any other type in 64-bit mode is left to the generic lookup below.
	break;
	}
	LLVM_FALLTHROUGH;
	// 32-bit fallthrough
	case 'Q': // Q_REGS
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32_ABCDRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_ABCDRegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::GR64_ABCDRegClass);
	break;
	case 'r': // GENERAL_REGS
	case 'l': // INDEX_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	// Outside 64-bit mode every remaining type is given GR32.
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32RegClass);
	return std::make_pair(0U, &X86::GR64RegClass);
	case 'R': // LEGACY_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_NOREXRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_NOREXRegClass);
	// Outside 64-bit mode every remaining type is given GR32_NOREX.
	if (VT == MVT::i32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_NOREXRegClass);
	return std::make_pair(0U, &X86::GR64_NOREXRegClass);
	case 'f': // FP Stack registers.
	// If SSE is enabled for this VT, use f80 to ensure the isel moves the
	// value to the correct fpstack register class.
	if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP32RegClass);
	if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP64RegClass);
	return std::make_pair(0U, &X86::RFP80RegClass);
	case 'y': // MMX_REGS if MMX allowed.
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'Y': // SSE_REGS if SSE2 allowed
	if (!Subtarget.hasSSE2()) break;
	LLVM_FALLTHROUGH;
	case 'v': // Like 'x', but may select the *X classes (see VConstraint).
	case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
	if (!Subtarget.hasSSE1()) break;
	bool VConstraint = (Constraint[0] == 'v');

	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR32XRegClass);
	return std::make_pair(0U, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR64XRegClass);
	return std::make_pair(0U, &X86::FR64RegClass);
	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	// Vector types.
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR256XRegClass);
	return std::make_pair(0U, &X86::VR256RegClass);
	// 512-bit vector types.
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	return std::make_pair(0U, &X86::VR512RegClass);
	}
	break;
	}
	} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
	// Two-letter constraints of the form "Y?".
	switch (Constraint[1]) {
	default:
	break;
	case 'i':
	case 't':
	case '2':
	// Handled exactly like a plain "Y" constraint.
	return getRegForInlineAsmConstraint(TRI, "Y", VT);
	case 'm':
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'z':
	case '0':
	// Names the physical register XMM0 rather than just a class.
	if (!Subtarget.hasSSE1()) break;
	return std::make_pair(X86::XMM0, &X86::VR128RegClass);
	case 'k':
	// This register class doesn't allocate k0 for masked vector operation.
	if (Subtarget.hasAVX512()) { // Only supported in AVX512.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32WMRegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16WMRegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8WMRegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1WMRegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64WMRegClass);
	}
	}
	break;
	}
	}

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass*> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Map "{st(0)}" .. "{st(7)}" to FP0 .. FP7 ("st" is matched
	// case-insensitively).
	if (Constraint.size() == 7 && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 's' &&
	tolower(Constraint[2]) == 't' &&
	Constraint[3] == '(' &&
	(Constraint[4] >= '0' && Constraint[4] <= '7') &&
	Constraint[5] == ')' &&
	Constraint[6] == '}') {

	Res.first = X86::FP0+Constraint[4]-'0';
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// GCC allows "st(0)" to be called just plain "st".
	if (StringRef("{st}").equals_lower(Constraint)) {
	Res.first = X86::FP0;
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// flags -> EFLAGS
	if (StringRef("{flags}").equals_lower(Constraint)) {
	Res.first = X86::EFLAGS;
	Res.second = &X86::CCRRegClass;
	return Res;
	}

	// 'A' means [ER]AX + [ER]DX.
	if (Constraint == "A") {
	if (Subtarget.is64Bit()) {
	Res.first = X86::RAX;
	Res.second = &X86::GR64_ADRegClass;
	} else {
	assert((Subtarget.is32Bit() \|\| Subtarget.is16Bit()) &&
	"Expecting 64, 32 or 16 bit subtarget");
	Res.first = X86::EAX;
	Res.second = &X86::GR32_ADRegClass;
	}
	return Res;
	}
	// Nothing matched: Res.second is still null here.
	return Res;
	}

	// Otherwise, check to see if this is a register class of the wrong value
	// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
	// turn into {ax},{dx}.
	// MVT::Other is used to specify clobber names.
	if (TRI->isTypeLegalForClass(*Res.second, VT) \|\| VT == MVT::Other)
	return Res; // Correct type already, nothing to do.

	// Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
	// return "eax". This should even work for things like getting 64bit integer
	// registers when given an f64 type.
	const TargetRegisterClass *Class = Res.second;
	// The generic code will match the first register class that contains the
	// given register. Thus, based on the ordering of the tablegened file,
	// the "plain" GR classes might not come first.
	// Therefore, use a helper method.
	if (isGRClass(*Class)) {
	unsigned Size = VT.getSizeInBits();
	// Treat i1 as an 8-bit value for the sub/super register lookup.
	if (Size == 1) Size = 8;
	unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
	if (DestReg > 0) {
	// Pick the class by size; outside 64-bit mode the NOREX variants are used
	// for 8/16/32-bit registers.
	bool is64Bit = Subtarget.is64Bit();
	const TargetRegisterClass *RC =
	Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
	: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
	: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
	: &X86::GR64RegClass;
	// NOTE(review): when RC does not contain DestReg, Res is left exactly as
	// the generic lookup returned it (a class not legal for VT) - confirm this
	// is intended rather than reporting a mismatch.
	if (RC->contains(DestReg))
	Res = std::make_pair(DestReg, RC);
	} else {
	// No register found/type mismatch.
	Res.first = 0;
	Res.second = nullptr;
	}
	} else if (isFRClass(*Class)) {
	// Handle references to XMM physical registers that got mapped into the
	// wrong class. This can happen with constraints like {xmm0} where the
	// target independent register mapper will just pick the first match it can
	// find, ignoring the required type.

	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	Res.second = &X86::FR32RegClass;
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	Res.second = &X86::FR64RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
	Res.second = &X86::VR128RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
	Res.second = &X86::VR256RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
	Res.second = &X86::VR512RegClass;
	else {
	// Type mismatch and not a clobber: return an error.
	Res.first = 0;
	Res.second = nullptr;
	}
	}

	// Classes that are neither GR* nor FR*/VR* are returned unchanged.
	return Res;
	}

	/// Cost of using the scaling factor of addressing mode \p AM for an access of
	/// type \p Ty in address space \p AS.
	///
	/// \returns -1 when \p AM is not a legal addressing mode, 0 when it is legal
	///          and uses no scale, and 1 when it is legal and AM.Scale is
	///          non-zero.
	int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                            const AddrMode &AM, Type *Ty,
	                                            unsigned AS) const {
	  // Why a scale is charged at all:
	  // an indexed folded instruction, i.e. inst (reg1, reg2, scale), takes 2
	  // allocations in the out of order engine instead of the single one needed
	  // for the plain form inst (reg1). For example
	  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
	  // needs two allocations (one for the load, one for the computation), while
	  //   vaddps (%rsi), %ymm0, %ymm1
	  // needs just one, leaving allocations free for other operations and
	  // executing fewer micro operations.
	  //
	  // On some X86 architectures it is worse still: for stores, the complex
	  // addressing mode forces the instruction onto the "load" ports instead of
	  // the dedicated "store" port. E.g., on Haswell:
	  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
	  //   vmovaps %ymm1, (%r8)       can use port 2, 3, or 7.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scale stands for reg2 * scale, so charge 1 as soon as a second register
	  // is involved.
	  return AM.Scale ? 1 : 0;
	}

	/// Report whether an integer division of type \p VT should be kept as a
	/// division: true only for non-vector types in a function whose attribute
	/// list \p Attr carries MinSize.
	bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // x86 has no vector integer division, so a vector division left as-is would
	  // have to be scalarized; that loses even on size, whereas the alternative
	  // sequence can stay in vector form.
	  if (VT.isVector())
	    return false;

	  // Scalar integer division is expensive on x86, but a div instruction is
	  // usually smaller than the alternative sequence, so it is preferred when
	  // aggressively optimizing for code size.
	  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
	}

	/// Flag the function that owns \p Entry as using split CSR by setting
	/// IsSplitCSR in its X86MachineFunctionInfo. Only done on 64-bit subtargets;
	/// otherwise this is a no-op.
	void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.is64Bit()) {
	    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
	    FuncInfo->setIsSplitCSR(true);
	  }
	}

	/// Insert the COPY instructions that save and restore the callee-saved
	/// registers reported by getCalleeSavedRegsViaCopy() for the function that
	/// owns \p Entry.
	///
	/// For each register in that list a fresh virtual register is created. The
	/// physical register is marked live-in to \p Entry and copied into the
	/// virtual register at the start of \p Entry, then copied back right before
	/// the first terminator of every block in \p Exits. Does nothing when no
	/// list is reported. Only GR64 registers are expected.
	void X86TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // The cursor has to be a pointer into the list: it is dereferenced to read
	  // each register and the walk stops at the terminating 0 entry.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (X86::GR64RegClass.contains(*I))
	      RC = &X86::GR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Swifterror support is reported exactly when the subtarget is 64-bit.
	bool X86TargetLowering::supportSwiftError() const {
	  const bool Is64Bit = Subtarget.is64Bit();
	  return Is64Bit;
	}

	/// Name of the symbol to call for stack probing in \p MF, or the empty string
	/// when no stack probe should be emitted.
	StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
	  // An explicit "probe-stack" function attribute always wins; its value is the
	  // symbol name.
	  const auto &Fn = MF.getFunction();
	  if (Fn.hasFnAttribute("probe-stack"))
	    return Fn.getFnAttribute("probe-stack").getValueAsString();

	  // Generally, outside of Windows the platform ABI has no support for stack
	  // probes, so none are emitted. The same goes for MachO targets.
	  if (!Subtarget.isOSWindows())
	    return "";
	  if (Subtarget.isTargetMachO())
	    return "";

	  // The Windows ABI requires a stack probe; the symbol depends on the bitness
	  // and on whether this is a Cygwin/MinGW target.
	  const bool IsCygMing = Subtarget.isTargetCygMing();
	  if (Subtarget.is64Bit())
	    return IsCygMing ? "___chkstk_ms" : "__chkstk";
	  return IsCygMing ? "_alloca" : "_chkstk";
	}
	Index: vendor/llvm/dist-release_60/lib/Target/X86/X86TargetTransformInfo.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Target/X86/X86TargetTransformInfo.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Target/X86/X86TargetTransformInfo.cpp (revision 328362)
	@@ -1,2860 +1,2861 @@
	//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	/// This file implements a TargetTransformInfo analysis pass specific to the
	/// X86 target machine. It uses the target's detailed information to provide
	/// more precise answers to certain TTI queries, while letting the target
	/// independent and default TTI implementations handle the rest.
	///
	//===----------------------------------------------------------------------===//
	/// About Cost Model numbers used below it's necessary to say the following:
	/// the numbers correspond to some "generic" X86 CPU instead of usage of
	/// concrete CPU model. Usually the numbers correspond to CPU where the feature
	/// appeared for the first time. For example, if we do Subtarget.hasSSE42() in
	/// the lookups below the cost is based on Nehalem as that was the first CPU
	/// to support that feature level and thus has most likely the worst case cost.
	/// Some examples of other technologies/CPUs:
	/// SSE 3 - Pentium4 / Athlon64
	/// SSE 4.1 - Penryn
	/// SSE 4.2 - Nehalem
	/// AVX - Sandy Bridge
	/// AVX2 - Haswell
	/// AVX-512 - Xeon Phi / Skylake
	/// And some examples of instruction target dependent costs (latency)
	/// divss sqrtss rsqrtss
	/// AMD K7 11-16 19 3
	/// Piledriver 9-24 13-15 5
	/// Jaguar 14 16 2
	/// Pentium II,III 18 30 2
	/// Nehalem 7-14 7-18 3
	/// Haswell 10-13 11 5
	/// TODO: Develop and implement the target dependent cost model and
	/// specialize cost numbers for different Cost Model Targets such as throughput,
	/// code size, latency and uop count.
	//===----------------------------------------------------------------------===//

	#include "X86TargetTransformInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/CodeGen/BasicTTIImpl.h"
	#include "llvm/CodeGen/CostTable.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/Support/Debug.h"

	using namespace llvm;

	#define DEBUG_TYPE "x86tti"

	//===----------------------------------------------------------------------===//
	//
	// X86 cost model.
	//
	//===----------------------------------------------------------------------===//

	/// Report how population count is supported for integers of \p TyWidth bits
	/// (which must be a power of two): fast hardware when the subtarget has
	/// POPCNT, software otherwise.
	TargetTransformInfo::PopcntSupportKind
	X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
	  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");

	  // TODO: Currently the __builtin_popcount() implementation using SSE3
	  // instructions is inefficient. Once the problem is fixed, we should
	  // call ST->hasSSE3() instead of ST->hasPOPCNT().
	  if (ST->hasPOPCNT())
	    return TTI::PSK_FastHardware;
	  return TTI::PSK_Software;
	}

	/// Size in bytes of the data cache at \p Level: 32 KByte for L1D and
	/// 256 KByte for L2D.
	llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
	    TargetTransformInfo::CacheLevel Level) const {
	  // Both figures are the ones listed for:
	  //   Penryn, Nehalem, Westmere, Sandy Bridge, Ivy Bridge, Haswell,
	  //   Broadwell, Skylake, Kabylake.
	  const unsigned KByte = 1024;

	  switch (Level) {
	  case TargetTransformInfo::CacheLevel::L1D:
	    return 32 * KByte;
	  case TargetTransformInfo::CacheLevel::L2D:
	    return 256 * KByte;
	  }

	  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
	}

	/// Associativity of the data cache at \p Level; 8 for both L1D and L2D.
	llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
	    TargetTransformInfo::CacheLevel Level) const {
	  // The value is the one listed for:
	  //   Penryn, Nehalem, Westmere, Sandy Bridge, Ivy Bridge, Haswell,
	  //   Broadwell, Skylake, Kabylake.
	  switch (Level) {
	  case TargetTransformInfo::CacheLevel::L1D:
	  case TargetTransformInfo::CacheLevel::L2D:
	    return 8;
	  }

	  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
	}

	/// Number of registers reported to the cost model. \p Vector selects the
	/// vector register count instead of the scalar one.
	unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
	  // Without SSE1 no vector registers are reported.
	  if (Vector && !ST->hasSSE1())
	    return 0;

	  // Outside 64-bit mode the answer is always 8.
	  if (!ST->is64Bit())
	    return 8;

	  // 64-bit mode: 32 vector registers with AVX-512, 16 in every other case.
	  return (Vector && ST->hasAVX512()) ? 32 : 16;
	}

	/// Width in bits of a register. For scalars this is 64 or 32 depending on
	/// the subtarget mode; for vectors it is the widest of 512 (AVX-512),
	/// 256 (AVX) and 128 (SSE1) that the subtarget has, or 0 if none.
	unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
	  if (!Vector)
	    return ST->is64Bit() ? 64 : 32;

	  // Check the widest vector extension first.
	  if (ST->hasAVX512())
	    return 512;
	  if (ST->hasAVX())
	    return 256;
	  if (ST->hasSSE1())
	    return 128;
	  return 0;
	}

	/// Register width used for vectorized loads and stores; the unnamed argument
	/// is ignored and the vector register width is returned.
	unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
	  const bool Vector = true;
	  return getRegisterBitWidth(Vector);
	}

	/// Maximum interleave factor for a loop vectorized with factor \p VF:
	/// 1 for unvectorized loops and on Atom, 4 with AVX, 2 otherwise.
	unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
	  // A loop that will not be vectorized is not interleaved either; the regular
	  // unroller handles it, which saves the overflow check and memory check
	  // cost.
	  const bool NotVectorized = VF == 1;
	  if (NotVectorized)
	    return 1;

	  if (ST->isAtom())
	    return 1;

	  // Sandybridge and Haswell have multiple execution ports and pipelined
	  // vector units, hence the larger factor with AVX.
	  return ST->hasAVX() ? 4 : 2;
	}

	int X86TTIImpl::getArithmeticInstrCost(
	unsigned Opcode, Type *Ty,
	TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
	TTI::OperandValueProperties Opd1PropInfo,
	TTI::OperandValueProperties Opd2PropInfo,
	ArrayRef<const Value *> Args) {
	// Legalize the type.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

	int ISD = TLI->InstructionOpcodeToISD(Opcode);
	assert(ISD && "Invalid opcode");

	static const CostTblEntry SLMCostTable[] = {
	{ ISD::MUL, MVT::v4i32, 11 }, // pmulld
	{ ISD::MUL, MVT::v8i16, 2 }, // pmullw
	{ ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
	{ ISD::FMUL, MVT::f64, 2 }, // mulsd
	{ ISD::FMUL, MVT::v2f64, 4 }, // mulpd
	{ ISD::FMUL, MVT::v4f32, 2 }, // mulps
	{ ISD::FDIV, MVT::f32, 17 }, // divss
	{ ISD::FDIV, MVT::v4f32, 39 }, // divps
	{ ISD::FDIV, MVT::f64, 32 }, // divsd
	{ ISD::FDIV, MVT::v2f64, 69 }, // divpd
	{ ISD::FADD, MVT::v2f64, 2 }, // addpd
	{ ISD::FSUB, MVT::v2f64, 2 }, // subpd
	// v2i64/v4i64 mul is custom lowered as a series of long:
	// multiplies(3), shifts(3) and adds(2)
	// slm muldq version throughput is 2 and addq throughput 4
	// thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
	// 3X4 (addq throughput) = 17
	{ ISD::MUL, MVT::v2i64, 17 },
	// slm addq\subq throughput is 4
	{ ISD::ADD, MVT::v2i64, 4 },
	{ ISD::SUB, MVT::v2i64, 4 },
	};

	if (ST->isSLM()) {
	if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
	// Check if the operands can be shrinked into a smaller datatype.
	bool Op1Signed = false;
	unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
	bool Op2Signed = false;
	unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

	bool signedMode = Op1Signed \| Op2Signed;
	unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

	if (OpMinSize <= 7)
	return LT.first * 3; // pmullw/sext
	if (!signedMode && OpMinSize <= 8)
	return LT.first * 3; // pmullw/zext
	if (OpMinSize <= 15)
	return LT.first * 5; // pmullw/pmulhw/pshuf
	if (!signedMode && OpMinSize <= 16)
	return LT.first * 5; // pmullw/pmulhw/pshuf
	}
	if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
	LT.second)) {
	return LT.first * Entry->Cost;
	}
	}

	if (ISD == ISD::SDIV &&
	Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
	Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
	// On X86, vector signed division by constants power-of-two are
	// normally expanded to the sequence SRA + SRL + ADD + SRA.
	// The OperandValue properties many not be same as that of previous
	// operation;conservatively assume OP_None.
	int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
	Op2Info, TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);

	return Cost;
	}

	static const CostTblEntry AVX512BWUniformConstCostTable[] = {
	{ ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
	{ ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
	{ ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.

	{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
	{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
	};

	if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
	ST->hasBWI()) {
	if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
	LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry AVX512UniformConstCostTable[] = {
	{ ISD::SRA, MVT::v2i64, 1 },
	{ ISD::SRA, MVT::v4i64, 1 },
	{ ISD::SRA, MVT::v8i64, 1 },

	{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
	{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
	};

	if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
	ST->hasAVX512()) {
	if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
	LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry AVX2UniformConstCostTable[] = {
	{ ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
	{ ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
	{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

	{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.

	{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
	{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
	{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
	{ ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
	};

	if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
	ST->hasAVX2()) {
	if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
	LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry SSE2UniformConstCostTable[] = {
	{ ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
	{ ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
	{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.

	{ ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
	{ ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
	{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

	{ ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
	{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
	{ ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
	{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
	{ ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
	{ ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
	{ ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
	{ ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
	};

	if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
	ST->hasSSE2()) {
	// pmuldq sequence.
	if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
	return LT.first * 32;
	if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
	return LT.first * 15;

	// XOP has faster vXi8 shifts.
	if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) \|\|
	!ST->hasXOP())
	if (const auto *Entry =
	CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry AVX2UniformCostTable[] = {
	// Uniform splats are cheaper for the following instructions.
	{ ISD::SHL, MVT::v16i16, 1 }, // psllw.
	{ ISD::SRL, MVT::v16i16, 1 }, // psrlw.
	{ ISD::SRA, MVT::v16i16, 1 }, // psraw.
	};

	if (ST->hasAVX2() &&
	((Op2Info == TargetTransformInfo::OK_UniformConstantValue) \|\|
	(Op2Info == TargetTransformInfo::OK_UniformValue))) {
	if (const auto *Entry =
	CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry SSE2UniformCostTable[] = {
	// Uniform splats are cheaper for the following instructions.
	{ ISD::SHL, MVT::v8i16, 1 }, // psllw.
	{ ISD::SHL, MVT::v4i32, 1 }, // pslld
	{ ISD::SHL, MVT::v2i64, 1 }, // psllq.

	{ ISD::SRL, MVT::v8i16, 1 }, // psrlw.
	{ ISD::SRL, MVT::v4i32, 1 }, // psrld.
	{ ISD::SRL, MVT::v2i64, 1 }, // psrlq.

	{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
	{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
	};

	if (ST->hasSSE2() &&
	((Op2Info == TargetTransformInfo::OK_UniformConstantValue) \|\|
	(Op2Info == TargetTransformInfo::OK_UniformValue))) {
	if (const auto *Entry =
	CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry AVX512DQCostTable[] = {
	{ ISD::MUL, MVT::v2i64, 1 },
	{ ISD::MUL, MVT::v4i64, 1 },
	{ ISD::MUL, MVT::v8i64, 1 }
	};

	// Look for AVX512DQ lowering tricks for custom cases.
	if (ST->hasDQI())
	if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX512BWCostTable[] = {
	{ ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
	{ ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
	{ ISD::SRA, MVT::v8i16, 1 }, // vpsravw

	{ ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
	{ ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
	{ ISD::SRA, MVT::v16i16, 1 }, // vpsravw

	{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
	{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
	{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw

	{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
	{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
	{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.

	{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.

	// Vectorizing division is a bad idea. See the SSE2 table for more comments.
	{ ISD::SDIV, MVT::v64i8, 64*20 },
	{ ISD::SDIV, MVT::v32i16, 32*20 },
	{ ISD::UDIV, MVT::v64i8, 64*20 },
	{ ISD::UDIV, MVT::v32i16, 32*20 }
	};

	// Look for AVX512BW lowering tricks for custom cases.
	if (ST->hasBWI())
	if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX512CostTable[] = {
	{ ISD::SHL, MVT::v16i32, 1 },
	{ ISD::SRL, MVT::v16i32, 1 },
	{ ISD::SRA, MVT::v16i32, 1 },

	{ ISD::SHL, MVT::v8i64, 1 },
	{ ISD::SRL, MVT::v8i64, 1 },

	{ ISD::SRA, MVT::v2i64, 1 },
	{ ISD::SRA, MVT::v4i64, 1 },
	{ ISD::SRA, MVT::v8i64, 1 },

	{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v16i32, 1 }, // pmulld
	{ ISD::MUL, MVT::v8i64, 8 }, // 3pmuludq/3shift/2*add

	// Vectorizing division is a bad idea. See the SSE2 table for more comments.
	{ ISD::SDIV, MVT::v16i32, 16*20 },
	{ ISD::SDIV, MVT::v8i64, 8*20 },
	{ ISD::UDIV, MVT::v16i32, 16*20 },
	{ ISD::UDIV, MVT::v8i64, 8*20 }
	};

	if (ST->hasAVX512())
	if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX2ShiftCostTable[] = {
	// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
	// customize them to detect the cases where shift amount is a scalar one.
	{ ISD::SHL, MVT::v4i32, 1 },
	{ ISD::SRL, MVT::v4i32, 1 },
	{ ISD::SRA, MVT::v4i32, 1 },
	{ ISD::SHL, MVT::v8i32, 1 },
	{ ISD::SRL, MVT::v8i32, 1 },
	{ ISD::SRA, MVT::v8i32, 1 },
	{ ISD::SHL, MVT::v2i64, 1 },
	{ ISD::SRL, MVT::v2i64, 1 },
	{ ISD::SHL, MVT::v4i64, 1 },
	{ ISD::SRL, MVT::v4i64, 1 },
	};

	// Look for AVX2 lowering tricks.
	if (ST->hasAVX2()) {
	if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
	(Op2Info == TargetTransformInfo::OK_UniformConstantValue \|\|
	Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
	// On AVX2, a packed v16i16 shift left by a constant build_vector
	// is lowered into a vector multiply (vpmullw).
	return LT.first;

	if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;
	}

	static const CostTblEntry XOPShiftCostTable[] = {
	// 128bit shifts take 1cy, but right shifts require negation beforehand.
	{ ISD::SHL, MVT::v16i8, 1 },
	{ ISD::SRL, MVT::v16i8, 2 },
	{ ISD::SRA, MVT::v16i8, 2 },
	{ ISD::SHL, MVT::v8i16, 1 },
	{ ISD::SRL, MVT::v8i16, 2 },
	{ ISD::SRA, MVT::v8i16, 2 },
	{ ISD::SHL, MVT::v4i32, 1 },
	{ ISD::SRL, MVT::v4i32, 2 },
	{ ISD::SRA, MVT::v4i32, 2 },
	{ ISD::SHL, MVT::v2i64, 1 },
	{ ISD::SRL, MVT::v2i64, 2 },
	{ ISD::SRA, MVT::v2i64, 2 },
	// 256bit shifts require splitting if AVX2 didn't catch them above.
	{ ISD::SHL, MVT::v32i8, 2+2 },
	{ ISD::SRL, MVT::v32i8, 4+2 },
	{ ISD::SRA, MVT::v32i8, 4+2 },
	{ ISD::SHL, MVT::v16i16, 2+2 },
	{ ISD::SRL, MVT::v16i16, 4+2 },
	{ ISD::SRA, MVT::v16i16, 4+2 },
	{ ISD::SHL, MVT::v8i32, 2+2 },
	{ ISD::SRL, MVT::v8i32, 4+2 },
	{ ISD::SRA, MVT::v8i32, 4+2 },
	{ ISD::SHL, MVT::v4i64, 2+2 },
	{ ISD::SRL, MVT::v4i64, 4+2 },
	{ ISD::SRA, MVT::v4i64, 4+2 },
	};

	// Look for XOP lowering tricks.
	if (ST->hasXOP())
	if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE2UniformShiftCostTable[] = {
	// Uniform splats are cheaper for the following instructions.
	{ ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
	{ ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
	{ ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.

	{ ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
	{ ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
	{ ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.

	{ ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
	{ ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
	{ ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
	{ ISD::SRA, MVT::v4i64, 8+2 }, // 2(2psrad + shuffle) + split.
	};

	if (ST->hasSSE2() &&
	((Op2Info == TargetTransformInfo::OK_UniformConstantValue) \|\|
	(Op2Info == TargetTransformInfo::OK_UniformValue))) {

	// Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
	if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
	return LT.first * 4; // 2*psrad + shuffle.

	if (const auto *Entry =
	CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
	return LT.first * Entry->Cost;
	}

	if (ISD == ISD::SHL &&
	Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
	MVT VT = LT.second;
	// Vector shift left by non uniform constant can be lowered
	// into vector multiply.
	if (((VT == MVT::v8i16 \|\| VT == MVT::v4i32) && ST->hasSSE2()) \|\|
	((VT == MVT::v16i16 \|\| VT == MVT::v8i32) && ST->hasAVX()))
	ISD = ISD::MUL;
	}

	static const CostTblEntry AVX2CostTable[] = {
	{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
	{ ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.

	{ ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
	{ ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.

	{ ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
	{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
	{ ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
	{ ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.

	{ ISD::SUB, MVT::v32i8, 1 }, // psubb
	{ ISD::ADD, MVT::v32i8, 1 }, // paddb
	{ ISD::SUB, MVT::v16i16, 1 }, // psubw
	{ ISD::ADD, MVT::v16i16, 1 }, // paddw
	{ ISD::SUB, MVT::v8i32, 1 }, // psubd
	{ ISD::ADD, MVT::v8i32, 1 }, // paddd
	{ ISD::SUB, MVT::v4i64, 1 }, // psubq
	{ ISD::ADD, MVT::v4i64, 1 }, // paddq

	{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
	{ ISD::MUL, MVT::v8i32, 1 }, // pmulld
	{ ISD::MUL, MVT::v4i64, 8 }, // 3pmuludq/3shift/2*add

	{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
	{ ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
	{ ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
	{ ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
	};

	// Look for AVX2 lowering tricks for custom cases.
	if (ST->hasAVX2())
	if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX1CostTable[] = {
	// We don't have to scalarize unsupported ops. We can issue two half-sized
	// operations and we only need to extract the upper YMM half.
	// Two ops + 1 extract + 1 insert = 4.
	{ ISD::MUL, MVT::v16i16, 4 },
	{ ISD::MUL, MVT::v8i32, 4 },
	{ ISD::SUB, MVT::v32i8, 4 },
	{ ISD::ADD, MVT::v32i8, 4 },
	{ ISD::SUB, MVT::v16i16, 4 },
	{ ISD::ADD, MVT::v16i16, 4 },
	{ ISD::SUB, MVT::v8i32, 4 },
	{ ISD::ADD, MVT::v8i32, 4 },
	{ ISD::SUB, MVT::v4i64, 4 },
	{ ISD::ADD, MVT::v4i64, 4 },

	// A v4i64 multiply is custom lowered as two split v2i64 vectors that then
	// are lowered as a series of long multiplies(3), shifts(3) and adds(2)
	// Because we believe v4i64 to be a legal type, we must also include the
	// extract+insert in the cost table. Therefore, the cost here is 18
	// instead of 8.
	{ ISD::MUL, MVT::v4i64, 18 },

	{ ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.

	{ ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
	{ ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
	{ ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
	{ ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/

	// Vectorizing division is a bad idea. See the SSE2 table for more comments.
	{ ISD::SDIV, MVT::v32i8, 32*20 },
	{ ISD::SDIV, MVT::v16i16, 16*20 },
	{ ISD::SDIV, MVT::v8i32, 8*20 },
	{ ISD::SDIV, MVT::v4i64, 4*20 },
	{ ISD::UDIV, MVT::v32i8, 32*20 },
	{ ISD::UDIV, MVT::v16i16, 16*20 },
	{ ISD::UDIV, MVT::v8i32, 8*20 },
	{ ISD::UDIV, MVT::v4i64, 4*20 },
	};

	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE42CostTable[] = {
	{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
	{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
	{ ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
	};

	if (ST->hasSSE42())
	if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE41CostTable[] = {
	{ ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
	{ ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
	{ ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
	{ ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
	{ ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
	{ ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

	{ ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
	{ ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
	{ ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
	{ ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
	{ ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
	{ ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.

	{ ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
	{ ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
	{ ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
	{ ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
	{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
	{ ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.

	{ ISD::MUL, MVT::v4i32, 1 } // pmulld
	};

	if (ST->hasSSE41())
	if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE2CostTable[] = {
	// We don't correctly identify costs of casts because they are marked as
	// custom.
	{ ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
	{ ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
	{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
	{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
	{ ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.

	{ ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
	{ ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
	{ ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
	{ ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
	{ ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.

	{ ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
	{ ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
	{ ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
	{ ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
	{ ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.

	{ ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
	{ ISD::MUL, MVT::v8i16, 1 }, // pmullw
	{ ISD::MUL, MVT::v4i32, 6 }, // 3pmuludq/4shuffle
	{ ISD::MUL, MVT::v2i64, 8 }, // 3pmuludq/3shift/2*add

	{ ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
	{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
	{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/

	// It is not a good idea to vectorize division. We have to scalarize it and
	// in the process we will often end up having to spill regular
	// registers. The overhead of division is going to dominate most kernels
	// anyways so try hard to prevent vectorization of division - it is
	// generally a bad idea. Assume somewhat arbitrarily that we have to be able
	// to hide "20 cycles" for each lane.
	{ ISD::SDIV, MVT::v16i8, 16*20 },
	{ ISD::SDIV, MVT::v8i16, 8*20 },
	{ ISD::SDIV, MVT::v4i32, 4*20 },
	{ ISD::SDIV, MVT::v2i64, 2*20 },
	{ ISD::UDIV, MVT::v16i8, 16*20 },
	{ ISD::UDIV, MVT::v8i16, 8*20 },
	{ ISD::UDIV, MVT::v4i32, 4*20 },
	{ ISD::UDIV, MVT::v2i64, 2*20 },
	};

	if (ST->hasSSE2())
	if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE1CostTable[] = {
	{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
	{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
	};

	if (ST->hasSSE1())
	if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
	return LT.first * Entry->Cost;

	// Fallback to the default implementation.
	return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
	}

	// Estimate the cost of a vector shuffle of kind Kind applied to type Tp.
	//
	// The type is legalized first. LT.first is then used as the multiplier for the
	// per-entry cost found in the subtarget-specific tables below. The tables are
	// consulted in a fixed order (VBMI, BWI, AVX512, AVX2, XOP, AVX, SSE4.1, SSSE3,
	// SSE2, SSE1) and the first table that is enabled on the subtarget and has a
	// matching (Kind, legalized type) entry determines the result. If no entry
	// matches, the request is forwarded to BaseT::getShuffleCost.
	//
	// Index and SubTp are not inspected in this function; they are only passed
	// through to the base implementation.
	int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
	Type *SubTp) {
	// 64-bit packed float vectors (v2f32) are widened to type v4f32.
	// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

	// For Broadcasts we are splatting the first element from the first input
	// register, so only need to reference that input and all the output
	// registers are the same.
	if (Kind == TTI::SK_Broadcast)
	LT.first = 1;

	// We are going to permute multiple sources and the result will be in multiple
	// destinations. Providing an accurate cost only for splits where the element
	// type remains the same.
	if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
	MVT LegalVT = LT.second;
	- if (LegalVT.getVectorElementType().getSizeInBits() ==
	+ if (LegalVT.isVector() &&
	+ LegalVT.getVectorElementType().getSizeInBits() ==
	Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
	LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

	unsigned VecTySize = DL.getTypeStoreSize(Tp);
	unsigned LegalVTSize = LegalVT.getStoreSize();
	// Number of source vectors after legalization:
	unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
	// Number of destination vectors after legalization:
	unsigned NumOfDests = LT.first;

	Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
	LegalVT.getVectorNumElements());

	// Charge (NumOfSrcs - 1) two-source shuffles per destination vector, each
	// costed recursively on a vector with the legal element count.
	unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
	return NumOfShuffles *
	getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
	}

	// The guard above did not hold, so use the generic estimate instead.
	return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
	}

	// For 2-input shuffles, we must account for splitting the 2 inputs into many.
	if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
	// We assume that source and destination have the same vector type.
	int NumOfDests = LT.first;
	// The multiplier becomes NumOfDests * (2 * NumOfDests - 1).
	int NumOfShufflesPerDest = LT.first * 2 - 1;
	LT.first = NumOfDests * NumOfShufflesPerDest;
	}

	static const CostTblEntry AVX512VBMIShuffleTbl[] = {
	{ TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
	{ TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb

	{ TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
	{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb

	{ TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
	{ TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
	{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
	};

	// From here on each table is tried in turn; the first enabled feature with a
	// matching entry returns LT.first * Entry->Cost.
	if (ST->hasVBMI())
	if (const auto *Entry =
	CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX512BWShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
	{ TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb

	{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
	{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
	{ TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2

	{ TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
	{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
	{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
	{ TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
	{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc

	{ TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
	{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
	{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
	{ TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
	{ TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
	{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
	};

	if (ST->hasBWI())
	if (const auto *Entry =
	CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX512ShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
	{ TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
	{ TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
	{ TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd

	{ TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
	{ TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
	{ TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
	{ TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd

	{ TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
	{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
	{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
	{ TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
	{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
	{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
	{ TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
	{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
	{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
	{ TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
	{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
	{ TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
	{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb

	{ TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
	{ TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
	{ TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
	{ TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
	{ TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
	{ TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
	{ TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
	{ TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
	{ TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
	{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
	{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
	{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
	};

	if (ST->hasAVX512())
	if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX2ShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
	{ TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
	{ TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
	{ TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
	{ TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
	{ TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb

	{ TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
	{ TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
	{ TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
	{ TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
	{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
	{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb

	{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
	{ TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb

	{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
	{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
	{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
	{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
	{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
	// + vpblendvb
	{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
	// + vpblendvb

	{ TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
	{ TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
	{ TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
	{ TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
	{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2vperm2i128 + 4vpshufb
	// + vpblendvb
	{ TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2vperm2i128 + 4vpshufb
	// + vpblendvb
	};

	if (ST->hasAVX2())
	if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry XOPShuffleTbl[] = {
	{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
	{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
	{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
	{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
	{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
	// + vinsertf128
	{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
	// + vinsertf128

	{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2vextractf128 + 6vpperm
	// + vinsertf128
	{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
	{ TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2vextractf128 + 6vpperm
	// + vinsertf128
	{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
	};

	if (ST->hasXOP())
	if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry AVX1ShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
	{ TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
	{ TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
	{ TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
	{ TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
	{ TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128

	{ TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
	{ TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
	{ TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
	{ TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
	{ TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
	// + vinsertf128
	{ TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
	// + vinsertf128

	{ TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
	{ TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
	{ TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
	{ TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
	{ TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
	{ TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor

	{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
	{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
	{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2vperm2f128 + 2vshufps
	{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2vperm2f128 + 2vshufps
	{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
	// + 2*por + vinsertf128
	{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
	// + 2*por + vinsertf128

	{ TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2vperm2f128 + 2vshufpd
	{ TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2vperm2f128 + 2vshufps
	{ TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2vperm2f128 + 2vshufpd
	{ TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2vperm2f128 + 2vshufps
	{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2vextractf128 + 8pshufb
	// + 4*por + vinsertf128
	{ TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2vextractf128 + 8pshufb
	// + 4*por + vinsertf128
	};

	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE41ShuffleTbl[] = {
	{ TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
	{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
	{ TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
	{ TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
	{ TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
	{ TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
	};

	if (ST->hasSSE41())
	if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSSE3ShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
	{ TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb

	{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
	{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb

	{ TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
	{ TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por

	{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
	{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb

	{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
	{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
	};

	if (ST->hasSSSE3())
	if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE2ShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
	{ TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
	{ TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
	{ TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
	{ TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd

	{ TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
	{ TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
	{ TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
	{ TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
	{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2pshuflw + 2pshufhw
	// + 2pshufd + 2unpck + packus

	{ TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
	{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
	{ TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
	{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
	{ TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por

	{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
	{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
	{ TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
	{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2pshuflw + 2pshufhw
	// + pshufd/unpck
	{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2pshuflw + 2pshufhw
	// + 2pshufd + 2unpck + 2*packus

	{ TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
	{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
	{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
	{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
	{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
	};

	if (ST->hasSSE2())
	if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	static const CostTblEntry SSE1ShuffleTbl[] = {
	{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
	{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
	{ TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
	{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
	{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
	};

	if (ST->hasSSE1())
	if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
	return LT.first * Entry->Cost;

	// No table entry matched; fall back to the default implementation.
	return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
	}

	int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type Dst, Type Src,
	const Instruction *I) {
	int ISD = TLI->InstructionOpcodeToISD(Opcode);
	assert(ISD && "Invalid opcode");

	// FIXME: Need a better design of the cost table to handle non-simple types of
	// potential massive combinations (elem_num x src_type x dst_type).

	static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
	{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
	{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
	{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
	{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
	{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },

	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
	{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
	{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
	};

	// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
	// 256-bit wide vectors.

	static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
	{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
	{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
	{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },

	{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
	{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
	{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
	{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },

	// v16i1 -> v16i32 - load + broadcast
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
	{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
	{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
	{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
	{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },

	{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
	{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
	{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
	{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
	{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
	{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
	{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
	{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },

	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
	{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
	{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
	{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },

	{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
	{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
	{ ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
	{ ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 },
	{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
	};

	static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
	{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
	{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

	{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
	{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
	{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
	{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
	{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
	{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },

	{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
	{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },

	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
	};

	static const TypeConversionCostTblEntry AVXConversionTbl[] = {
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
	{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
	{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },

	{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
	{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
	{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
	{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
	{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
	{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
	{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },

	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },

	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
	// The generic code to compute the scalar overhead is currently broken.
	// Workaround this limitation by estimating the scalarization overhead
	// here. We have roughly 10 instructions per scalar element.
	// Multiply that by the vector width.
	// FIXME: remove that when PR19268 is fixed.
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 },
	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },

	{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
	{ ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
	// This node is expanded into scalarized operations but BasicTTI is overly
	// optimistic estimating its cost. It computes 3 per element (one
	// vector-extract, one scalar conversion and one vector-insert). The
	// problem is that the inserts form a read-modify-write chain so latency
	// should be factored in too. Inflating the cost per element by 1.
	{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
	{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },

	{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
	{ ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
	};

	static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },

	{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
	{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
	{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
	{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
	{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
	{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },

	{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
	{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
	{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
	{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
	{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
	{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
	{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },

	};

	static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
	// These are somewhat magic numbers justified by looking at the output of
	// Intel's IACA, running some kernels and making sure when we take
	// legalization into account the throughput will be overestimated.
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },

	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },

	{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },

	{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
	{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
	{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
	{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
	{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
	{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
	{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
	{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
	{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
	{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
	{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },

	{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
	{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
	{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
	{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
	{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
	{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
	{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
	{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
	{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
	};

	std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
	std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);

	if (ST->hasSSE2() && !ST->hasAVX()) {
	if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
	LTDest.second, LTSrc.second))
	return LTSrc.first * Entry->Cost;
	}

	EVT SrcTy = TLI->getValueType(DL, Src);
	EVT DstTy = TLI->getValueType(DL, Dst);

	// The function getSimpleVT only handles simple value types.
	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
	return BaseT::getCastInstrCost(Opcode, Dst, Src);

	if (ST->hasDQI())
	if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
	DstTy.getSimpleVT(),
	SrcTy.getSimpleVT()))
	return Entry->Cost;

	if (ST->hasAVX512())
	if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
	DstTy.getSimpleVT(),
	SrcTy.getSimpleVT()))
	return Entry->Cost;

	if (ST->hasAVX2()) {
	if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
	DstTy.getSimpleVT(),
	SrcTy.getSimpleVT()))
	return Entry->Cost;
	}

	if (ST->hasAVX()) {
	if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
	DstTy.getSimpleVT(),
	SrcTy.getSimpleVT()))
	return Entry->Cost;
	}

	if (ST->hasSSE41()) {
	if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
	DstTy.getSimpleVT(),
	SrcTy.getSimpleVT()))
	return Entry->Cost;
	}

	if (ST->hasSSE2()) {
	if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
	DstTy.getSimpleVT(),
	SrcTy.getSimpleVT()))
	return Entry->Cost;
	}

	return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
	}

	/// Estimate the cost of a compare/select instruction \p Opcode operating on
	/// values of type \p ValTy.
	///
	/// \p ValTy is legalized first and the resulting MVT is looked up in
	/// per-feature cost tables, probed from the newest feature set to the oldest
	/// (AVX-512, AVX2, AVX, SSE4.2, SSE2). A hit is scaled by the legalization
	/// factor LT.first. Only ISD::SETCC entries are tabulated, so any other ISD
	/// opcode, and any type missing from the tables, is forwarded to
	/// BaseT::getCmpSelInstrCost together with \p CondTy and \p I.
	///
	/// \p ValTy and \p CondTy are pointers; callers in this file pass a null
	/// \p CondTy.
	int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
	                                   const Instruction *I) {
	  // Legalize the type.
	  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

	  MVT MTy = LT.second;

	  int ISD = TLI->InstructionOpcodeToISD(Opcode);
	  assert(ISD && "Invalid opcode");

	  static const CostTblEntry SSE2CostTbl[] = {
	    { ISD::SETCC, MVT::v2i64, 8 },
	    { ISD::SETCC, MVT::v4i32, 1 },
	    { ISD::SETCC, MVT::v8i16, 1 },
	    { ISD::SETCC, MVT::v16i8, 1 },
	  };

	  static const CostTblEntry SSE42CostTbl[] = {
	    { ISD::SETCC, MVT::v2f64, 1 },
	    { ISD::SETCC, MVT::v4f32, 1 },
	    { ISD::SETCC, MVT::v2i64, 1 },
	  };

	  static const CostTblEntry AVX1CostTbl[] = {
	    { ISD::SETCC, MVT::v4f64, 1 },
	    { ISD::SETCC, MVT::v8f32, 1 },
	    // AVX1 does not support 8-wide integer compare.
	    { ISD::SETCC, MVT::v4i64, 4 },
	    { ISD::SETCC, MVT::v8i32, 4 },
	    { ISD::SETCC, MVT::v16i16, 4 },
	    { ISD::SETCC, MVT::v32i8, 4 },
	  };

	  static const CostTblEntry AVX2CostTbl[] = {
	    { ISD::SETCC, MVT::v4i64, 1 },
	    { ISD::SETCC, MVT::v8i32, 1 },
	    { ISD::SETCC, MVT::v16i16, 1 },
	    { ISD::SETCC, MVT::v32i8, 1 },
	  };

	  static const CostTblEntry AVX512CostTbl[] = {
	    { ISD::SETCC, MVT::v8i64, 1 },
	    { ISD::SETCC, MVT::v16i32, 1 },
	    { ISD::SETCC, MVT::v8f64, 1 },
	    { ISD::SETCC, MVT::v16f32, 1 },
	  };

	  // The first table (newest feature set first) that has an entry for the
	  // legalized type decides the cost.
	  if (ST->hasAVX512())
	    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
	      return LT.first * Entry->Cost;

	  if (ST->hasAVX2())
	    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
	      return LT.first * Entry->Cost;

	  if (ST->hasAVX())
	    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
	      return LT.first * Entry->Cost;

	  if (ST->hasSSE42())
	    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
	      return LT.first * Entry->Cost;

	  if (ST->hasSSE2())
	    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
	      return LT.first * Entry->Cost;

	  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
	}

	/// Report the largest element size this target accepts for element-wise
	/// atomic memory intrinsics. The value is a fixed 16 (unit presumably bytes;
	/// confirm against the TargetTransformInfo interface).
	unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const {
	  const unsigned MaxElementSize = 16;
	  return MaxElementSize;
	}

	/// Estimate the cost of intrinsic \p IID returning \p RetTy.
	///
	/// bitreverse, bswap, ctlz, ctpop, cttz and sqrt are mapped to their ISD
	/// opcode; \p RetTy is legalized and the per-subtarget-feature cost tables
	/// below are probed in the order of the lookup chain at the end of the
	/// function. The first hit is returned scaled by the legalization factor
	/// LT.first. Any other intrinsic, and any table miss, is forwarded to
	/// BaseT::getIntrinsicInstrCost together with \p Tys, \p FMF and
	/// \p ScalarizationCostPassed.
	int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
	ArrayRef<Type *> Tys, FastMathFlags FMF,
	unsigned ScalarizationCostPassed) {
	// Costs should match the codegen from:
	// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
	// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
	// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
	// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
	// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
	static const CostTblEntry AVX512CDCostTbl[] = {
	{ ISD::CTLZ, MVT::v8i64, 1 },
	{ ISD::CTLZ, MVT::v16i32, 1 },
	{ ISD::CTLZ, MVT::v32i16, 8 },
	{ ISD::CTLZ, MVT::v64i8, 20 },
	{ ISD::CTLZ, MVT::v4i64, 1 },
	{ ISD::CTLZ, MVT::v8i32, 1 },
	{ ISD::CTLZ, MVT::v16i16, 4 },
	{ ISD::CTLZ, MVT::v32i8, 10 },
	{ ISD::CTLZ, MVT::v2i64, 1 },
	{ ISD::CTLZ, MVT::v4i32, 1 },
	{ ISD::CTLZ, MVT::v8i16, 4 },
	{ ISD::CTLZ, MVT::v16i8, 4 },
	};
	static const CostTblEntry AVX512BWCostTbl[] = {
	{ ISD::BITREVERSE, MVT::v8i64, 5 },
	{ ISD::BITREVERSE, MVT::v16i32, 5 },
	{ ISD::BITREVERSE, MVT::v32i16, 5 },
	{ ISD::BITREVERSE, MVT::v64i8, 5 },
	{ ISD::CTLZ, MVT::v8i64, 23 },
	{ ISD::CTLZ, MVT::v16i32, 22 },
	{ ISD::CTLZ, MVT::v32i16, 18 },
	{ ISD::CTLZ, MVT::v64i8, 17 },
	{ ISD::CTPOP, MVT::v8i64, 7 },
	{ ISD::CTPOP, MVT::v16i32, 11 },
	{ ISD::CTPOP, MVT::v32i16, 9 },
	{ ISD::CTPOP, MVT::v64i8, 6 },
	{ ISD::CTTZ, MVT::v8i64, 10 },
	{ ISD::CTTZ, MVT::v16i32, 14 },
	{ ISD::CTTZ, MVT::v32i16, 12 },
	{ ISD::CTTZ, MVT::v64i8, 9 },
	};
	static const CostTblEntry AVX512CostTbl[] = {
	{ ISD::BITREVERSE, MVT::v8i64, 36 },
	{ ISD::BITREVERSE, MVT::v16i32, 24 },
	{ ISD::CTLZ, MVT::v8i64, 29 },
	{ ISD::CTLZ, MVT::v16i32, 35 },
	{ ISD::CTPOP, MVT::v8i64, 16 },
	{ ISD::CTPOP, MVT::v16i32, 24 },
	{ ISD::CTTZ, MVT::v8i64, 20 },
	{ ISD::CTTZ, MVT::v16i32, 28 },
	};
	// Unlike the other vector tables, the XOP table also carries scalar
	// (i8..i64) BITREVERSE entries.
	static const CostTblEntry XOPCostTbl[] = {
	{ ISD::BITREVERSE, MVT::v4i64, 4 },
	{ ISD::BITREVERSE, MVT::v8i32, 4 },
	{ ISD::BITREVERSE, MVT::v16i16, 4 },
	{ ISD::BITREVERSE, MVT::v32i8, 4 },
	{ ISD::BITREVERSE, MVT::v2i64, 1 },
	{ ISD::BITREVERSE, MVT::v4i32, 1 },
	{ ISD::BITREVERSE, MVT::v8i16, 1 },
	{ ISD::BITREVERSE, MVT::v16i8, 1 },
	{ ISD::BITREVERSE, MVT::i64, 3 },
	{ ISD::BITREVERSE, MVT::i32, 3 },
	{ ISD::BITREVERSE, MVT::i16, 3 },
	{ ISD::BITREVERSE, MVT::i8, 3 }
	};
	static const CostTblEntry AVX2CostTbl[] = {
	{ ISD::BITREVERSE, MVT::v4i64, 5 },
	{ ISD::BITREVERSE, MVT::v8i32, 5 },
	{ ISD::BITREVERSE, MVT::v16i16, 5 },
	{ ISD::BITREVERSE, MVT::v32i8, 5 },
	{ ISD::BSWAP, MVT::v4i64, 1 },
	{ ISD::BSWAP, MVT::v8i32, 1 },
	{ ISD::BSWAP, MVT::v16i16, 1 },
	{ ISD::CTLZ, MVT::v4i64, 23 },
	{ ISD::CTLZ, MVT::v8i32, 18 },
	{ ISD::CTLZ, MVT::v16i16, 14 },
	{ ISD::CTLZ, MVT::v32i8, 9 },
	{ ISD::CTPOP, MVT::v4i64, 7 },
	{ ISD::CTPOP, MVT::v8i32, 11 },
	{ ISD::CTPOP, MVT::v16i16, 9 },
	{ ISD::CTPOP, MVT::v32i8, 6 },
	{ ISD::CTTZ, MVT::v4i64, 10 },
	{ ISD::CTTZ, MVT::v8i32, 14 },
	{ ISD::CTTZ, MVT::v16i16, 12 },
	{ ISD::CTTZ, MVT::v32i8, 9 },
	{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
	{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
	{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
	{ ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
	{ ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
	{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
	};
	static const CostTblEntry AVX1CostTbl[] = {
	{ ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
	{ ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
	{ ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
	{ ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
	{ ISD::BSWAP, MVT::v4i64, 4 },
	{ ISD::BSWAP, MVT::v8i32, 4 },
	{ ISD::BSWAP, MVT::v16i16, 4 },
	{ ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
	{ ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
	{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
	{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
	{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
	{ ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
	{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
	{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
	};
	static const CostTblEntry SSE42CostTbl[] = {
	{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
	{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
	};
	static const CostTblEntry SSSE3CostTbl[] = {
	{ ISD::BITREVERSE, MVT::v2i64, 5 },
	{ ISD::BITREVERSE, MVT::v4i32, 5 },
	{ ISD::BITREVERSE, MVT::v8i16, 5 },
	{ ISD::BITREVERSE, MVT::v16i8, 5 },
	{ ISD::BSWAP, MVT::v2i64, 1 },
	{ ISD::BSWAP, MVT::v4i32, 1 },
	{ ISD::BSWAP, MVT::v8i16, 1 },
	{ ISD::CTLZ, MVT::v2i64, 23 },
	{ ISD::CTLZ, MVT::v4i32, 18 },
	{ ISD::CTLZ, MVT::v8i16, 14 },
	{ ISD::CTLZ, MVT::v16i8, 9 },
	{ ISD::CTPOP, MVT::v2i64, 7 },
	{ ISD::CTPOP, MVT::v4i32, 11 },
	{ ISD::CTPOP, MVT::v8i16, 9 },
	{ ISD::CTPOP, MVT::v16i8, 6 },
	{ ISD::CTTZ, MVT::v2i64, 10 },
	{ ISD::CTTZ, MVT::v4i32, 14 },
	{ ISD::CTTZ, MVT::v8i16, 12 },
	{ ISD::CTTZ, MVT::v16i8, 9 }
	};
	static const CostTblEntry SSE2CostTbl[] = {
	{ ISD::BITREVERSE, MVT::v2i64, 29 },
	{ ISD::BITREVERSE, MVT::v4i32, 27 },
	{ ISD::BITREVERSE, MVT::v8i16, 27 },
	{ ISD::BITREVERSE, MVT::v16i8, 20 },
	{ ISD::BSWAP, MVT::v2i64, 7 },
	{ ISD::BSWAP, MVT::v4i32, 7 },
	{ ISD::BSWAP, MVT::v8i16, 7 },
	{ ISD::CTLZ, MVT::v2i64, 25 },
	{ ISD::CTLZ, MVT::v4i32, 26 },
	{ ISD::CTLZ, MVT::v8i16, 20 },
	{ ISD::CTLZ, MVT::v16i8, 17 },
	{ ISD::CTPOP, MVT::v2i64, 12 },
	{ ISD::CTPOP, MVT::v4i32, 15 },
	{ ISD::CTPOP, MVT::v8i16, 13 },
	{ ISD::CTPOP, MVT::v16i8, 10 },
	{ ISD::CTTZ, MVT::v2i64, 14 },
	{ ISD::CTTZ, MVT::v4i32, 18 },
	{ ISD::CTTZ, MVT::v8i16, 16 },
	{ ISD::CTTZ, MVT::v16i8, 13 },
	{ ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
	{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
	};
	static const CostTblEntry SSE1CostTbl[] = {
	{ ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
	{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
	};
	static const CostTblEntry X64CostTbl[] = { // 64-bit targets
	{ ISD::BITREVERSE, MVT::i64, 14 }
	};
	static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
	{ ISD::BITREVERSE, MVT::i32, 14 },
	{ ISD::BITREVERSE, MVT::i16, 14 },
	{ ISD::BITREVERSE, MVT::i8, 11 }
	};

	// ISD::DELETED_NODE marks "intrinsic not modelled here". No table above
	// lists it, so (presumably, given CostTableLookup yields null on a miss)
	// unmapped intrinsics end up at the BaseT fallback at the bottom.
	unsigned ISD = ISD::DELETED_NODE;
	switch (IID) {
	default:
	break;
	case Intrinsic::bitreverse:
	ISD = ISD::BITREVERSE;
	break;
	case Intrinsic::bswap:
	ISD = ISD::BSWAP;
	break;
	case Intrinsic::ctlz:
	ISD = ISD::CTLZ;
	break;
	case Intrinsic::ctpop:
	ISD = ISD::CTPOP;
	break;
	case Intrinsic::cttz:
	ISD = ISD::CTTZ;
	break;
	case Intrinsic::sqrt:
	ISD = ISD::FSQRT;
	break;
	}

	// Legalize the type.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
	MVT MTy = LT.second;

	// Attempt to lookup cost.
	// The feature checks run in a fixed order; the first table holding an
	// (ISD, MTy) entry decides the result, a miss moves on to the next table.
	if (ST->hasCDI())
	if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasBWI())
	if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX512())
	if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasXOP())
	if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX2())
	if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE42())
	if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSSE3())
	if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE2())
	if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE1())
	if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->is64Bit())
	if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	// No feature gate here: this table is consulted on every subtarget.
	if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
	return LT.first * Entry->Cost;

	return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
	}

	/// Overload that receives the call's argument values (\p Args) and the
	/// vectorization factor \p VF. No X86-specific modelling is applied here; the
	/// query is handed to BaseT::getIntrinsicInstrCost unchanged.
	int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
	                                      ArrayRef<Value *> Args,
	                                      FastMathFlags FMF, unsigned VF) {
	  const int BaseCost = BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
	  return BaseCost;
	}

	int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
	assert(Val->isVectorTy() && "This must be a vector type");

	Type *ScalarType = Val->getScalarType();

	if (Index != -1U) {
	// Legalize the type.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

	// This type is legalized to a scalar type.
	if (!LT.second.isVector())
	return 0;

	// The type may be split. Normalize the index to the new type.
	unsigned Width = LT.second.getVectorNumElements();
	Index = Index % Width;

	// Floating point scalars are already located in index #0.
	if (ScalarType->isFloatingPointTy() && Index == 0)
	return 0;
	}

	// Add to the base cost if we know that the extracted element of a vector is
	// destined to be moved to and used in the integer register file.
	int RegisterFileMoveCost = 0;
	if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
	RegisterFileMoveCost = 1;

	return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
	}

	/// Estimate the cost of a load or store (\p Opcode) of type \p Src.
	///
	/// Vector types with a non-power-of-two element count are handled up front:
	/// three 32-bit or three 64-bit elements (e.g. <3 x float>, <3 x double>)
	/// cost a flat 3; any other such count is priced as fully scalarized, i.e.
	/// one scalar memory op per element (via BaseT) plus the scalarization
	/// overhead. All remaining types cost one unit per legalized part, doubled
	/// when the legalized type is 32 bytes wide and the subtarget reports slow
	/// unaligned 32-byte accesses.
	///
	/// \p I is accepted for interface compatibility and is not used here.
	int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
	                                unsigned AddressSpace, const Instruction *I) {
	  // Handle non-power-of-two vectors such as <3 x float>
	  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
	    unsigned NumElem = VTy->getVectorNumElements();

	    // Handle a few common cases:
	    // <3 x float>
	    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
	      // Cost = 64 bit store + extract + 32 bit store.
	      return 3;

	    // <3 x double>
	    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
	      // Cost = 128 bit store + unpack + 64 bit store.
	      return 3;

	    // Assume that all other non-power-of-two numbers are scalarized.
	    if (!isPowerOf2_32(NumElem)) {
	      int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
	                                        AddressSpace);
	      // A load must insert the elements into a vector, a store must extract
	      // them; the two flags select which overhead is charged.
	      int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
	                                               Opcode == Instruction::Store);
	      return NumElem * Cost + SplitCost;
	    }
	  }

	  // Legalize the type.
	  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
	  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
	         "Invalid Opcode");

	  // Each load/store unit costs 1.
	  int Cost = LT.first * 1;

	  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
	  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
	  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
	    Cost *= 2;

	  return Cost;
	}

	int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
	unsigned Alignment,
	unsigned AddressSpace) {
	VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
	if (!SrcVTy)
	// To calculate scalar take the regular cost, without mask
	return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

	unsigned NumElem = SrcVTy->getVectorNumElements();
	VectorType *MaskTy =
	VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
	if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) \|\|
	(Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) \|\|
	!isPowerOf2_32(NumElem)) {
	// Scalarization
	int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
	int ScalarCompareCost = getCmpSelInstrCost(
	Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
	int BranchCost = getCFInstrCost(Instruction::Br);
	int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

	int ValueSplitCost = getScalarizationOverhead(
	SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
	int MemopCost =
	NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
	Alignment, AddressSpace);
	return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
	}

	// Legalize the type.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
	auto VT = TLI->getValueType(DL, SrcVTy);
	int Cost = 0;
	if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
	LT.second.getVectorNumElements() == NumElem)
	// Promotion requires expand/truncate for data and a shuffle for mask.
	Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
	getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);

	else if (LT.second.getVectorNumElements() > NumElem) {
	VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
	LT.second.getVectorNumElements());
	// Expanding requires fill mask with zeroes
	Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
	}
	if (!ST->hasAVX512())
	return Cost + LT.first*4; // Each maskmov costs 4

	// AVX-512 masked load/store is cheapper
	return Cost+LT.first;
	}

	int X86TTIImpl::getAddressComputationCost(Type Ty, ScalarEvolution SE,
	const SCEV *Ptr) {
	// Address computations in vectorized code with non-consecutive addresses will
	// likely result in more instructions compared to scalar code where the
	// computation can more often be merged into the index mode. The resulting
	// extra micro-ops can significantly decrease throughput.
	unsigned NumVectorInstToHideOverhead = 10;

	// Cost modeling of Strided Access Computation is hidden by the indexing
	// modes of X86 regardless of the stride value. We dont believe that there
	// is a difference between constant strided access in gerenal and constant
	// strided value which is less than or equal to 64.
	// Even in the case of (loop invariant) stride whose value is not known at
	// compile time, the address computation will not incur more than one extra
	// ADD instruction.
	if (Ty->isVectorTy() && SE) {
	if (!BaseT::isStridedAccess(Ptr))
	return NumVectorInstToHideOverhead;
	if (!BaseT::getConstantStrideStep(SE, Ptr))
	return 1;
	}

	return BaseT::getAddressComputationCost(Ty, SE, Ptr);
	}

	int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
	bool IsPairwise) {

	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

	MVT MTy = LT.second;

	int ISD = TLI->InstructionOpcodeToISD(Opcode);
	assert(ISD && "Invalid opcode");

	// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
	// and make it as the cost.

	static const CostTblEntry SSE42CostTblPairWise[] = {
	{ ISD::FADD, MVT::v2f64, 2 },
	{ ISD::FADD, MVT::v4f32, 4 },
	{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
	{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
	{ ISD::ADD, MVT::v8i16, 5 },
	};

	static const CostTblEntry AVX1CostTblPairWise[] = {
	{ ISD::FADD, MVT::v4f32, 4 },
	{ ISD::FADD, MVT::v4f64, 5 },
	{ ISD::FADD, MVT::v8f32, 7 },
	{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
	{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
	{ ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
	{ ISD::ADD, MVT::v8i16, 5 },
	{ ISD::ADD, MVT::v8i32, 5 },
	};

	static const CostTblEntry SSE42CostTblNoPairWise[] = {
	{ ISD::FADD, MVT::v2f64, 2 },
	{ ISD::FADD, MVT::v4f32, 4 },
	{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
	{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
	{ ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
	};

	static const CostTblEntry AVX1CostTblNoPairWise[] = {
	{ ISD::FADD, MVT::v4f32, 3 },
	{ ISD::FADD, MVT::v4f64, 3 },
	{ ISD::FADD, MVT::v8f32, 4 },
	{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
	{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
	{ ISD::ADD, MVT::v4i64, 3 },
	{ ISD::ADD, MVT::v8i16, 4 },
	{ ISD::ADD, MVT::v8i32, 5 },
	};

	if (IsPairwise) {
	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE42())
	if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
	return LT.first * Entry->Cost;
	} else {
	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE42())
	if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
	return LT.first * Entry->Cost;
	}

	return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
	}

	int X86TTIImpl::getMinMaxReductionCost(Type ValTy, Type CondTy,
	bool IsPairwise, bool IsUnsigned) {
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

	MVT MTy = LT.second;

	int ISD;
	if (ValTy->isIntOrIntVectorTy()) {
	ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
	} else {
	assert(ValTy->isFPOrFPVectorTy() &&
	"Expected float point or integer vector type.");
	ISD = ISD::FMINNUM;
	}

	// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
	// and make it as the cost.

	static const CostTblEntry SSE42CostTblPairWise[] = {
	{ISD::FMINNUM, MVT::v2f64, 3},
	{ISD::FMINNUM, MVT::v4f32, 2},
	{ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
	{ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
	{ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
	{ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
	{ISD::SMIN, MVT::v8i16, 2},
	{ISD::UMIN, MVT::v8i16, 2},
	};

	static const CostTblEntry AVX1CostTblPairWise[] = {
	{ISD::FMINNUM, MVT::v4f32, 1},
	{ISD::FMINNUM, MVT::v4f64, 1},
	{ISD::FMINNUM, MVT::v8f32, 2},
	{ISD::SMIN, MVT::v2i64, 3},
	{ISD::UMIN, MVT::v2i64, 3},
	{ISD::SMIN, MVT::v4i32, 1},
	{ISD::UMIN, MVT::v4i32, 1},
	{ISD::SMIN, MVT::v8i16, 1},
	{ISD::UMIN, MVT::v8i16, 1},
	{ISD::SMIN, MVT::v8i32, 3},
	{ISD::UMIN, MVT::v8i32, 3},
	};

	static const CostTblEntry AVX2CostTblPairWise[] = {
	{ISD::SMIN, MVT::v4i64, 2},
	{ISD::UMIN, MVT::v4i64, 2},
	{ISD::SMIN, MVT::v8i32, 1},
	{ISD::UMIN, MVT::v8i32, 1},
	{ISD::SMIN, MVT::v16i16, 1},
	{ISD::UMIN, MVT::v16i16, 1},
	{ISD::SMIN, MVT::v32i8, 2},
	{ISD::UMIN, MVT::v32i8, 2},
	};

	static const CostTblEntry AVX512CostTblPairWise[] = {
	{ISD::FMINNUM, MVT::v8f64, 1},
	{ISD::FMINNUM, MVT::v16f32, 2},
	{ISD::SMIN, MVT::v8i64, 2},
	{ISD::UMIN, MVT::v8i64, 2},
	{ISD::SMIN, MVT::v16i32, 1},
	{ISD::UMIN, MVT::v16i32, 1},
	};

	static const CostTblEntry SSE42CostTblNoPairWise[] = {
	{ISD::FMINNUM, MVT::v2f64, 3},
	{ISD::FMINNUM, MVT::v4f32, 3},
	{ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
	{ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
	{ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
	{ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
	{ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
	{ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
	};

	static const CostTblEntry AVX1CostTblNoPairWise[] = {
	{ISD::FMINNUM, MVT::v4f32, 1},
	{ISD::FMINNUM, MVT::v4f64, 1},
	{ISD::FMINNUM, MVT::v8f32, 1},
	{ISD::SMIN, MVT::v2i64, 3},
	{ISD::UMIN, MVT::v2i64, 3},
	{ISD::SMIN, MVT::v4i32, 1},
	{ISD::UMIN, MVT::v4i32, 1},
	{ISD::SMIN, MVT::v8i16, 1},
	{ISD::UMIN, MVT::v8i16, 1},
	{ISD::SMIN, MVT::v8i32, 2},
	{ISD::UMIN, MVT::v8i32, 2},
	};

	static const CostTblEntry AVX2CostTblNoPairWise[] = {
	{ISD::SMIN, MVT::v4i64, 1},
	{ISD::UMIN, MVT::v4i64, 1},
	{ISD::SMIN, MVT::v8i32, 1},
	{ISD::UMIN, MVT::v8i32, 1},
	{ISD::SMIN, MVT::v16i16, 1},
	{ISD::UMIN, MVT::v16i16, 1},
	{ISD::SMIN, MVT::v32i8, 1},
	{ISD::UMIN, MVT::v32i8, 1},
	};

	static const CostTblEntry AVX512CostTblNoPairWise[] = {
	{ISD::FMINNUM, MVT::v8f64, 1},
	{ISD::FMINNUM, MVT::v16f32, 2},
	{ISD::SMIN, MVT::v8i64, 1},
	{ISD::UMIN, MVT::v8i64, 1},
	{ISD::SMIN, MVT::v16i32, 1},
	{ISD::UMIN, MVT::v16i32, 1},
	};

	if (IsPairwise) {
	if (ST->hasAVX512())
	if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX2())
	if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE42())
	if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
	return LT.first * Entry->Cost;
	} else {
	if (ST->hasAVX512())
	if (const auto *Entry =
	CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX2())
	if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasAVX())
	if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
	return LT.first * Entry->Cost;

	if (ST->hasSSE42())
	if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
	return LT.first * Entry->Cost;
	}

	return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
	}

	/// \brief Cost of materializing one 64-bit immediate chunk \p Val.
	///
	/// Callers may pass a slice of a wider immediate, which is why a result of
	/// zero (TCC_Free, for a zero chunk) is legitimate. A chunk that fits a signed
	/// 32-bit integer costs TCC_Basic; anything wider costs twice that.
	int X86TTIImpl::getIntImmCost(int64_t Val) {
	  if (Val == 0)
	    return TTI::TCC_Free;

	  // One basic unit for a signed 32-bit immediate, two for a full 64-bit one.
	  const int BasicUnits = isInt<32>(Val) ? 1 : 2;
	  return BasicUnits * TTI::TCC_Basic;
	}

	/// Cost of materializing the integer immediate \p Imm of type \p Ty.
	///
	/// A type of size 0 returns ~0U converted to int. Types wider than 128 bits
	/// and zero immediates are reported as TCC_Free. Otherwise the immediate is
	/// sign-extended to a multiple of 64 bits, priced chunk by chunk with
	/// getIntImmCost(int64_t), and the sum is clamped to at least 1.
	int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
	  assert(Ty->isIntegerTy());

	  const unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  if (BitSize == 0)
	    return ~0U;

	  // Never hoist constants larger than 128bit, because this might lead to
	  // incorrect code generation or assertions in codegen.
	  // Fixme: Create a cost model for types larger than i128 once the codegen
	  // issues have been fixed.
	  // A zero immediate is free as well.
	  if (BitSize > 128 || Imm == 0)
	    return TTI::TCC_Free;

	  // Sign-extend to a multiple of 64 bits when the width is not one already.
	  const bool NeedsWidening = (BitSize & 0x3f) != 0;
	  const APInt ImmVal =
	      NeedsWidening ? Imm.sext((BitSize + 63) & ~0x3fU) : Imm;

	  // Walk the constant in 64-bit chunks, lowest chunk first, summing the cost
	  // of each chunk.
	  int Cost = 0;
	  unsigned ShiftVal = 0;
	  while (ShiftVal < BitSize) {
	    const int64_t Chunk =
	        ImmVal.ashr(ShiftVal).sextOrTrunc(64).getSExtValue();
	    Cost += getIntImmCost(Chunk);
	    ShiftVal += 64;
	  }

	  // We need at least one instruction to materialize the constant.
	  return std::max(1, Cost);
	}

	/// Estimate the cost of an integer immediate used as operand \p Idx of an
	/// instruction with opcode \p Opcode.
	///
	/// \param Opcode the IR opcode of the instruction using the immediate.
	/// \param Idx    the operand index at which the immediate appears.
	/// \param Imm    the immediate value.
	/// \param Ty     the type of the immediate; must be an integer type.
	/// \returns TCC_Free when the type has a bit size of zero, for opcodes not
	/// listed in the switch, and for the special cases handled inline; otherwise
	/// either TCC_Free or the materialization cost from getIntImmCost(Imm, Ty),
	/// as decided at the end of the function.
	int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
	                              Type *Ty) {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  // There is no cost model for constants with a bit size of 0. Return TCC_Free
	  // here, so that constant hoisting will ignore this constant.
	  if (BitSize == 0)
	    return TTI::TCC_Free;

	  // Operand index that gets the "cheap immediate" treatment below; ~0U means
	  // no operand of this opcode qualifies.
	  unsigned ImmIdx = ~0U;
	  switch (Opcode) {
	  default:
	    return TTI::TCC_Free;
	  case Instruction::GetElementPtr:
	    // Always hoist the base address of a GetElementPtr. This prevents the
	    // creation of new constants for every base constant that gets constant
	    // folded with the offset.
	    if (Idx == 0)
	      return 2 * TTI::TCC_Basic;
	    return TTI::TCC_Free;
	  case Instruction::Store:
	    ImmIdx = 0;
	    break;
	  case Instruction::ICmp:
	    // This is an imperfect hack to prevent constant hoisting of
	    // compares that might be trying to check if a 64-bit value fits in
	    // 32-bits. The backend can optimize these cases using a right shift by 32.
	    // Ideally we would check the compare predicate here. There are also other
	    // similar immediates the backend can use shifts for.
	    if (Idx == 1 && Imm.getBitWidth() == 64) {
	      uint64_t ImmVal = Imm.getZExtValue();
	      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
	        return TTI::TCC_Free;
	    }
	    ImmIdx = 1;
	    break;
	  case Instruction::And:
	    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
	    // by using a 32-bit operation with implicit zero extension. Detect such
	    // immediates here as the normal path expects bit 31 to be sign extended.
	    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
	      return TTI::TCC_Free;
	    LLVM_FALLTHROUGH;
	  case Instruction::Add:
	  case Instruction::Sub:
	  case Instruction::Mul:
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::URem:
	  case Instruction::SRem:
	  case Instruction::Or:
	  case Instruction::Xor:
	    ImmIdx = 1;
	    break;
	  // Always return TCC_Free for the shift value of a shift instruction.
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	    if (Idx == 1)
	      return TTI::TCC_Free;
	    break;
	  case Instruction::Trunc:
	  case Instruction::ZExt:
	  case Instruction::SExt:
	  case Instruction::IntToPtr:
	  case Instruction::PtrToInt:
	  case Instruction::BitCast:
	  case Instruction::PHI:
	  case Instruction::Call:
	  case Instruction::Select:
	  case Instruction::Ret:
	  case Instruction::Load:
	    break;
	  }

	  // For the operand selected above, the immediate is free unless it costs
	  // more than one basic instruction per 64-bit chunk to materialize.
	  if (Idx == ImmIdx) {
	    int NumConstants = (BitSize + 63) / 64;
	    int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
	    return (Cost <= NumConstants * TTI::TCC_Basic)
	               ? static_cast<int>(TTI::TCC_Free)
	               : Cost;
	  }

	  return X86TTIImpl::getIntImmCost(Imm, Ty);
	}

	/// Estimate the cost of an integer immediate used as argument \p Idx of a
	/// call to the intrinsic \p IID.
	///
	/// \param IID the intrinsic being called.
	/// \param Idx the argument index at which the immediate appears.
	/// \param Imm the immediate value.
	/// \param Ty  the type of the immediate; must be an integer type.
	/// \returns TCC_Free when the type has a bit size of zero, for intrinsics not
	/// listed in the switch, and for the per-intrinsic cases below; otherwise the
	/// materialization cost from getIntImmCost(Imm, Ty).
	int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
	                              Type *Ty) {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  // There is no cost model for constants with a bit size of 0. Return TCC_Free
	  // here, so that constant hoisting will ignore this constant.
	  if (BitSize == 0)
	    return TTI::TCC_Free;

	  switch (IID) {
	  default:
	    return TTI::TCC_Free;
	  case Intrinsic::sadd_with_overflow:
	  case Intrinsic::uadd_with_overflow:
	  case Intrinsic::ssub_with_overflow:
	  case Intrinsic::usub_with_overflow:
	  case Intrinsic::smul_with_overflow:
	  case Intrinsic::umul_with_overflow:
	    // The second operand is free when it fits a signed 32-bit value.
	    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
	      return TTI::TCC_Free;
	    break;
	  case Intrinsic::experimental_stackmap:
	    // The first two arguments are always free; later ones are free when the
	    // immediate is at most 64 bits wide.
	    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
	      return TTI::TCC_Free;
	    break;
	  case Intrinsic::experimental_patchpoint_void:
	  case Intrinsic::experimental_patchpoint_i64:
	    // Same rule as stackmap, but the first four arguments are always free.
	    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
	      return TTI::TCC_Free;
	    break;
	  }
	  return X86TTIImpl::getIntImmCost(Imm, Ty);
	}

	unsigned X86TTIImpl::getUserCost(const User *U,
	ArrayRef<const Value *> Operands) {
	if (isa<StoreInst>(U)) {
	Value *Ptr = U->getOperand(1);
	// Store instruction with index and scale costs 2 Uops.
	// Check the preceding GEP to identify non-const indices.
	if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
	if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
	return TTI::TCC_Basic * 2;
	}
	return TTI::TCC_Basic;
	}
	return BaseT::getUserCost(U, Operands);
	}

	// Return an average cost of Gather / Scatter instruction, maybe improved later
	int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type SrcVTy, Value Ptr,
	unsigned Alignment, unsigned AddressSpace) {

	assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
	unsigned VF = SrcVTy->getVectorNumElements();

	// Try to reduce index size from 64 bit (default for GEP)
	// to 32. It is essential for VF 16. If the index can't be reduced to 32, the
	// operation will use 16 x 64 indices which do not fit in a zmm and needs
	// to split. Also check that the base pointer is the same for all lanes,
	// and that there's at most one variable index.
	auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
	unsigned IndexSize = DL.getPointerSizeInBits();
	GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
	if (IndexSize < 64 \|\| !GEP)
	return IndexSize;

	unsigned NumOfVarIndices = 0;
	Value *Ptrs = GEP->getPointerOperand();
	if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
	return IndexSize;
	for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
	if (isa<Constant>(GEP->getOperand(i)))
	continue;
	Type *IndxTy = GEP->getOperand(i)->getType();
	if (IndxTy->isVectorTy())
	IndxTy = IndxTy->getVectorElementType();
	if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
	!isa<SExtInst>(GEP->getOperand(i))) \|\|
	++NumOfVarIndices > 1)
	return IndexSize; // 64
	}
	return (unsigned)32;
	};


	// Trying to reduce IndexSize to 32 bits for vector 16.
	// By default the IndexSize is equal to pointer size.
	unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
	? getIndexSizeInBits(Ptr, DL)
	: DL.getPointerSizeInBits();

	Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
	IndexSize), VF);
	std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
	std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
	int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
	if (SplitFactor > 1) {
	// Handle splitting of vector of pointers
	Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
	return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
	AddressSpace);
	}

	// The gather / scatter cost is given by Intel architects. It is a rough
	// number since we are looking at one instruction in a time.
	const int GSOverhead = (Opcode == Instruction::Load)
	? ST->getGatherOverhead()
	: ST->getScatterOverhead();
	return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
	Alignment, AddressSpace);
	}

	/// Return the cost of full scalarization of gather / scatter operation.
	///
	/// Opcode - Load or Store instruction.
	/// SrcVTy - The type of the data vector that should be gathered or scattered.
	/// VariableMask - The mask is non-constant at compile time.
	/// Alignment - Alignment for one element.
	/// AddressSpace - pointer[s] address space.
	///
	int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
	bool VariableMask, unsigned Alignment,
	unsigned AddressSpace) {
	unsigned VF = SrcVTy->getVectorNumElements();

	int MaskUnpackCost = 0;
	if (VariableMask) {
	VectorType *MaskTy =
	VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
	MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
	int ScalarCompareCost =
	getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
	nullptr);
	int BranchCost = getCFInstrCost(Instruction::Br);
	MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
	}

	// The cost of the scalar loads/stores.
	int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
	Alignment, AddressSpace);

	int InsertExtractCost = 0;
	if (Opcode == Instruction::Load)
	for (unsigned i = 0; i < VF; ++i)
	// Add the cost of inserting each scalar load into the vector
	InsertExtractCost +=
	getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
	else
	for (unsigned i = 0; i < VF; ++i)
	// Add the cost of extracting each element out of the data vector
	InsertExtractCost +=
	getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);

	return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
	}

	/// Calculate the cost of a Gather / Scatter operation.
	///
	/// \param Opcode       Instruction::Load for a gather, Instruction::Store
	///                     for a scatter.
	/// \param SrcVTy       the data vector type; must be a vector type.
	/// \param Ptr          the address operand: a pointer or a vector of
	///                     pointers.
	/// \param VariableMask whether the mask is non-constant.
	/// \param Alignment    alignment forwarded to the scalar/vector cost
	///                     helpers.
	/// \returns the scalarized cost when the operation is not legal (or not
	/// profitable, see below), otherwise the vector gather/scatter cost.
	int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
	                                       Value *Ptr, bool VariableMask,
	                                       unsigned Alignment) {
	  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
	  unsigned VF = SrcVTy->getVectorNumElements();
	  // Find the pointer type, looking through a vector of pointers if needed.
	  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
	  if (!PtrTy && Ptr->getType()->isVectorTy())
	    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
	  assert(PtrTy && "Unexpected type for Ptr argument");
	  unsigned AddressSpace = PtrTy->getAddressSpace();

	  bool Scalarize = false;
	  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
	      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
	    Scalarize = true;
	  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
	  // Vector-4 of gather/scatter instruction does not exist on KNL.
	  // We can extend it to 8 elements, but zeroing upper bits of
	  // the mask vector will add more instructions. Right now we give the scalar
	  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
	  // is better in the VariableMask case.
	  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
	    Scalarize = true;

	  if (Scalarize)
	    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
	                           AddressSpace);

	  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
	}

	/// Decide whether LSR cost \p C1 is strictly cheaper than \p C2.
	///
	/// The fields are compared lexicographically, with the instruction count
	/// first.
	bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
	                               TargetTransformInfo::LSRCost &C2) {
	  // X86 specific here are "instruction number 1st priority".
	  auto Priority = [](TargetTransformInfo::LSRCost &C) {
	    return std::tie(C.Insns, C.NumRegs, C.AddRecCost, C.NumIVMuls,
	                    C.NumBaseAdds, C.ScaleCost, C.ImmCost, C.SetupCost);
	  };
	  return Priority(C1) < Priority(C2);
	}

	/// Report whether a masked load of \p DataTy is legal on this subtarget.
	///
	/// \param DataTy the loaded type; a scalar or a vector type.
	/// \returns false for single-element vectors; otherwise true when the
	/// element width is 32/64 bits and AVX is available, or 8/16 bits and BWI is
	/// available.
	bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
	  // The backend can't handle a single element vector.
	  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
	    return false;
	  Type *ScalarTy = DataTy->getScalarType();
	  // Pointers have no primitive size; take their width from the DataLayout.
	  int DataWidth = isa<PointerType>(ScalarTy) ?
	    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

	  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
	         ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
	}

	// Masked stores are reported legal for exactly the data types accepted by
	// isLegalMaskedLoad; this simply forwards to it.
	bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
	return isLegalMaskedLoad(DataType);
	}

	/// Report whether a masked gather of \p DataTy is legal on this subtarget.
	///
	/// \param DataTy the gathered type; a scalar element type or a vector type.
	/// \returns false for vectors with one element or a non-power-of-2 element
	/// count; otherwise true when the element width is 32 or 64 bits and the
	/// subtarget has AVX512, or AVX2 together with fast gather.
	bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
	  // This function is called now in two cases: from the Loop Vectorizer
	  // and from the Scalarizer.
	  // When the Loop Vectorizer asks about legality of the feature,
	  // the vectorization factor is not calculated yet. The Loop Vectorizer
	  // sends a scalar type and the decision is based on the width of the
	  // scalar element.
	  // Later on, the cost model will estimate usage of this intrinsic based on
	  // the vector type.
	  // The Scalarizer asks again about legality. It sends a vector type.
	  // In this case we can reject non-power-of-2 vectors.
	  // We also reject single element vectors as the type legalizer can't
	  // scalarize it.
	  if (isa<VectorType>(DataTy)) {
	    unsigned NumElts = DataTy->getVectorNumElements();
	    if (NumElts == 1 || !isPowerOf2_32(NumElts))
	      return false;
	  }
	  Type *ScalarTy = DataTy->getScalarType();
	  // Pointers have no primitive size; take their width from the DataLayout.
	  int DataWidth = isa<PointerType>(ScalarTy) ?
	    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

	  // Some CPUs have better gather performance than others.
	  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
	  // enable gather with a -march.
	  return (DataWidth == 32 || DataWidth == 64) &&
	         (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
	}

	/// Report whether a masked scatter of \p DataType is legal on this subtarget.
	///
	/// Scatter needs AVX512 (AVX2 doesn't support scatter); beyond that the
	/// answer is the same as for a masked gather of the same type.
	bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
	  return ST->hasAVX512() && isLegalMaskedGather(DataType);
	}

	/// Report whether the combined divide/remainder node is legal for \p DataType.
	///
	/// \param DataType the IR type to query.
	/// \param IsSigned selects ISD::SDIVREM when true and ISD::UDIVREM otherwise.
	bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
	  const unsigned DivRemOpc = IsSigned ? ISD::SDIVREM : ISD::UDIVREM;
	  return TLI->isOperationLegal(DivRemOpc, TLI->getValueType(DL, DataType));
	}

	// Always answers false on X86; the \p Ty argument is not consulted.
	bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
	return false;
	}

	bool X86TTIImpl::areInlineCompatible(const Function *Caller,
	const Function *Callee) const {
	const TargetMachine &TM = getTLI()->getTargetMachine();

	// Work this as a subsetting of subtarget features.
	const FeatureBitset &CallerBits =
	TM.getSubtargetImpl(*Caller)->getFeatureBits();
	const FeatureBitset &CalleeBits =
	TM.getSubtargetImpl(*Callee)->getFeatureBits();

	// FIXME: This is likely too limiting as it will include subtarget features
	// that we might not care about for inlining, but it is conservatively
	// correct.
	return (CallerBits & CalleeBits) == CalleeBits;
	}

	/// Describe the load sizes that memcmp expansion may use on the current
	/// subtarget.
	///
	/// The returned object has static storage duration. The original code built
	/// one function-local static per mode from whichever subtarget happened to
	/// call first, so later calls on behalf of a function with different features
	/// received stale options. The options are now kept in static tables that
	/// cover every feature combination the function looks at, and the entry
	/// matching the current subtarget is selected on each call.
	///
	/// \param IsZeroCmp true for an equality-only comparison, which may use
	/// vector loads; false for a three-way comparison.
	/// \returns a pointer to the options for this subtarget; never null.
	const X86TTIImpl::TTI::MemCmpExpansionOptions *
	X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
	  // Builds the list of load sizes, largest first, for one feature combination.
	  // TODO: add a 64-byte entry for AVX512 when the DAG is ready.
	  auto BuildOptions = [](bool Is64Bit, bool UseSSE2, bool UseAVX2) {
	    TTI::MemCmpExpansionOptions Options;
	    if (UseAVX2)
	      Options.LoadSizes.push_back(32);
	    if (UseSSE2)
	      Options.LoadSizes.push_back(16);
	    if (Is64Bit)
	      Options.LoadSizes.push_back(8);
	    Options.LoadSizes.push_back(4);
	    Options.LoadSizes.push_back(2);
	    Options.LoadSizes.push_back(1);
	    return Options;
	  };

	  const bool Is64Bit = ST->is64Bit();

	  // Only enable vector loads for equality comparison.
	  // Right now the vector version is not as fast, see #33329.
	  if (!IsZeroCmp) {
	    // Indexed as [Is64Bit].
	    static const TTI::MemCmpExpansionOptions ThreeWayOptions[2] = {
	        BuildOptions(false, false, false), BuildOptions(true, false, false)};
	    return &ThreeWayOptions[Is64Bit];
	  }

	  // Indexed as [Is64Bit][hasSSE2][hasAVX2].
	  static const TTI::MemCmpExpansionOptions EqZeroOptions[2][2][2] = {
	      {{BuildOptions(false, false, false), BuildOptions(false, false, true)},
	       {BuildOptions(false, true, false), BuildOptions(false, true, true)}},
	      {{BuildOptions(true, false, false), BuildOptions(true, false, true)},
	       {BuildOptions(true, true, false), BuildOptions(true, true, true)}}};
	  return &EqZeroOptions[Is64Bit][ST->hasSSE2()][ST->hasAVX2()];
	}

	/// Report whether interleaved-access vectorization should be enabled.
	///
	/// \returns true on every subtarget except Atom.
	bool X86TTIImpl::enableInterleavedAccessVectorization() {
	  // TODO: We expect this to be beneficial regardless of arch,
	  // but there are currently some unexplained performance artifacts on Atom.
	  // As a temporary solution, disable on Atom.
	  const bool OnAtom = ST->isAtom();
	  return !OnAtom;
	}

	// Get estimation for interleaved load/store operations for AVX2.
	// \p Factor is the interleaved-access factor (stride) - number of
	// (interleaved) elements in the group.
	// \p Indices contains the indices for a strided load: when the
	// interleaved load has gaps they indicate which elements are used.
	// If Indices is empty (or if the number of indices is equal to the size
	// of the interleaved-access as given in \p Factor) the access has no gaps.
	//
	// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
	// computing the cost using a generic formula as a function of generic
	// shuffles. We therefore use a lookup table instead, filled according to
	// the instruction sequences that codegen currently generates.
	//
	// Returns the number of memory operations times the cost of one memory
	// operation plus the table's shuffle cost when a table entry matches;
	// every other case is delegated to BaseT::getInterleavedMemoryOpCost.
	int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
	unsigned Factor,
	ArrayRef<unsigned> Indices,
	unsigned Alignment,
	unsigned AddressSpace) {

	// We currently support only fully-interleaved groups, with no gaps.
	// TODO: Support also strided loads (interleaved-groups with gaps).
	if (Indices.size() && Indices.size() != Factor)
	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	Alignment, AddressSpace);

	// VecTy for interleave memop is <VF*Factor x Elt>.
	// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
	// VecTy = <12 x i32>.
	MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;

	// This function can be called with VecTy=<6xi128>, Factor=3, in which case
	// the VF=2, while v2i128 is an unsupported MVT vector type
	// (see MachineValueType.h::getVectorVT()).
	if (!LegalVT.isVector())
	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	Alignment, AddressSpace);

	unsigned VF = VecTy->getVectorNumElements() / Factor;
	Type *ScalarTy = VecTy->getVectorElementType();

	// Calculate the number of memory operations (NumOfMemOps), required
	// for load/store the VecTy.
	unsigned VecTySize = DL.getTypeStoreSize(VecTy);
	unsigned LegalVTSize = LegalVT.getStoreSize();
	// Round up: a partially filled legal vector still needs a memory operation.
	unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

	// Get the cost of one memory operation.
	Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
	LegalVT.getVectorNumElements());
	unsigned MemOpCost =
	getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

	// The tables below are keyed on the per-member vector type <VF x Elt>;
	// bail out to the generic cost when that type is not a simple value type.
	VectorType *VT = VectorType::get(ScalarTy, VF);
	EVT ETy = TLI->getValueType(DL, VT);
	if (!ETy.isSimple())
	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	Alignment, AddressSpace);

	// TODO: Complete for other data-types and strides.
	// Each combination of Stride, ElementTy and VF results in a different
	// sequence; The cost tables are therefore accessed with:
	// Factor (stride) and VectorType=VFxElemType.
	// The Cost accounts only for the shuffle sequence;
	// The cost of the loads/stores is accounted for separately.
	//
	static const CostTblEntry AVX2InterleavedLoadTbl[] = {
	{ 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
	{ 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64

	{ 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
	{ 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
	{ 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
	{ 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
	{ 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
	{ 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32

	{ 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
	{ 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
	{ 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
	{ 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
	{ 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8

	{ 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
	};

	static const CostTblEntry AVX2InterleavedStoreTbl[] = {
	{ 2, MVT::v4i64, 6 }, //interleave 2 x 4i64 into 8i64 (and store)
	{ 2, MVT::v4f64, 6 }, //interleave 2 x 4f64 into 8f64 (and store)

	{ 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
	{ 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
	{ 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
	{ 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
	{ 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)

	{ 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
	{ 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
	{ 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
	{ 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
	{ 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
	};

	// Total cost = memory operations + shuffle sequence from the matching table.
	if (Opcode == Instruction::Load) {
	if (const auto *Entry =
	CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
	return NumOfMemOps * MemOpCost + Entry->Cost;
	} else {
	assert(Opcode == Instruction::Store &&
	"Expected Store Instruction at this point");
	if (const auto *Entry =
	CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
	return NumOfMemOps * MemOpCost + Entry->Cost;
	}

	// No table entry for this Factor / type combination: use the generic cost.
	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	Alignment, AddressSpace);
	}

	// Get estimation for interleaved load/store operations and strided load.
	// \p Indices contains indices for strided load.
	// \p Factor - the factor of interleaving.
	// AVX-512 provides 3-src shuffles that significantly reduces the cost.
	//
	// When the (Factor, per-member vector type) pair has an entry in one of the
	// tables below, the result is the memory-operation cost plus the table's
	// shuffle cost; otherwise it is computed from the generic shuffle cost.
	int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
	unsigned Factor,
	ArrayRef<unsigned> Indices,
	unsigned Alignment,
	unsigned AddressSpace) {

	// VecTy for interleave memop is <VF*Factor x Elt>.
	// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
	// VecTy = <12 x i32>.

	// Calculate the number of memory operations (NumOfMemOps), required
	// for load/store the VecTy.
	MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
	unsigned VecTySize = DL.getTypeStoreSize(VecTy);
	unsigned LegalVTSize = LegalVT.getStoreSize();
	// Round up: a partially filled legal vector still needs a memory operation.
	unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

	// Get the cost of one memory operation.
	Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
	LegalVT.getVectorNumElements());
	unsigned MemOpCost =
	getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

	// VT is the type of one member of the group, <VF x Elt>; it is the key used
	// for the table lookups below.
	unsigned VF = VecTy->getVectorNumElements() / Factor;
	MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

	if (Opcode == Instruction::Load) {
	// The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
	// contain the cost of the optimized shuffle sequence that the
	// X86InterleavedAccess pass will generate.
	// The cost of loads and stores are computed separately from the table.

	// X86InterleavedAccess supports only the following interleaved-access group.
	static const CostTblEntry AVX512InterleavedLoadTbl[] = {
	{3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
	{3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
	{3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
	};

	if (const auto *Entry =
	CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
	return NumOfMemOps * MemOpCost + Entry->Cost;
	// If an entry does not exist, fall back to the default implementation.

	// Kind of shuffle depends on number of loaded values.
	// If we load the entire data in one register, we can use a 1-src shuffle.
	// Otherwise, we'll merge 2 sources in each operation.
	TTI::ShuffleKind ShuffleKind =
	(NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

	unsigned ShuffleCost =
	getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);

	// An empty Indices list means every member of the group is loaded.
	unsigned NumOfLoadsInInterleaveGrp =
	Indices.size() ? Indices.size() : Factor;
	Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
	VecTy->getVectorNumElements() / Factor);
	unsigned NumOfResults =
	getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
	NumOfLoadsInInterleaveGrp;

	// About a half of the loads may be folded in shuffles when we have only
	// one result. If we have more than one result, we do not fold loads at all.
	unsigned NumOfUnfoldedLoads =
	NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

	// Get a number of shuffle operations per result.
	unsigned NumOfShufflesPerResult =
	std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

	// The two-source shuffle (SK_PermuteTwoSrc) clobbers one of src operands.
	// When we have more than one destination, we need additional instructions
	// to keep sources.
	unsigned NumOfMoves = 0;
	if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
	NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

	int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
	NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

	return Cost;
	}

	// Store.
	assert(Opcode == Instruction::Store &&
	"Expected Store Instruction at this point");
	// X86InterleavedAccess supports only the following interleaved-access group.
	static const CostTblEntry AVX512InterleavedStoreTbl[] = {
	{3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
	{3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
	{3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

	{4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
	{4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
	{4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
	{4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
	};

	if (const auto *Entry =
	CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
	return NumOfMemOps * MemOpCost + Entry->Cost;
	// If an entry does not exist, fall back to the default implementation.

	// There are no strided stores at the moment, and a store can't be folded
	// into a shuffle.
	unsigned NumOfSources = Factor; // The number of values to be merged.
	unsigned ShuffleCost =
	getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
	unsigned NumOfShufflesPerStore = NumOfSources - 1;

	// The two-source shuffle (SK_PermuteTwoSrc) clobbers one of src operands.
	// We need additional instructions to keep sources.
	unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
	int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
	NumOfMoves;
	return Cost;
	}

	/// Estimate the cost of an interleaved load/store group, dispatching on the
	/// vector extensions the subtarget provides.
	///
	/// \param Opcode       the memory opcode, forwarded to the selected model.
	/// \param VecTy        the wide vector type covering the whole group.
	/// \param Factor       the interleave factor.
	/// \param Indices      the members of the group that are accessed.
	/// \param Alignment    alignment of the access.
	/// \param AddressSpace address space of the access.
	/// \returns the AVX-512 estimate when AVX512 is available and the element
	/// type is supported, else the AVX2 estimate when AVX2 is available, else
	/// the generic BaseT estimate.
	int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
	                                           unsigned Factor,
	                                           ArrayRef<unsigned> Indices,
	                                           unsigned Alignment,
	                                           unsigned AddressSpace) {
	  // float, double, i32, i64 and pointer elements are always accepted;
	  // i8 and i16 elements only when \p HasBW is set.
	  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
	    Type *EltTy = VecTy->getVectorElementType();
	    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
	        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
	      return true;
	    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
	      return HasBW;
	    return false;
	  };
	  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
	    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
	                                            Alignment, AddressSpace);
	  if (ST->hasAVX2())
	    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
	                                          Alignment, AddressSpace);

	  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	                                           Alignment, AddressSpace);
	}
	Index: vendor/llvm/dist-release_60/lib/Transforms/Scalar/GVNHoist.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Transforms/Scalar/GVNHoist.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Transforms/Scalar/GVNHoist.cpp (revision 328362)
	@@ -1,1207 +1,1207 @@
	//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass hoists expressions from branches to a common dominator. It uses
	// GVN (global value numbering) to discover expressions computing the same
	// values. The primary goals of code-hoisting are:
	// 1. To reduce the code size.
	// 2. In some cases reduce critical path (by exposing more ILP).
	//
	// The algorithm factors out the reachability of values such that multiple
	// queries to find reachability of values are fast. This is based on finding the
	// ANTIC points in the CFG which do not change during hoisting. The ANTIC points
	// are basically the dominance-frontiers in the inverse graph. So we introduce a
	// data structure (CHI nodes) to keep track of values flowing out of a basic
	// block. We only do this for values with multiple occurrences in the function
	// as they are the potential hoistable candidates. This approach allows us to
	// hoist instructions to a basic block with more than two successors, as well as
	// deal with infinite loops in a trivial way.
	//
	// Limitations: This pass does not hoist fully redundant expressions because
	// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
	// and after gvn-pre because gvn-pre creates opportunities for more instructions
	// to be hoisted.
	//
	// Hoisting may affect the performance in some cases. To mitigate that, hoisting
	// is disabled in the following cases.
	// 1. Scalars across calls.
	// 2. geps when corresponding load/store cannot be hoisted.
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/IteratedDominanceFrontier.h"
	#include "llvm/Analysis/MemoryDependenceAnalysis.h"
	#include "llvm/Analysis/MemorySSA.h"
	#include "llvm/Analysis/MemorySSAUpdater.h"
	#include "llvm/Analysis/PostDominators.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/PassManager.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Scalar/GVN.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <algorithm>
	#include <cassert>
	#include <iterator>
	#include <memory>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "gvn-hoist"

	// Pass statistics, declared through the STATISTIC macro: one hoisted and one
	// removed counter each for instructions in general, loads, stores and calls.
	STATISTIC(NumHoisted, "Number of instructions hoisted");
	STATISTIC(NumRemoved, "Number of instructions removed");
	STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
	STATISTIC(NumLoadsRemoved, "Number of loads removed");
	STATISTIC(NumStoresHoisted, "Number of stores hoisted");
	STATISTIC(NumStoresRemoved, "Number of stores removed");
	STATISTIC(NumCallsHoisted, "Number of calls hoisted");
	STATISTIC(NumCallsRemoved, "Number of calls removed");

	// Hidden command-line knobs. For each of them -1 means "unlimited".

	// -gvn-max-hoisted: cap on the number of hoisted instructions (default -1).
	static cl::opt<int>
	MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
	cl::desc("Max number of instructions to hoist "
	"(default unlimited = -1)"));

	// -gvn-hoist-max-bbs: cap on the basic blocks on the path between hoisting
	// locations (default 4).
	static cl::opt<int> MaxNumberOfBBSInPath(
	"gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
	cl::desc("Max number of basic blocks on the path between "
	"hoisting locations (default = 4, unlimited = -1)"));

	// -gvn-hoist-max-depth: how far from the beginning of a basic block
	// instructions are still considered (default 100).
	static cl::opt<int> MaxDepthInBB(
	"gvn-hoist-max-depth", cl::Hidden, cl::init(100),
	cl::desc("Hoist instructions from the beginning of the BB up to the "
	"maximum specified depth (default = 100, unlimited = -1)"));

	// -gvn-hoist-max-chain-length: cap on the length of dependent chains that
	// are hoisted (default 10).
	static cl::opt<int>
	MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
	cl::desc("Maximum length of dependent chains to hoist "
	"(default = 10, unlimited = -1)"));

	namespace llvm {

	// Maps a basic block to a boolean flag.
	// NOTE(review): presumably records whether the block has side effects;
	// confirm against the users of this alias.
	using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
	// Small inline-storage list of instructions, and the size-erased form of it
	// for use in function signatures.
	using SmallVecInsn = SmallVector<Instruction *, 4>;
	using SmallVecImplInsn = SmallVectorImpl<Instruction *>;

	// Each element of a hoisting list contains the basic block where to hoist and
	// a list of instructions to be hoisted.
	using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;

	using HoistingPointList = SmallVector<HoistingPointInfo, 4>;

	// A map from a pair of VNs to all the instructions with those VNs.
	using VNType = std::pair<unsigned, unsigned>;

	using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;

	// CHI keeps information about values flowing out of a basic block. It is
	// similar to PHI but in the inverse graph, and used for outgoing values on each
	// edge. For conciseness, it is computed only for instructions with multiple
	// occurrences in the CFG because they are the only hoistable candidates.
	// A (CHI[{V, B, I1}, {V, C, I2}]
	// / \
	// / \
	// B(I1) C (I2)
	// The Value number for both I1 and I2 is V, the CHI node will save the
	// instruction as well as the edge where the value is flowing to.
	struct CHIArg {
	  VNType VN;

	  // Edge destination (shows the direction of flow), may not be where the I is.
	  BasicBlock *Dest;

	  // The instruction (VN) which uses the values flowing out of CHI.
	  Instruction *I;

	  // Two CHI arguments compare equal when their value numbers match; Dest and
	  // I do not take part in the comparison. Both operators are const-qualified
	  // so they can also be applied to const objects.
	  bool operator==(const CHIArg &A) const { return VN == A.VN; }
	  bool operator!=(const CHIArg &A) const { return !(*this == A); }
	};

	// Iterator over, and iterator range of, the CHI arguments of one block.
	using CHIIt = SmallVectorImpl<CHIArg>::iterator;
	using CHIArgs = iterator_range<CHIIt>;
	// CHI arguments keyed by the block that holds the CHI.
	using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
	// (value number, instruction) pairs keyed by the block containing the
	// instruction. The pointer declarators are required: the map is indexed with
	// BasicBlock pointers and stores Instruction pointers.
	using InValuesType =
	    DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;

	// An invalid value number, used as the second element of a VNType key when
	// a single value number is inserted into VNtoInsns.
	enum : unsigned { InvalidVN = ~2U };

	// Records all scalar instructions candidate for code hoisting.
	class InsnInfo {
	VNtoInsns VNtoScalars;

	public:
	// Inserts I and its value number in VNtoScalars.
	void insert(Instruction *I, GVN::ValueTable &VN) {
	// Scalar instruction.
	unsigned V = VN.lookupOrAdd(I);
	VNtoScalars[{V, InvalidVN}].push_back(I);
	}

	const VNtoInsns &getVNTable() const { return VNtoScalars; }
	};

	// Records all load instructions candidate for code hoisting.
	class LoadInfo {
	VNtoInsns VNtoLoads;

	public:
	// Insert Load and the value number of its memory address in VNtoLoads.
	void insert(LoadInst *Load, GVN::ValueTable &VN) {
	if (Load->isSimple()) {
	unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
	VNtoLoads[{V, InvalidVN}].push_back(Load);
	}
	}

	const VNtoInsns &getVNTable() const { return VNtoLoads; }
	};

	// Collects the store instructions that are candidates for hoisting, grouped
	// by the value numbers of the address written and of the value stored.
	class StoreInfo {
	  VNtoInsns VNtoStores;

	public:
	  // Files Store under {VN(address), VN(stored value)}. Stores for which
	  // isSimple() is false are ignored.
	  void insert(StoreInst *Store, GVN::ValueTable &VN) {
	    if (Store->isSimple()) {
	      // Number the address first, then the stored value.
	      const unsigned AddrVN = VN.lookupOrAdd(Store->getPointerOperand());
	      const unsigned DataVN = VN.lookupOrAdd(Store->getValueOperand());
	      VNtoStores[{AddrVN, DataVN}].push_back(Store);
	    }
	  }

	  // Read-only view of the collected candidates.
	  const VNtoInsns &getVNTable() const { return VNtoStores; }
	};

	// Records all call instructions candidate for code hoisting.
	class CallInfo {
	VNtoInsns VNtoCallsScalars;
	VNtoInsns VNtoCallsLoads;
	VNtoInsns VNtoCallsStores;

	public:
	// Insert Call and its value numbering in one of the VNtoCalls* containers.
	void insert(CallInst *Call, GVN::ValueTable &VN) {
	// A call that doesNotAccessMemory is handled as a Scalar,
	// onlyReadsMemory will be handled as a Load instruction,
	// all other calls will be handled as stores.
	unsigned V = VN.lookupOrAdd(Call);
	auto Entry = std::make_pair(V, InvalidVN);

	if (Call->doesNotAccessMemory())
	VNtoCallsScalars[Entry].push_back(Call);
	else if (Call->onlyReadsMemory())
	VNtoCallsLoads[Entry].push_back(Call);
	else
	VNtoCallsStores[Entry].push_back(Call);
	}

	const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
	const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
	const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
	};

	static void combineKnownMetadata(Instruction ReplInst, Instruction I) {
	static const unsigned KnownIDs[] = {
	LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
	LLVMContext::MD_noalias, LLVMContext::MD_range,
	LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
	LLVMContext::MD_invariant_group};
	combineMetadata(ReplInst, I, KnownIDs);
	}

	// This pass hoists common computations across branches sharing common
	// dominator. The primary goal is to reduce the code size, and in some
	// cases reduce critical path (by exposing more ILP).
	class GVNHoist {
	public:
	// Stores the analyses the pass works with and creates the MemorySSA updater
	// on top of MSSA. All parameters are pointers, matching the pointer-typed
	// members they initialize and the `&`-taking call in the legacy pass.
	GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
	         MemoryDependenceResults *MD, MemorySSA *MSSA)
	    : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
	      MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {}

	// Entry point of the transformation for function F. Numbers every reachable
	// block and instruction in depth-first order, then repeats hoistExpressions
	// until a round hoists nothing or the gvn-hoist-max-chain-length budget is
	// used up. Returns true when at least one round hoisted something.
	bool run(Function &F) {
	  NumFuncArgs = F.arg_size();
	  VN.setDomTree(DT);
	  VN.setAliasAnalysis(AA);
	  VN.setMemDep(MD);

	  // Perform DFS Numbering of blocks and, inside each block, of instructions.
	  unsigned BlockNo = 0;
	  for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) {
	    DFSNumber[BB] = ++BlockNo;
	    unsigned InsnNo = 0;
	    for (auto &Inst : *BB)
	      DFSNumber[&Inst] = ++InsnNo;
	  }

	  bool Changed = false;

	  // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
	  for (int Round = 0;;) {
	    if (MaxChainLength != -1 && ++Round >= MaxChainLength)
	      break;

	    const std::pair<unsigned, unsigned> Hoisted = hoistExpressions(F);
	    if (Hoisted.first + Hoisted.second == 0)
	      break;

	    // To address a limitation of the current GVN, we need to rerun the
	    // hoisting after we hoisted loads or stores in order to be able to
	    // hoist all scalars dependent on the hoisted ld/st.
	    if (Hoisted.second > 0)
	      VN.clear();

	    Changed = true;
	  }

	  return Changed;
	}

	// Copied from NewGVN.cpp
	// Gives every value a global rank so that operations can be placed in a
	// canonical order. Rank alone is not a complete ordering (all plain
	// constants share rank 0), but operations on constants only are expected to
	// be simplified away, so their relative order does not matter.
	//
	// Ranking: plain constant 0, undef 1, constant expression 2, argument N
	// 3 + N, numbered value 4 + NumFuncArgs + its DFS number, anything else the
	// largest unsigned value.
	unsigned int rank(const Value *V) const {
	  // ConstantExpr and UndefValue must be tested before the generic Constant
	  // case because both are constants as well.
	  if (isa<ConstantExpr>(V))
	    return 2;
	  if (isa<UndefValue>(V))
	    return 1;
	  if (isa<Constant>(V))
	    return 0;
	  if (auto *Arg = dyn_cast<Argument>(V))
	    return 3 + Arg->getArgNo();

	  // Shift the DFS number past the ranks handed out to constants and
	  // arguments above.
	  const auto DFS = DFSNumber.lookup(V);
	  if (DFS > 0)
	    return 4 + NumFuncArgs + DFS;

	  // Unreachable or something else, just return a really large number.
	  return ~0U;
	}

	private:
	// Value-numbering table used by the candidate collectors; run() wires the
	// analyses into it and clears it after a round that hoisted non-scalars.
	GVN::ValueTable VN;
	// Raw pointers to the analyses passed to the constructor.
	DominatorTree *DT;
	PostDominatorTree *PDT;
	AliasAnalysis *AA;
	MemoryDependenceResults *MD;
	MemorySSA *MSSA;
	// Updater created on top of MSSA in the constructor.
	std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
	// DFS numbers assigned by run(): blocks are numbered in depth-first order,
	// instructions are numbered from 1 within their block.
	DenseMap<const Value *, unsigned> DFSNumber;
	// Cache of hasEH() results per basic block.
	BBSideEffectsSet BBSideEffects;
	// Blocks in which hoistExpressions found an instruction that is not
	// guaranteed to transfer execution to its successor.
	DenseSet<const BasicBlock *> HoistBarrier;
	// NOTE(review): not referenced by the methods visible here;
	// computeInsertionPoints declares a local with the same name — confirm
	// whether this member is still needed.
	SmallVector<BasicBlock *, 32> IDFBlocks;
	// Number of arguments of the function being processed; set by run() and
	// used by rank().
	unsigned NumFuncArgs;
	// When false, GEPs are not collected as scalar candidates and are instead
	// cloned on demand next to the hoisted load/store.
	const bool HoistingGeps = false;

	// Category under which a candidate is checked for safety.
	enum InsKind { Unknown, Scalar, Load, Store };

	// Return true when there are exception handling in BB: BB is an EH pad, has
	// its address taken, or ends in a terminator that may throw. The verdict is
	// cached in BBSideEffects so each block is inspected only once.
	bool hasEH(const BasicBlock *BB) {
	  auto It = BBSideEffects.find(BB);
	  if (It != BBSideEffects.end())
	    return It->second;

	  // Short-circuit evaluation keeps the original order of the three checks.
	  const bool Result = BB->isEHPad() || BB->hasAddressTaken() ||
	                      BB->getTerminator()->mayThrow();
	  BBSideEffects[BB] = Result;
	  return Result;
	}

	// Return true when a successor of BB dominates A.
	// Both blocks are passed by pointer, as required by the `->` uses below and
	// by DominatorTree::dominates.
	bool successorDominate(const BasicBlock *BB, const BasicBlock *A) {
	  for (const BasicBlock *Succ : BB->getTerminator()->successors())
	    if (DT->dominates(Succ, A))
	      return true;

	  return false;
	}

	// Return true when I1 appears before I2 in the instructions of BB.
	bool firstInBB(const Instruction I1, const Instruction I2) {
	assert(I1->getParent() == I2->getParent());
	unsigned I1DFS = DFSNumber.lookup(I1);
	unsigned I2DFS = DFSNumber.lookup(I2);
	assert(I1DFS && I2DFS);
	return I1DFS < I2DFS;
	}

	// Return true when there are memory uses of Def in BB.
	bool hasMemoryUse(const Instruction NewPt, MemoryDef Def,
	const BasicBlock *BB) {
	const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB);
	if (!Acc)
	return false;

	Instruction *OldPt = Def->getMemoryInst();
	const BasicBlock *OldBB = OldPt->getParent();
	const BasicBlock *NewBB = NewPt->getParent();
	bool ReachedNewPt = false;

	for (const MemoryAccess &MA : *Acc)
	if (const MemoryUse *MU = dyn_cast<MemoryUse>(&MA)) {
	Instruction *Insn = MU->getMemoryInst();

	// Do not check whether MU aliases Def when MU occurs after OldPt.
	if (BB == OldBB && firstInBB(OldPt, Insn))
	break;

	// Do not check whether MU aliases Def when MU occurs before NewPt.
	if (BB == NewBB) {
	if (!ReachedNewPt) {
	if (firstInBB(Insn, NewPt))
	continue;
	ReachedNewPt = true;
	}
	}
	if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
	return true;
	}

	return false;
	}

	// Per-block check shared by the path walkers. Returns true when hoisting
	// across BB must be refused: the block budget NBBsOnAllPaths is exhausted,
	// BB has exception handling, or BB (other than the source block SrcBB) is
	// a hoist barrier.
	bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
	                 int &NBBsOnAllPaths) {
	  // Stop walk once the limit is reached.
	  if (NBBsOnAllPaths == 0)
	    return true;

	  // Impossible to hoist with exceptions on the path.
	  if (hasEH(BB))
	    return true;

	  // No such instruction after HoistBarrier in a basic block was
	  // selected for hoisting so instructions selected within basic block with
	  // a hoist barrier can be hoisted.
	  if ((BB != SrcBB) && HoistBarrier.count(BB))
	    return true;

	  return false;
	}

	// Return true when there are exception handling or loads of memory Def
	// between Def and NewPt. This function is only called for stores: Def is
	// the MemoryDef of the store to be hoisted.

	// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
	// return true when the counter NBBsOnAllPaths reaces 0, except when it is
	// initialized to -1 which is unlimited.
	bool hasEHOrLoadsOnPath(const Instruction NewPt, MemoryDef Def,
	int &NBBsOnAllPaths) {
	const BasicBlock *NewBB = NewPt->getParent();
	const BasicBlock *OldBB = Def->getBlock();
	assert(DT->dominates(NewBB, OldBB) && "invalid path");
	assert(DT->dominates(Def->getDefiningAccess()->getBlock(), NewBB) &&
	"def does not dominate new hoisting point");

	// Walk all basic blocks reachable in depth-first iteration on the inverse
	// CFG from OldBB to NewBB. These blocks are all the blocks that may be
	// executed between the execution of NewBB and OldBB. Hoisting an expression
	// from OldBB into NewBB has to be safe on all execution paths.
	for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
	const BasicBlock BB = I;
	if (BB == NewBB) {
	// Stop traversal when reaching HoistPt.
	I.skipChildren();
	continue;
	}

	if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
	return true;

	// Check that we do not move a store past loads.
	if (hasMemoryUse(NewPt, Def, BB))
	return true;

	// -1 is unlimited number of blocks on all paths.
	if (NBBsOnAllPaths != -1)
	--NBBsOnAllPaths;

	++I;
	}

	return false;
	}

	// Return true when there are exception handling between HoistPt and BB.
	// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
	// return true when the counter NBBsOnAllPaths reaches 0, except when it is
	// initialized to -1 which is unlimited.
	bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
	                 int &NBBsOnAllPaths) {
	  assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");

	  // Walk all basic blocks reachable in depth-first iteration on
	  // the inverse CFG from SrcBB to HoistPt. These blocks are all the
	  // blocks that may be executed between the execution of HoistPt and
	  // SrcBB. Hoisting an expression from SrcBB into HoistPt has to be safe
	  // on all execution paths.
	  for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
	    const BasicBlock *BB = *I;
	    if (BB == HoistPt) {
	      // Stop traversal when reaching HoistPt.
	      I.skipChildren();
	      continue;
	    }

	    if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
	      return true;

	    // -1 is unlimited number of blocks on all paths.
	    if (NBBsOnAllPaths != -1)
	      --NBBsOnAllPaths;

	    ++I;
	  }

	  return false;
	}

	// Return true when it is safe to hoist a memory load or store U from OldPt
	// to NewPt.
	bool safeToHoistLdSt(const Instruction NewPt, const Instruction OldPt,
	MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
	// In place hoisting is safe.
	if (NewPt == OldPt)
	return true;

	const BasicBlock *NewBB = NewPt->getParent();
	const BasicBlock *OldBB = OldPt->getParent();
	const BasicBlock *UBB = U->getBlock();

	// Check for dependences on the Memory SSA.
	MemoryAccess *D = U->getDefiningAccess();
	BasicBlock *DBB = D->getBlock();
	if (DT->properlyDominates(NewBB, DBB))
	// Cannot move the load or store to NewBB above its definition in DBB.
	return false;

	if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
	if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
	if (firstInBB(NewPt, UD->getMemoryInst()))
	// Cannot move the load or store to NewPt above its definition in D.
	return false;

	// Check for unsafe hoistings due to side effects.
	if (K == InsKind::Store) {
	if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths))
	return false;
	} else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
	return false;

	if (UBB == NewBB) {
	if (DT->properlyDominates(DBB, NewBB))
	return true;
	assert(UBB == DBB);
	assert(MSSA->locallyDominates(D, U));
	}

	// No side effects: it is safe to hoist.
	return true;
	}

	// Return true when it is safe to hoist a scalar instruction from BB to
	// HoistBB, i.e. when hasEHOnPath finds nothing that blocks hoisting between
	// the two blocks. NBBsOnAllPaths is the remaining block budget.
	bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
	                       int &NBBsOnAllPaths) {
	  return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
	}

	// In the inverse CFG, the dominance frontier of basic block (BB) is the
	// point where ANTIC needs to be computed for instructions which are going
	// to be hoisted. Since this point does not change during gvn-hoist,
	// we compute it only once (on demand).
	// The idea is inspired by:
	// "Partial Redundancy Elimination in SSA Form"
	// ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
	// They use a similar idea in the forward graph to find fully redundant and
	// partially redundant expressions, here it is used in the inverse graph to
	// find fully anticipable instructions at merge point (post-dominator in
	// the inverse CFG).
	// Returns the edge via which an instruction in BB will get the values from.

	// Returns true when the values are flowing out to each edge: the CHI range
	// C must hold at least as many arguments as TI has successors, and the
	// destination of every argument must be one of those successors.
	bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
	  const unsigned NumArgs = (unsigned)std::distance(C.begin(), C.end());
	  if (TI->getNumSuccessors() > NumArgs)
	    return false; // Not enough args in this CHI.

	  for (const CHIArg &Arg : C) {
	    // Find if the edge recorded in this argument leaves through TI.
	    bool IsSuccessor = false;
	    for (const BasicBlock *Succ : TI->successors())
	      if (Succ == Arg.Dest) {
	        IsSuccessor = true;
	        break;
	      }
	    if (!IsSuccessor)
	      return false;
	  }
	  return true;
	}

	// Filters the CHI arguments in C: every argument whose instruction can be
	// hoisted safely to BB is appended to Safe. Arguments without an
	// instruction are skipped. Scalars are checked with safeToHoistScalar, all
	// other kinds with safeToHoistLdSt against the terminator of BB. A single
	// block budget is shared by all arguments of the range.
	void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
	                 SmallVectorImpl<CHIArg> &Safe) {
	  int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
	  for (auto CHI : C) {
	    Instruction *Insn = CHI.I;
	    if (!Insn) // No instruction was inserted in this CHI.
	      continue;

	    bool IsSafe;
	    if (K == InsKind::Scalar) {
	      IsSafe = safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths);
	    } else {
	      MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn);
	      IsSafe =
	          safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths);
	    }

	    if (IsSafe)
	      Safe.push_back(CHI);
	  }
	}

	// One stack of instructions per value number, used during the walk of the
	// post-dominator tree.
	using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;

	// Push all the VNs corresponding to BB into RenameStack. Blocks without an
	// entry in ValueBBs contribute nothing.
	void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
	                     RenameStackType &RenameStack) {
	  auto Entry = ValueBBs.find(BB);
	  if (Entry == ValueBBs.end())
	    return;

	  // Iterate in reverse order to keep lower ranked values on the top.
	  for (std::pair<VNType, Instruction *> &VI : reverse(Entry->second)) {
	    // Get the value of instruction I
	    DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
	    RenameStack[VI.first].push_back(VI.second);
	  }
	}

	// Fill the still-empty CHI arguments of the predecessors of BB. For every
	// predecessor that owns CHIs, each run of arguments with the same value
	// number gets at most one argument completed: the edge destination is set
	// to BB and the instruction is popped from the top of the rename stack of
	// that value number.
	// NOTE(review): the two lines marked '-' and '+' below are the old and new
	// side of a diff hunk; only the '+' form (properlyDominates) belongs in the
	// resulting source.
	void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
	RenameStackType &RenameStack) {
	// For each predecessor (because Post-DOM) of BB check if it has a CHI
	for (auto Pred : predecessors(BB)) {
	auto P = CHIBBs.find(Pred);
	if (P == CHIBBs.end()) {
	continue;
	}
	DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName(););
	// A CHI is found (BB -> Pred is an edge in the CFG)
	// Pop the stack until Top(V) = Ve.
	auto &VCHI = P->second;
	for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) {
	CHIArg &C = *It;
	// Only arguments whose edge has not been assigned yet are candidates.
	if (!C.Dest) {
	auto si = RenameStack.find(C.VN);
	// The Basic Block where CHI is must dominate the value we want to
	// track in a CHI. In the PDom walk, there can be values in the
	// stack which are not control dependent e.g., nested loop.
	if (si != RenameStack.end() && si->second.size() &&
	- DT->dominates(Pred, si->second.back()->getParent())) {
	+ DT->properlyDominates(Pred, si->second.back()->getParent())) {
	C.Dest = BB; // Assign the edge
	C.I = si->second.pop_back_val(); // Assign the argument
	DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
	<< *C.I << ", VN: " << C.VN.first << ", "
	<< C.VN.second);
	}
	// Move to next CHI of a different value
	// (CHIArg comparison looks at the value number only).
	It = std::find_if(It, VCHI.end(),
	[It](CHIArg &A) { return A != *It; });
	} else
	++It;
	}
	}
	}

	// Walk the post-dominator tree top-down, keeping one stack per value number
	// that holds the values seen so far. Whenever a CHI is reached from a given
	// edge, the value on top of the matching stack becomes its argument and is
	// popped. Does nothing when the tree has no node for the null block.
	void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
	  auto PDTRoot = PDT->getNode(nullptr);
	  if (!PDTRoot)
	    return;

	  // Depth first walk on PDom tree to fill the CHIargs at each PDF.
	  RenameStackType RenameStack;
	  for (auto Node : depth_first(PDTRoot)) {
	    if (BasicBlock *BB = Node->getBlock()) {
	      // Collect all values in BB and push to stack.
	      fillRenameStack(BB, ValueBBs, RenameStack);

	      // Fill outgoing values in each CHI corresponding to BB.
	      fillChiArgs(BB, CHIBBs, RenameStack);
	    }
	  }
	}

	// Walk all the CHI-nodes, group their arguments by value number, keep the
	// arguments which are safe to hoist and see if
	// they form a list of anticipable values. OutValues contains CHIs
	// corresponding to each basic block.
	// For every group that is anticipable at the terminator of its block, one
	// entry {block, instructions of the safe arguments} is appended to HPL.
	// K is forwarded to checkSafety.
	void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
	HoistingPointList &HPL) {
	auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };

	// CHIArgs now have the outgoing values, so check for anticipability and
	// accumulate hoistable candidates in HPL.
	for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
	BasicBlock *BB = A.first;
	SmallVectorImpl<CHIArg> &CHIs = A.second;
	// Vector of PHIs contains PHIs for different instructions.
	// Sort the args according to their VNs, such that identical
	// instructions are together.
	std::stable_sort(CHIs.begin(), CHIs.end(), cmpVN);
	auto TI = BB->getTerminator();
	auto B = CHIs.begin();
	// [PrevIt, PHIIt) form a range of CHIs which have identical VNs.
	auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(),
	[B](CHIArg &A) { return A != *B; });
	auto PrevIt = CHIs.begin();
	// The loop ends when the current range is empty, i.e. PrevIt has reached
	// the end of CHIs.
	while (PrevIt != PHIIt) {
	// Collect values which satisfy safety checks.
	SmallVector<CHIArg, 2> Safe;
	// We check for safety first because there might be multiple values in
	// the same path, some of which are not safe to be hoisted, but overall
	// each edge has at least one value which can be hoisted, making the
	// value anticipable along that path.
	checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);

	// List of safe values should be anticipable at TI.
	if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
	HPL.push_back({BB, SmallVecInsn()});
	SmallVecInsn &V = HPL.back().second;
	for (auto B : Safe)
	V.push_back(B.I);
	}

	// Check other VNs
	PrevIt = PHIIt;
	PHIIt = std::find_if(PrevIt, CHIs.end(),
	[PrevIt](CHIArg &A) { return A != *PrevIt; });
	}
	}
	}

	// Compute insertion points for each values which can be fully anticipated at
	// a dominator. HPL contains all such values.
	// Map holds the candidates grouped by value number; K is the kind under
	// which the candidates are later checked for safety. Value numbers with
	// fewer than two instructions are ignored.
	void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
	InsKind K) {
	// Sort VNs based on their rankings
	std::vector<VNType> Ranks;
	for (const auto &Entry : Map) {
	Ranks.push_back(Entry.first);
	}

	// TODO: Remove fully-redundant expressions.
	// Get instruction from the Map, assume that all the Instructions
	// with same VNs have same rank (this is an approximation).
	// Only the first instruction recorded for each VN is ranked.
	std::sort(Ranks.begin(), Ranks.end(),
	[this, &Map](const VNType &r1, const VNType &r2) {
	return (rank(*Map.lookup(r1).begin()) <
	rank(*Map.lookup(r2).begin()));
	});

	// - Sort VNs according to their rank, and start with lowest ranked VN
	// - Take a VN and for each instruction with same VN
	// - Find the dominance frontier in the inverse graph (PDF)
	// - Insert the chi-node at PDF
	// - Remove the chi-nodes with missing entries
	// - Remove values from CHI-nodes which do not truly flow out, e.g.,
	// modified along the path.
	// - Collect the remaining values that are still anticipable
	SmallVector<BasicBlock *, 2> IDFBlocks;
	ReverseIDFCalculator IDFs(*PDT);
	OutValuesType OutValue;
	InValuesType InValue;
	for (const auto &R : Ranks) {
	const SmallVecInsn &V = Map.lookup(R);
	if (V.size() < 2)
	continue;
	const VNType &VN = R;
	// Blocks containing an instruction with this VN, excluding blocks for
	// which hasEH() is true.
	SmallPtrSet<BasicBlock *, 2> VNBlocks;
	for (auto &I : V) {
	BasicBlock *BBI = I->getParent();
	if (!hasEH(BBI))
	VNBlocks.insert(BBI);
	}
	// Compute the Post Dominance Frontiers of each basic block
	// The dominance frontier of a live block X in the reverse
	// control graph is the set of blocks upon which X is control
	// dependent.
	// NOTE(review): the rest of this comment mentions dead terminators and
	// NewLiveBlocks, which do not exist in this function; it looks carried
	// over from another pass — confirm and trim. Original text follows:
	// The following sequence computes the set of blocks
	// which currently have dead terminators that are control
	// dependence sources of a block which is in NewLiveBlocks.
	IDFs.setDefiningBlocks(VNBlocks);
	IDFs.calculate(IDFBlocks);

	// Make a map of BB vs instructions to be hoisted.
	for (unsigned i = 0; i < V.size(); ++i) {
	InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
	}
	// Insert empty CHI node for this VN. This is used to factor out
	// basic blocks where the ANTIC can potentially change.
	for (auto IDFB : IDFBlocks) { // TODO: Prune out useless CHI insertions.
	for (unsigned i = 0; i < V.size(); ++i) {
	// Empty argument: edge and instruction are filled in by insertCHI.
	CHIArg C = {VN, nullptr, nullptr};
	// Ignore spurious PDFs.
	if (DT->properlyDominates(IDFB, V[i]->getParent())) {
	OutValue[IDFB].push_back(C);
	DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName()
	<< ", for Insn: " << *V[i]);
	}
	}
	}
	}

	// Insert CHI args at each PDF to iterate on factored graph of
	// control dependence.
	insertCHI(InValue, OutValue);
	// Using the CHI args inserted at each PDF, find fully anticipable values.
	findHoistableCandidates(OutValue, K, HPL);
	}

	// Return true when every instruction operand of I is defined in a block
	// that dominates HoistPt. When the number of hoisted expressions is
	// limited, a load could be hoisted without its access function, so this
	// must hold before any expression is moved to the insertion point.
	bool allOperandsAvailable(const Instruction *I,
	                          const BasicBlock *HoistPt) const {
	  for (const Use &Op : I->operands()) {
	    const auto *Inst = dyn_cast<Instruction>(&Op);
	    // Operands that are not instructions are always available.
	    if (Inst && !DT->dominates(Inst->getParent(), HoistPt))
	      return false;
	  }

	  return true;
	}

	// Same as allOperandsAvailable, except that an operand which is not
	// available is still accepted when it is a GEP whose own operands are
	// (recursively) available.
	bool allGepOperandsAvailable(const Instruction *I,
	                             const BasicBlock *HoistPt) const {
	  for (const Use &Op : I->operands()) {
	    const auto *Inst = dyn_cast<Instruction>(&Op);
	    // Non-instruction operands and operands defined in a dominating block
	    // need no further work.
	    if (!Inst || DT->dominates(Inst->getParent(), HoistPt))
	      continue;

	    // An unavailable operand is only tolerated when it is a GEP that can be
	    // made available itself; any other instruction defined in a block not
	    // dominating HoistPt rules the candidate out.
	    const GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Inst);
	    if (!GepOp || !allGepOperandsAvailable(GepOp, HoistPt))
	      return false;
	  }
	  return true;
	}

	// Make all operands of the GEP available.
	void makeGepsAvailable(Instruction Repl, BasicBlock HoistPt,
	const SmallVecInsn &InstructionsToHoist,
	Instruction *Gep) const {
	assert(allGepOperandsAvailable(Gep, HoistPt) &&
	"GEP operands not available");

	Instruction *ClonedGep = Gep->clone();
	for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
	if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
	// Check whether the operand is already available.
	if (DT->dominates(Op->getParent(), HoistPt))
	continue;

	// As a GEP can refer to other GEPs, recursively make all the operands
	// of this GEP available at HoistPt.
	if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
	makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
	}

	// Copy Gep and replace its uses in Repl with ClonedGep.
	ClonedGep->insertBefore(HoistPt->getTerminator());

	// Conservatively discard any optimization hints, they may differ on the
	// other paths.
	ClonedGep->dropUnknownNonDebugMetadata();

	// If we have optimization hints which agree with each other along different
	// paths, preserve them.
	for (const Instruction *OtherInst : InstructionsToHoist) {
	const GetElementPtrInst *OtherGep;
	if (auto *OtherLd = dyn_cast<LoadInst>(OtherInst))
	OtherGep = cast<GetElementPtrInst>(OtherLd->getPointerOperand());
	else
	OtherGep = cast<GetElementPtrInst>(
	cast<StoreInst>(OtherInst)->getPointerOperand());
	ClonedGep->andIRFlags(OtherGep);
	}

	// Replace uses of Gep with ClonedGep in Repl.
	Repl->replaceUsesOfWith(Gep, ClonedGep);
	}

	void updateAlignment(Instruction I, Instruction Repl) {
	if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
	ReplacementLoad->setAlignment(
	std::min(ReplacementLoad->getAlignment(),
	cast<LoadInst>(I)->getAlignment()));
	++NumLoadsRemoved;
	} else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
	ReplacementStore->setAlignment(
	std::min(ReplacementStore->getAlignment(),
	cast<StoreInst>(I)->getAlignment()));
	++NumStoresRemoved;
	} else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
	ReplacementAlloca->setAlignment(
	std::max(ReplacementAlloca->getAlignment(),
	cast<AllocaInst>(I)->getAlignment()));
	} else if (isa<CallInst>(Repl)) {
	++NumCallsRemoved;
	}
	}

	// Remove all the instructions in Candidates and replace their usage with Repl.
	// Returns the number of instructions removed.
	unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
	MemoryUseOrDef *NewMemAcc) {
	unsigned NR = 0;
	for (Instruction *I : Candidates) {
	if (I != Repl) {
	++NR;
	updateAlignment(I, Repl);
	if (NewMemAcc) {
	// Update the uses of the old MSSA access with NewMemAcc.
	MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
	OldMA->replaceAllUsesWith(NewMemAcc);
	MSSAUpdater->removeMemoryAccess(OldMA);
	}

	Repl->andIRFlags(I);
	combineKnownMetadata(Repl, I);
	I->replaceAllUsesWith(Repl);
	// Also invalidate the Alias Analysis cache.
	MD->removeInstruction(I);
	I->eraseFromParent();
	}
	}
	return NR;
	}

	// Removes the MemoryPhis that have become trivial: every MemoryPhi using
	// NewMemAcc whose incoming values are all NewMemAcc is replaced by
	// NewMemAcc and deleted.
	void raMPHIuw(MemoryUseOrDef *NewMemAcc) {
	  // Gather the phis first; the user list changes while phis are replaced.
	  SmallPtrSet<MemoryPhi *, 4> PhiUsers;
	  for (User *U : NewMemAcc->users()) {
	    MemoryPhi *Phi = dyn_cast<MemoryPhi>(U);
	    if (Phi)
	      PhiUsers.insert(Phi);
	  }

	  for (MemoryPhi *Phi : PhiUsers) {
	    const bool AllSame = llvm::all_of(
	        Phi->incoming_values(), [&](Use &U) { return U == NewMemAcc; });
	    if (!AllSame)
	      continue;

	    Phi->replaceAllUsesWith(NewMemAcc);
	    MSSAUpdater->removeMemoryAccess(Phi);
	  }
	}

	// Remove all other instructions and replace them with Repl.
	unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl,
	BasicBlock *DestBB, bool MoveAccess) {
	MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl);
	if (MoveAccess && NewMemAcc) {
	// The definition of this ld/st will not change: ld/st hoisting is
	// legal when the ld/st is not moved past its current definition.
	MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End);
	}

	// Replace all other instructions with Repl with memory access NewMemAcc.
	unsigned NR = rauw(Candidates, Repl, NewMemAcc);

	// Remove MemorySSA phi nodes with the same arguments.
	if (NewMemAcc)
	raMPHIuw(NewMemAcc);
	return NR;
	}

	// In the case Repl is a load or a store, we make all their GEPs
	// available: GEPs are not hoisted by default to avoid the address
	// computations to be hoisted without the associated load or store.
	bool makeGepOperandsAvailable(Instruction Repl, BasicBlock HoistPt,
	const SmallVecInsn &InstructionsToHoist) const {
	// Check whether the GEP of a ld/st can be synthesized at HoistPt.
	GetElementPtrInst *Gep = nullptr;
	Instruction *Val = nullptr;
	if (auto *Ld = dyn_cast<LoadInst>(Repl)) {
	Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
	} else if (auto *St = dyn_cast<StoreInst>(Repl)) {
	Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
	Val = dyn_cast<Instruction>(St->getValueOperand());
	// Check that the stored value is available.
	if (Val) {
	if (isa<GetElementPtrInst>(Val)) {
	// Check whether we can compute the GEP at HoistPt.
	if (!allGepOperandsAvailable(Val, HoistPt))
	return false;
	} else if (!DT->dominates(Val->getParent(), HoistPt))
	return false;
	}
	}

	// Check whether we can compute the Gep at HoistPt.
	if (!Gep \|\| !allGepOperandsAvailable(Gep, HoistPt))
	return false;

	makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);

	if (Val && isa<GetElementPtrInst>(Val))
	makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);

	return true;
	}

	// Performs the hoistings recorded in HPL. For each entry, one instruction
	// (Repl) is kept in, or moved to, the destination block and every other
	// instruction of the entry is replaced by it. Entries whose operands cannot
	// be made available at the destination are skipped.
	// Returns {number of scalars hoisted, number of loads + calls + stores
	// hoisted} and updates the pass statistics.
	std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL) {
	  unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
	  for (const HoistingPointInfo &HP : HPL) {
	    // Find out whether we already have one of the instructions in HoistPt,
	    // in which case we do not have to move it.
	    BasicBlock *DestBB = HP.first;
	    const SmallVecInsn &InstructionsToHoist = HP.second;
	    Instruction *Repl = nullptr;
	    for (Instruction *I : InstructionsToHoist)
	      if (I->getParent() == DestBB)
	        // If there are two instructions in HoistPt to be hoisted in place:
	        // update Repl to be the first one, such that we can rename the uses
	        // of the second based on the first.
	        if (!Repl || firstInBB(I, Repl))
	          Repl = I;

	    // Keep track of whether we moved the instruction so we know whether we
	    // should move the MemoryAccess.
	    bool MoveAccess = true;
	    if (Repl) {
	      // Repl is already in HoistPt: it remains in place.
	      assert(allOperandsAvailable(Repl, DestBB) &&
	             "instruction depends on operands that are not available");
	      MoveAccess = false;
	    } else {
	      // When we do not find Repl in HoistPt, select the first in the list
	      // and move it to HoistPt.
	      Repl = InstructionsToHoist.front();

	      // We can move Repl in HoistPt only when all operands are available.
	      // The order in which hoistings are done may influence the availability
	      // of operands.
	      if (!allOperandsAvailable(Repl, DestBB)) {
	        // When HoistingGeps there is nothing more we can do to make the
	        // operands available: just continue.
	        if (HoistingGeps)
	          continue;

	        // When not HoistingGeps we need to copy the GEPs.
	        if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
	          continue;
	      }

	      // Move the instruction at the end of HoistPt.
	      Instruction *Last = DestBB->getTerminator();
	      MD->removeInstruction(Repl);
	      Repl->moveBefore(Last);

	      // Repl takes over the DFS number of the terminator, which moves one
	      // position further. The two map accesses are sequenced explicitly: a
	      // subscript that inserts may grow the map, so a reference obtained
	      // from one subscript must not be held across the other.
	      const unsigned LastDFS = DFSNumber[Last]++;
	      DFSNumber[Repl] = LastDFS;
	    }

	    NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);

	    if (isa<LoadInst>(Repl))
	      ++NL;
	    else if (isa<StoreInst>(Repl))
	      ++NS;
	    else if (isa<CallInst>(Repl))
	      ++NC;
	    else // Scalar
	      ++NI;
	  }

	  NumHoisted += NL + NS + NC + NI;
	  NumRemoved += NR;
	  NumLoadsHoisted += NL;
	  NumStoresHoisted += NS;
	  NumCallsHoisted += NC;
	  return {NI, NL + NC + NS};
	}

	// Hoist all expressions. Returns Number of scalars hoisted
	// and number of non-scalars hoisted.
	//
	// Scans the reachable blocks of F in depth-first order, collects loads,
	// stores, calls and scalars as candidates, computes the insertion points
	// for each category and finally performs the hoisting. Scanning of a block
	// stops at the first instruction that may not transfer execution to its
	// successor (the block is then recorded as a hoist barrier), at the
	// gvn-hoist-max-depth limit, at the terminator, and at calls that may have
	// side effects or are convergent.
	std::pair<unsigned, unsigned> hoistExpressions(Function &F) {
	  InsnInfo II;
	  LoadInfo LI;
	  StoreInfo SI;
	  CallInfo CI;
	  for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
	    int InstructionNb = 0;
	    for (Instruction &I1 : *BB) {
	      // If I1 cannot guarantee progress, subsequent instructions
	      // in BB cannot be hoisted anyways.
	      if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
	        HoistBarrier.insert(BB);
	        break;
	      }
	      // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
	      // deeper may increase the register pressure and compilation time.
	      if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
	        break;

	      // Do not value number terminator instructions.
	      if (isa<TerminatorInst>(&I1))
	        break;

	      if (auto *Load = dyn_cast<LoadInst>(&I1))
	        LI.insert(Load, VN);
	      else if (auto *Store = dyn_cast<StoreInst>(&I1))
	        SI.insert(Store, VN);
	      else if (auto *Call = dyn_cast<CallInst>(&I1)) {
	        if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
	          // Debug-info, assume and sideeffect intrinsics are neither
	          // candidates nor a reason to stop scanning the block.
	          if (isa<DbgInfoIntrinsic>(Intr) ||
	              Intr->getIntrinsicID() == Intrinsic::assume ||
	              Intr->getIntrinsicID() == Intrinsic::sideeffect)
	            continue;
	        }
	        if (Call->mayHaveSideEffects())
	          break;

	        if (Call->isConvergent())
	          break;

	        CI.insert(Call, VN);
	      } else if (HoistingGeps || !isa<GetElementPtrInst>(&I1))
	        // Do not hoist scalars past calls that may write to memory because
	        // that could result in spills later. geps are handled separately.
	        // TODO: We can relax this for targets like AArch64 as they have more
	        // registers than X86.
	        II.insert(&I1, VN);
	    }
	  }

	  HoistingPointList HPL;
	  computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
	  computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
	  computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store);
	  computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar);
	  computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load);
	  computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
	  return hoist(HPL);
	}
	};

	class GVNHoistLegacyPass : public FunctionPass {
	public:
	static char ID;

	GVNHoistLegacyPass() : FunctionPass(ID) {
	initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry());
	}

	bool runOnFunction(Function &F) override {
	if (skipFunction(F))
	return false;
	auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
	auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
	auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
	auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();

	GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
	return G.run(F);
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<PostDominatorTreeWrapperPass>();
	AU.addRequired<AAResultsWrapperPass>();
	AU.addRequired<MemoryDependenceWrapperPass>();
	AU.addRequired<MemorySSAWrapperPass>();
	AU.addPreserved<DominatorTreeWrapperPass>();
	AU.addPreserved<MemorySSAWrapperPass>();
	AU.addPreserved<GlobalsAAWrapperPass>();
	}
	};

	} // end namespace llvm

	// New pass manager entry point: fetch the analyses, run GVNHoist, and report
	// which analyses are still valid afterwards.
	PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
	auto &DomTree = AM.getResult<DominatorTreeAnalysis>(F);
	auto &PostDomTree = AM.getResult<PostDominatorTreeAnalysis>(F);
	auto &Alias = AM.getResult<AAManager>(F);
	auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
	auto &SSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();

	GVNHoist Hoister(&DomTree, &PostDomTree, &Alias, &MemDep, &SSA);
	if (Hoister.run(F)) {
	// Something was hoisted: only these analyses are kept.
	PreservedAnalyses Preserved;
	Preserved.preserve<DominatorTreeAnalysis>();
	Preserved.preserve<MemorySSAAnalysis>();
	Preserved.preserve<GlobalsAA>();
	return Preserved;
	}

	// Nothing changed, so every analysis is still valid.
	return PreservedAnalyses::all();
	}

	char GVNHoistLegacyPass::ID = 0;

	// Register the legacy pass. Every analysis requested through addRequired<>()
	// in GVNHoistLegacyPass::getAnalysisUsage() is listed as an initialization
	// dependency, including the post-dominator tree.
	INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
	"Early GVN Hoisting of Expressions", false, false)
	INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
	"Early GVN Hoisting of Expressions", false, false)

	// Factory used by clients that want to add the legacy pass to a pipeline.
	FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
	Index: vendor/llvm/dist-release_60/lib/Transforms/Scalar/StructurizeCFG.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Transforms/Scalar/StructurizeCFG.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Transforms/Scalar/StructurizeCFG.cpp (revision 328362)
	@@ -1,953 +1,899 @@
	//===- StructurizeCFG.cpp -------------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Analysis/DivergenceAnalysis.h"
	-#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/RegionInfo.h"
	#include "llvm/Analysis/RegionIterator.h"
	#include "llvm/Analysis/RegionPass.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/SSAUpdater.h"
	#include <algorithm>
	#include <cassert>
	#include <utility>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "structurizecfg"

	// The name for newly created blocks.
	static const char *const FlowBlockName = "Flow";

	namespace {

	// Definition of the complex types used in this pass.

	// A basic block paired with a value associated with it.
	using BBValuePair = std::pair<BasicBlock *, Value *>;

	using RNVector = SmallVector<RegionNode *, 8>;
	using BBVector = SmallVector<BasicBlock *, 8>;
	using BranchVector = SmallVector<BranchInst *, 8>;
	using BBValueVector = SmallVector<BBValuePair, 2>;

	using BBSet = SmallPtrSet<BasicBlock *, 8>;

	using PhiMap = MapVector<PHINode *, BBValueVector>;
	using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;

	using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
	// Maps a predecessor block to the condition value recorded for it.
	using BBPredicates = DenseMap<BasicBlock *, Value *>;
	using PredMap = DenseMap<BasicBlock *, BBPredicates>;
	using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;

	/// Finds the nearest common dominator of a set of BasicBlocks.
	///
	/// For every BB you add to the set, you can specify whether we "remember" the
	/// block. When you get the common dominator, you can also ask whether it's one
	/// of the blocks we remembered.
	class NearestCommonDominator {
	DominatorTree *DT;
	BasicBlock *Result = nullptr;
	bool ResultIsRemembered = false;

	/// Add BB to the resulting dominator.
	void addBlock(BasicBlock *BB, bool Remember) {
	// The first block added is trivially the result so far.
	if (!Result) {
	Result = BB;
	ResultIsRemembered = Remember;
	return;
	}

	BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
	// The result moved to a different block, so the old "remembered" state no
	// longer applies.
	if (NewResult != Result)
	ResultIsRemembered = false;
	// BB itself is the common dominator; carry over its "remember" flag.
	if (NewResult == BB)
	ResultIsRemembered |= Remember;
	Result = NewResult;
	}

	public:
	explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}

	void addBlock(BasicBlock *BB) {
	addBlock(BB, /* Remember = */ false);
	}

	void addAndRememberBlock(BasicBlock *BB) {
	addBlock(BB, /* Remember = */ true);
	}

	/// Get the nearest common dominator of all the BBs added via addBlock() and
	/// addAndRememberBlock(). Returns nullptr if no block has been added yet.
	BasicBlock *result() { return Result; }

	/// Is the BB returned by result() one of the blocks we added to the set
	/// with addAndRememberBlock()?
	bool resultIsRememberedBlock() { return ResultIsRemembered; }
	};

	/// @brief Transforms the control flow graph on one single entry/exit region
	/// at a time.
	///
	/// After the transform all "If"/"Then"/"Else" style control flow looks like
	/// this:
	///
	/// \verbatim
	/// 1
	/// ||
	/// | |
	/// 2 |
	/// | /
	/// |/
	/// 3
	/// ||   Where:
	/// | |  1 = "If" block, calculates the condition
	/// 4 |  2 = "Then" subregion, runs if the condition is true
	/// | /  3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
	/// |/   4 = "Else" optional subregion, runs if the condition is false
	/// 5    5 = "End" block, also rejoins the control flow
	/// \endverbatim
	///
	/// Control flow is expressed as a branch where the true exit goes into the
	/// "Then"/"Else" region, while the false exit skips the region.
	/// The condition for the optional "Else" region is expressed as a PHI node.
	/// The incoming values of the PHI node are true for the "If" edge and false
	/// for the "Then" edge.
	///
	/// Additionally to that even complicated loops look like this:
	///
	/// \verbatim
	/// 1
	/// ||
	/// | |
	/// 2 ^  Where:
	/// | /  1 = "Entry" block
	/// |/   2 = "Loop" optional subregion, with all exits at "Flow" block
	/// 3    3 = "Flow" block, with back edge to entry block
	/// |
	/// \endverbatim
	///
	/// The back edge of the "Flow" block is always on the false side of the branch
	/// while the true side continues the general flow. So the loop condition
	/// consist of a network of PHI nodes where the true incoming values expresses
	/// breaks and the false values expresses continue states.
	class StructurizeCFG : public RegionPass {
	bool SkipUniformRegions;

	// Boolean type and constants, set up in doInitialization().
	Type *Boolean;
	ConstantInt *BoolTrue;
	ConstantInt *BoolFalse;
	UndefValue *BoolUndef;

	// Function and region currently being processed by runOnRegion().
	Function *Func;
	Region *ParentRegion;

	DominatorTree *DT;
	- LoopInfo *LI;

	- SmallVector<RegionNode *, 8> Order;
	+ std::deque<RegionNode *> Order;
	BBSet Visited;

	BBPhiMap DeletedPhis;
	BB2BBVecMap AddedPhis;

	PredMap Predicates;
	BranchVector Conditions;

	BB2BBMap Loops;
	PredMap LoopPreds;
	BranchVector LoopConds;

	RegionNode *PrevNode;

	void orderNodes();

	void analyzeLoops(RegionNode *N);

	Value *invert(Value *Condition);

	Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);

	void gatherPredicates(RegionNode *N);

	- void collectInfos();
	+ void analyzeNode(RegionNode *N);

	void insertConditions(bool Loops);

	void delPhiValues(BasicBlock *From, BasicBlock *To);

	void addPhiValues(BasicBlock *From, BasicBlock *To);

	void setPhiValues();

	void killTerminator(BasicBlock *BB);

	void changeExit(RegionNode *Node, BasicBlock *NewExit,
	bool IncludeDominator);

	BasicBlock *getNextFlow(BasicBlock *Dominator);

	BasicBlock *needPrefix(bool NeedEmpty);

	BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);

	void setPrevNode(BasicBlock *BB);

	bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);

	bool isPredictableTrue(RegionNode *Node);

	void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);

	void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);

	void createFlow();

	void rebuildSSA();

	public:
	static char ID;

	explicit StructurizeCFG(bool SkipUniformRegions = false)
	: RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
	initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
	}

	bool doInitialization(Region *R, RGPassManager &RGM) override;

	bool runOnRegion(Region *R, RGPassManager &RGM) override;

	StringRef getPassName() const override { return "Structurize control flow"; }

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	// Divergence information is only needed when uniform regions are skipped.
	if (SkipUniformRegions)
	AU.addRequired<DivergenceAnalysis>();
	AU.addRequiredID(LowerSwitchID);
	AU.addRequired<DominatorTreeWrapperPass>();
	- AU.addRequired<LoopInfoWrapperPass>();

	AU.addPreserved<DominatorTreeWrapperPass>();
	RegionPass::getAnalysisUsage(AU);
	}
	};

	} // end anonymous namespace

	// Pass identification; the pass constructor passes this ID to RegionPass.
	char StructurizeCFG::ID = 0;

	// Register the pass under the command-line name "structurizecfg" together
	// with the passes it lists as initialization dependencies.
	INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
	false, false)
	INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
	INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
	INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
	false, false)

	/// \brief Cache the i1 type and the true/false/undef constants of the
	/// region's context for later use by the pass. Always returns false.
	bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
	LLVMContext &Ctx = R->getEntry()->getContext();

	// The boolean type first: the undef constant below is derived from it.
	Boolean = Type::getInt1Ty(Ctx);
	BoolTrue = ConstantInt::getTrue(Ctx);
	BoolFalse = ConstantInt::getFalse(Ctx);
	BoolUndef = UndefValue::get(Boolean);

	return false;
	}

	/// \brief Build up the general order of nodes
	///
	/// After this change the nodes of ParentRegion are appended to Order in
	/// reverse post-order and each node is analyzed as it is appended.
	void StructurizeCFG::orderNodes() {
	- ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
	- SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
	+ assert(Visited.empty());
	+ assert(Predicates.empty());
	+ assert(Loops.empty());
	+ assert(LoopPreds.empty());

	- // The reverse post-order traversal of the list gives us an ordering close
	- // to what we want. The only problem with it is that sometimes backedges
	- // for outer loops will be visited before backedges for inner loops.
	- for (RegionNode *RN : RPOT) {
	- BasicBlock *BB = RN->getEntry();
	- Loop *Loop = LI->getLoopFor(BB);
	- ++LoopBlocks[Loop];
	+ // This must be RPO order for the back edge detection to work
	+ for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
	+ // FIXME: Is there a better order to use for structurization?
	+ Order.push_back(RN);
	+ analyzeNode(RN);
	}
	-
	- unsigned CurrentLoopDepth = 0;
	- Loop *CurrentLoop = nullptr;
	- for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
	- BasicBlock *BB = (*I)->getEntry();
	- unsigned LoopDepth = LI->getLoopDepth(BB);
	-
	- if (is_contained(Order, *I))
	- continue;
	-
	- if (LoopDepth < CurrentLoopDepth) {
	- // Make sure we have visited all blocks in this loop before moving back to
	- // the outer loop.
	-
	- auto LoopI = I;
	- while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
	- LoopI++;
	- BasicBlock *LoopBB = (*LoopI)->getEntry();
	- if (LI->getLoopFor(LoopBB) == CurrentLoop) {
	- --BlockCount;
	- Order.push_back(*LoopI);
	- }
	- }
	- }
	-
	- CurrentLoop = LI->getLoopFor(BB);
	- if (CurrentLoop)
	- LoopBlocks[CurrentLoop]--;
	-
	- CurrentLoopDepth = LoopDepth;
	- Order.push_back(*I);
	- }
	-
	- // This pass originally used a post-order traversal and then operated on
	- // the list in reverse. Now that we are using a reverse post-order traversal
	- // rather than re-working the whole pass to operate on the list in order,
	- // we just reverse the list and continue to operate on it in reverse.
	- std::reverse(Order.begin(), Order.end());
	}

	/// \brief Record back edges leaving N: every already visited target is
	/// entered into Loops, mapped to the block the edge comes from.
	void StructurizeCFG::analyzeLoops(RegionNode *N) {
	if (!N->isSubRegion()) {
	// Plain block: any successor we have seen before is a back edge target.
	BasicBlock *Block = N->getNodeAs<BasicBlock>();
	BranchInst *Branch = cast<BranchInst>(Block->getTerminator());

	for (BasicBlock *Target : Branch->successors()) {
	if (Visited.count(Target))
	Loops[Target] = Block;
	}
	return;
	}

	// Sub region: only its exit can be a back edge target.
	BasicBlock *RegionExit = N->getNodeAs<Region>()->getExit();
	if (Visited.count(RegionExit))
	Loops[RegionExit] = N->getEntry();
	}

	/// \brief Invert the given condition
	///
	/// Returns a value that is the logical negation of Condition, reusing an
	/// existing value where one is found and creating a new "not" instruction
	/// otherwise. Conditions that are neither a Constant, an Instruction nor an
	/// Argument are not supported.
	Value *StructurizeCFG::invert(Value *Condition) {
	// First: Check if it's a constant
	if (Constant *C = dyn_cast<Constant>(Condition))
	return ConstantExpr::getNot(C);

	// Second: If the condition is already inverted, return the original value
	// (on a match, m_Value rebinds Condition to the operand of the "not").
	if (match(Condition, m_Not(m_Value(Condition))))
	return Condition;

	if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
	// Third: Check all the users for an invert
	BasicBlock *Parent = Inst->getParent();
	for (User *U : Condition->users())
	if (Instruction *I = dyn_cast<Instruction>(U))
	if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
	return I;

	// Last option: Create a new instruction
	return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
	}

	if (Argument *Arg = dyn_cast<Argument>(Condition)) {
	// Arguments have no defining block, so the inversion is placed before the
	// terminator of the function's entry block.
	BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
	return BinaryOperator::CreateNot(Condition,
	Arg->getName() + ".inv",
	EntryBlock.getTerminator());
	}

	llvm_unreachable("Unhandled condition to invert");
	}

	/// \brief Build the condition for one edge
	///
	/// \param Term   branch whose successor edge is examined
	/// \param Idx    index of the successor on that branch
	/// \param Invert selects the constant used for unconditional branches
	///               (false when set, true otherwise) and which successor index
	///               gets the branch condition uninverted
	Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
	bool Invert) {
	Value *Cond = Invert ? BoolFalse : BoolTrue;
	if (Term->isConditional()) {
	Cond = Term->getCondition();

	// The condition is used as-is only when Idx equals Invert (0 or 1).
	if (Idx != (unsigned)Invert)
	Cond = invert(Cond);
	}
	return Cond;
	}

	/// \brief Analyze the predecessors of each block and build up predicates
	///
	/// For the entry block BB of N this fills Predicates[BB] with the conditions
	/// of forward edges and LoopPreds[BB] with the conditions of back edges,
	/// looking only at predecessors inside ParentRegion. An edge counts as a
	/// forward edge when its source is already in Visited.
	void StructurizeCFG::gatherPredicates(RegionNode *N) {
	RegionInfo *RI = ParentRegion->getRegionInfo();
	BasicBlock *BB = N->getEntry();
	BBPredicates &Pred = Predicates[BB];
	BBPredicates &LPred = LoopPreds[BB];

	for (BasicBlock *P : predecessors(BB)) {
	// Ignore it if it's a branch from outside into our region entry
	if (!ParentRegion->contains(P))
	continue;

	Region *R = RI->getRegionFor(P);
	if (R == ParentRegion) {
	// It's a top level block in our region
	BranchInst *Term = cast<BranchInst>(P->getTerminator());
	for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
	BasicBlock *Succ = Term->getSuccessor(i);
	if (Succ != BB)
	continue;

	if (Visited.count(P)) {
	// Normal forward edge
	if (Term->isConditional()) {
	// Try to treat it like an ELSE block
	// (!i is the index of the other successor of the two-way branch).
	BasicBlock *Other = Term->getSuccessor(!i);
	if (Visited.count(Other) && !Loops.count(Other) &&
	!Pred.count(Other) && !Pred.count(P)) {

	Pred[Other] = BoolFalse;
	Pred[P] = BoolTrue;
	continue;
	}
	}
	Pred[P] = buildCondition(Term, i, false);
	} else {
	// Back edge
	LPred[P] = buildCondition(Term, i, true);
	}
	}
	} else {
	// It's an exit from a sub region
	// Walk up to the sub region that is a direct child of ParentRegion.
	while (R->getParent() != ParentRegion)
	R = R->getParent();

	// Edge from inside a subregion to its entry, ignore it
	if (R == N)
	continue;

	// The whole sub region is represented by its entry block.
	BasicBlock *Entry = R->getEntry();
	if (Visited.count(Entry))
	Pred[Entry] = BoolTrue;
	else
	LPred[Entry] = BoolFalse;
	}
	}
	}

	/// \brief Collect various loop and predicate infos
	// In the new revision this runs per node: it gathers the node's predicates,
	// marks its entry block as visited, then looks for back edges leaving it.
	-void StructurizeCFG::collectInfos() {
	- // Reset predicate
	- Predicates.clear();
	+void StructurizeCFG::analyzeNode(RegionNode *RN) {
	+ DEBUG(dbgs() << "Visiting: "
	+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
	+ << RN->getEntry()->getName() << '\n');

	- // and loop infos
	- Loops.clear();
	- LoopPreds.clear();
	+ // Analyze all the conditions leading to a node
	+ gatherPredicates(RN);

	- // Reset the visited nodes
	- Visited.clear();
	+ // Remember that we've seen this node
	+ Visited.insert(RN->getEntry());

	- for (RegionNode *RN : reverse(Order)) {
	- DEBUG(dbgs() << "Visiting: "
	- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
	- << RN->getEntry()->getName() << " Loop Depth: "
	- << LI->getLoopDepth(RN->getEntry()) << "\n");
	-
	- // Analyze all the conditions leading to a node
	- gatherPredicates(RN);
	-
	- // Remember that we've seen this node
	- Visited.insert(RN->getEntry());
	-
	- // Find the last back edges
	- analyzeLoops(RN);
	- }
	+ // Find the last back edges
	+ analyzeLoops(RN);
	}

	/// \brief Insert the missing branch conditions
	///
	/// Replaces the placeholder condition of every branch in LoopConds (when
	/// Loops is true) or Conditions (when Loops is false) with a value built
	/// from the recorded predicates, using SSAUpdater to create PHI nodes where
	/// needed.
	void StructurizeCFG::insertConditions(bool Loops) {
	BranchVector &Conds = Loops ? LoopConds : Conditions;
	Value *Default = Loops ? BoolTrue : BoolFalse;
	SSAUpdater PhiInserter;

	for (BranchInst *Term : Conds) {
	assert(Term->isConditional());

	BasicBlock *Parent = Term->getParent();
	BasicBlock *SuccTrue = Term->getSuccessor(0);
	BasicBlock *SuccFalse = Term->getSuccessor(1);

	PhiInserter.Initialize(Boolean, "");
	PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
	PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);

	BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];

	NearestCommonDominator Dominator(DT);
	Dominator.addBlock(Parent);

	Value *ParentValue = nullptr;
	for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
	BasicBlock *BB = BBAndPred.first;
	Value *Pred = BBAndPred.second;

	// A predicate recorded for the branch's own block is used directly.
	if (BB == Parent) {
	ParentValue = Pred;
	break;
	}
	PhiInserter.AddAvailableValue(BB, Pred);
	Dominator.addAndRememberBlock(BB);
	}

	if (ParentValue) {
	Term->setCondition(ParentValue);
	} else {
	// Give the common dominator the default value unless it is one of the
	// predicate blocks, which already has a value.
	if (!Dominator.resultIsRememberedBlock())
	PhiInserter.AddAvailableValue(Dominator.result(), Default);

	Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
	}
	}
	}

	/// \brief Remove all PHI values coming from "From" into "To" and remember
	/// them in DeletedPhis
	void StructurizeCFG::delPhiValues(BasicBlock From, BasicBlock To) {
	PhiMap &Map = DeletedPhis[To];
	for (PHINode &Phi : To->phis()) {
	while (Phi.getBasicBlockIndex(From) != -1) {
	Value *Deleted = Phi.removeIncomingValue(From, false);
	Map[&Phi].push_back(std::make_pair(From, Deleted));
	}
	}
	}

	/// \brief Add a dummy PHI value as soon as we knew the new predecessor
	void StructurizeCFG::addPhiValues(BasicBlock From, BasicBlock To) {
	for (PHINode &Phi : To->phis()) {
	Value *Undef = UndefValue::get(Phi.getType());
	Phi.addIncoming(Undef, From);
	}
	AddedPhis[To].push_back(From);
	}

	/// \brief Add the real PHI value as soon as everything is set up
	///
	/// For each block recorded in AddedPhis that also has entries in DeletedPhis,
	/// the undef placeholders added by addPhiValues() are replaced by values
	/// computed with SSAUpdater from the incoming values removed earlier.
	void StructurizeCFG::setPhiValues() {
	SSAUpdater Updater;
	for (const auto &AddedPhi : AddedPhis) {
	BasicBlock *To = AddedPhi.first;
	const BBVector &From = AddedPhi.second;

	// Nothing was removed from this block's PHIs, so nothing to restore.
	if (!DeletedPhis.count(To))
	continue;

	PhiMap &Map = DeletedPhis[To];
	for (const auto &PI : Map) {
	PHINode *Phi = PI.first;
	Value *Undef = UndefValue::get(Phi->getType());
	Updater.Initialize(Phi->getType(), "");
	Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
	Updater.AddAvailableValue(To, Undef);

	NearestCommonDominator Dominator(DT);
	Dominator.addBlock(To);
	// Make every removed (block, value) pair available to the updater.
	for (const auto &VI : PI.second) {
	Updater.AddAvailableValue(VI.first, VI.second);
	Dominator.addAndRememberBlock(VI.first);
	}

	// Give the common dominator an undef value unless it is one of the blocks
	// that already supplies a real value.
	if (!Dominator.resultIsRememberedBlock())
	Updater.AddAvailableValue(Dominator.result(), Undef);

	// Replace the placeholder of each added predecessor.
	for (BasicBlock *FI : From) {
	int Idx = Phi->getBasicBlockIndex(FI);
	assert(Idx != -1);
	Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI));
	}
	}

	DeletedPhis.erase(To);
	}
	assert(DeletedPhis.empty());
	}

	/// \brief Drop BB's terminator after stripping, from every successor, the
	/// PHI values that came in from BB. Does nothing if BB has no terminator.
	void StructurizeCFG::killTerminator(BasicBlock *BB) {
	if (TerminatorInst *OldTerm = BB->getTerminator()) {
	for (BasicBlock *Succ : successors(BB))
	delPhiValues(BB, Succ);

	OldTerm->eraseFromParent();
	}
	}

	/// \brief Let node exit(s) point to NewExit
	void StructurizeCFG::changeExit(RegionNode Node, BasicBlock NewExit,
	bool IncludeDominator) {
	if (Node->isSubRegion()) {
	Region *SubRegion = Node->getNodeAs<Region>();
	BasicBlock *OldExit = SubRegion->getExit();
	BasicBlock *Dominator = nullptr;

	// Find all the edges from the sub region to the exit
	for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
	// Incrememt BBI before mucking with BB's terminator.
	BasicBlock BB = BBI++;

	if (!SubRegion->contains(BB))
	continue;

	// Modify the edges to point to the new exit
	delPhiValues(BB, OldExit);
	BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
	addPhiValues(BB, NewExit);

	// Find the new dominator (if requested)
	if (IncludeDominator) {
	if (!Dominator)
	Dominator = BB;
	else
	Dominator = DT->findNearestCommonDominator(Dominator, BB);
	}
	}

	// Change the dominator (if requested)
	if (Dominator)
	DT->changeImmediateDominator(NewExit, Dominator);

	// Update the region info
	SubRegion->replaceExit(NewExit);
	} else {
	BasicBlock *BB = Node->getNodeAs<BasicBlock>();
	killTerminator(BB);
	BranchInst::Create(NewExit, BB);
	addPhiValues(BB, NewExit);
	if (IncludeDominator)
	DT->changeImmediateDominator(NewExit, BB);
	}
	}

	/// \brief Create a new flow node and update dominator tree and region info
	///
	/// The new block is inserted before the entry of the next node still waiting
	/// in Order, or before the region exit when Order is empty, and Dominator
	/// becomes its immediate dominator.
	BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
	LLVMContext &Context = Func->getContext();
	BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
	- Order.back()->getEntry();
	+ Order.front()->getEntry();
	BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
	Func, Insert);
	DT->addNewBlock(Flow, Dominator);
	ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
	return Flow;
	}

	/// \brief Create a new or reuse the previous node as flow node
	///
	/// \param NeedEmpty when true, the previous block is only reused if it has
	///                  no insertion point left after its terminator is removed
	BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
	BasicBlock *Entry = PrevNode->getEntry();

	if (!PrevNode->isSubRegion()) {
	killTerminator(Entry);
	if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
	return Entry;
	}

	// create a new flow node
	BasicBlock *Flow = getNextFlow(Entry);

	// and wire it up
	changeExit(PrevNode, Flow, true);
	PrevNode = ParentRegion->getBBNode(Flow);
	return Flow;
	}

	/// \brief Returns the region exit if possible, otherwise just a new flow node
	///
	/// The region exit is only used when no node is left in Order and
	/// ExitUseAllowed is set.
	BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
	bool ExitUseAllowed) {
	if (!Order.empty() || !ExitUseAllowed)
	return getNextFlow(Flow);

	BasicBlock *Exit = ParentRegion->getExit();
	DT->changeImmediateDominator(Exit, Flow);
	addPhiValues(Flow, Exit);
	return Exit;
	}

	/// \brief Point PrevNode at the region node of BB, or clear it when BB lies
	/// outside ParentRegion.
	void StructurizeCFG::setPrevNode(BasicBlock *BB) {
	if (ParentRegion->contains(BB))
	PrevNode = ParentRegion->getBBNode(BB);
	else
	PrevNode = nullptr;
	}

	/// \brief Does BB dominate all the predicates of Node?
	bool StructurizeCFG::dominatesPredicates(BasicBlock BB, RegionNode Node) {
	BBPredicates &Preds = Predicates[Node->getEntry()];
	return llvm::all_of(Preds, [&](std::pair<BasicBlock , Value > Pred) {
	return DT->dominates(BB, Pred.first);
	});
	}

	/// \brief Can we predict that this node will always be called?
	///
	/// True when there is no previous node, or when every recorded predicate is
	/// the constant true and at least one predicate block dominates the entry of
	/// the previous node.
	bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
	BBPredicates &Preds = Predicates[Node->getEntry()];
	bool Dominated = false;

	// Regionentry is always true
	if (!PrevNode)
	return true;

	for (std::pair<BasicBlock*, Value*> Pred : Preds) {
	BasicBlock *BB = Pred.first;
	Value *V = Pred.second;

	if (V != BoolTrue)
	return false;

	if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
	Dominated = true;
	}

	// TODO: The dominator check is too strict
	return Dominated;
	}

	/// Take one node from the order vector and wire it up
	///
	/// The node is either chained directly after PrevNode (when it is predictably
	/// executed) or guarded by a flow block whose conditional branch is recorded
	/// in Conditions and filled in later.
	void StructurizeCFG::wireFlow(bool ExitUseAllowed,
	BasicBlock *LoopEnd) {
	- RegionNode *Node = Order.pop_back_val();
	+ RegionNode *Node = Order.front();
	+ Order.pop_front();
	Visited.insert(Node->getEntry());

	if (isPredictableTrue(Node)) {
	// Just a linear flow
	if (PrevNode) {
	changeExit(PrevNode, Node->getEntry(), true);
	}
	PrevNode = Node;
	} else {
	// Insert extra prefix node (or reuse last one)
	BasicBlock *Flow = needPrefix(false);

	// Insert extra postfix node (or use exit instead)
	BasicBlock *Entry = Node->getEntry();
	BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);

	// let it point to entry and next block
	// (the undef condition is a placeholder stored in Conditions)
	Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
	addPhiValues(Flow, Entry);
	DT->changeImmediateDominator(Entry, Flow);

	PrevNode = Node;
	// Keep wiring following nodes while Entry dominates all their predicates
	// and LoopEnd has not been visited yet.
	while (!Order.empty() && !Visited.count(LoopEnd) &&
	- dominatesPredicates(Entry, Order.back())) {
	+ dominatesPredicates(Entry, Order.front())) {
	handleLoops(false, LoopEnd);
	}

	changeExit(PrevNode, Next, false);
	setPrevNode(Next);
	}
	}

	/// Wire up the next node in Order. If its entry block was recorded in Loops
	/// as the target of a back edge, the whole loop up to its end block is wired
	/// and closed with a new conditional branch recorded in LoopConds.
	void StructurizeCFG::handleLoops(bool ExitUseAllowed,
	BasicBlock *LoopEnd) {
	- RegionNode *Node = Order.back();
	+ RegionNode *Node = Order.front();
	BasicBlock *LoopStart = Node->getEntry();

	// Not a loop header: wire it as an ordinary node.
	if (!Loops.count(LoopStart)) {
	wireFlow(ExitUseAllowed, LoopEnd);
	return;
	}

	if (!isPredictableTrue(Node))
	LoopStart = needPrefix(true);

	// Wire nodes until the block recorded as the end of this loop was visited.
	LoopEnd = Loops[Node->getEntry()];
	wireFlow(false, LoopEnd);
	while (!Visited.count(LoopEnd)) {
	handleLoops(false, LoopEnd);
	}

	// If the start of the loop is the entry block, we can't branch to it so
	// insert a new dummy entry block.
	Function *LoopFunc = LoopStart->getParent();
	if (LoopStart == &LoopFunc->getEntryBlock()) {
	LoopStart->setName("entry.orig");

	BasicBlock *NewEntry =
	BasicBlock::Create(LoopStart->getContext(),
	"entry",
	LoopFunc,
	LoopStart);
	BranchInst::Create(LoopStart, NewEntry);
	DT->setNewRoot(NewEntry);
	}

	// Create an extra loop end node
	LoopEnd = needPrefix(false);
	BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
	// True successor leaves the loop, false successor is the back edge; the
	// undef condition is a placeholder stored in LoopConds.
	LoopConds.push_back(BranchInst::Create(Next, LoopStart,
	BoolUndef, LoopEnd));
	addPhiValues(LoopEnd, LoopStart);
	setPrevNode(Next);
	}

	/// After this function control flow looks like it should be, but
	/// branches and PHI nodes only have undefined conditions.
	void StructurizeCFG::createFlow() {
	BasicBlock *Exit = ParentRegion->getExit();
	bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);

	DeletedPhis.clear();
	AddedPhis.clear();
	Conditions.clear();
	LoopConds.clear();

	PrevNode = nullptr;
	Visited.clear();

	while (!Order.empty()) {
	handleLoops(EntryDominatesExit, nullptr);
	}

	if (PrevNode)
	changeExit(PrevNode, Exit, EntryDominatesExit);
	else
	assert(EntryDominatesExit);
	}

	/// Handle a rare case where the disintegrated nodes instructions
	/// no longer dominate all their uses. Not sure if this is really necessary
	///
	/// Every use in another block that is not dominated by its definition is
	/// rewritten through SSAUpdater, with undef available from the function's
	/// entry block.
	void StructurizeCFG::rebuildSSA() {
	SSAUpdater Updater;
	for (BasicBlock *BB : ParentRegion->blocks())
	for (Instruction &I : *BB) {
	// The updater is set up lazily, once per instruction that needs it.
	bool Initialized = false;
	// We may modify the use list as we iterate over it, so be careful to
	// compute the next element in the use list at the top of the loop.
	for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
	Use &U = *UI++;
	Instruction *User = cast<Instruction>(U.getUser());
	// Uses inside the defining block, and PHI uses whose incoming edge comes
	// from the defining block, are left alone.
	if (User->getParent() == BB) {
	continue;
	} else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
	if (UserPN->getIncomingBlock(U) == BB)
	continue;
	}

	if (DT->dominates(&I, User))
	continue;

	if (!Initialized) {
	Value *Undef = UndefValue::get(I.getType());
	Updater.Initialize(I.getType(), "");
	Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
	Updater.AddAvailableValue(BB, &I);
	Initialized = true;
	}
	Updater.RewriteUseAfterInsertions(U);
	}
	}
	}

	/// Returns true if every conditional branch terminating a block of R has a
	/// condition that DA reports as uniform. Blocks whose terminator is not a
	/// conditional branch are ignored.
	static bool hasOnlyUniformBranches(const Region *R,
	const DivergenceAnalysis &DA) {
	for (const BasicBlock *BB : R->blocks()) {
	const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
	if (!Br || !Br->isConditional())
	continue;

	if (!DA.isUniform(Br->getCondition()))
	return false;
	DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
	}
	return true;
	}

	/// \brief Run the transformation for each region found
	///
	/// Returns false without touching the control flow for the top level region
	/// and, when SkipUniformRegions is set, for regions whose conditional
	/// branches are all uniform. Returns true after structurizing a region.
	bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
	if (R->isTopLevelRegion())
	return false;

	if (SkipUniformRegions) {
	// TODO: We could probably be smarter here with how we handle sub-regions.
	auto &DA = getAnalysis<DivergenceAnalysis>();
	if (hasOnlyUniformBranches(R, DA)) {
	DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');

	// Mark all direct child block terminators as having been treated as
	// uniform. To account for a possible future in which non-uniform
	// sub-regions are treated more cleverly, indirect children are not
	// marked as uniform.
	MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
	for (RegionNode *E : R->elements()) {
	if (E->isSubRegion())
	continue;

	if (Instruction *Term = E->getEntry()->getTerminator())
	Term->setMetadata("structurizecfg.uniform", MD);
	}

	return false;
	}
	}

	Func = R->getEntry()->getParent();
	ParentRegion = R;

	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

	// Order and analyze the nodes, rebuild the flow with placeholder
	// conditions, then fill in the real conditions and PHI values.
	orderNodes();
	- collectInfos();
	+
	createFlow();
	insertConditions(false);
	insertConditions(true);
	setPhiValues();
	rebuildSSA();

	// Cleanup
	Order.clear();
	Visited.clear();
	DeletedPhis.clear();
	AddedPhis.clear();
	Predicates.clear();
	Conditions.clear();
	Loops.clear();
	LoopPreds.clear();
	LoopConds.clear();

	return true;
	}

	// Factory for the pass; SkipUniformRegions is forwarded to the constructor.
	Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
	Pass *Structurizer = new StructurizeCFG(SkipUniformRegions);
	return Structurizer;
	}
	Index: vendor/llvm/dist-release_60/lib/Transforms/Vectorize/LoopVectorize.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 328362)
	@@ -1,8821 +1,8826 @@
	//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
	// and generates target-independent LLVM-IR.
	// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
	// of instructions in order to estimate the profitability of vectorization.
	//
	// The loop vectorizer combines consecutive loop iterations into a single
	// 'wide' iteration. After this transformation the index is incremented
	// by the SIMD vector width, and not by one.
	//
	// This pass has four parts:
	// 1. The main loop pass that drives the different parts.
	// 2. LoopVectorizationLegality - A unit that checks for the legality
	// of the vectorization.
	// 3. InnerLoopVectorizer - A unit that performs the actual
	// widening of instructions.
	// 4. LoopVectorizationCostModel - A unit that checks for the profitability
	// of vectorization. It decides on the optimal vector width, which
	// can be one, if vectorization is not profitable.
	//
	//===----------------------------------------------------------------------===//
	//
	// The reduction-variable vectorization is based on the paper:
	// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
	//
	// Variable uniformity checks are inspired by:
	// Karrenberg, R. and Hack, S. Whole Function Vectorization.
	//
	// The interleaved access vectorization is based on the paper:
	// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
	// Data for SIMD
	//
	// Other ideas/concepts are from:
	// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
	//
	// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
	// Vectorizing Compilers.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Vectorize/LoopVectorize.h"
	#include "VPlan.h"
	#include "VPlanBuilder.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseMapInfo.h"
	#include "llvm/ADT/Hashing.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SCCIterator.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AssumptionCache.h"
	#include "llvm/Analysis/BasicAliasAnalysis.h"
	#include "llvm/Analysis/BlockFrequencyInfo.h"
	#include "llvm/Analysis/CodeMetrics.h"
	#include "llvm/Analysis/DemandedBits.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/LoopAccessAnalysis.h"
	#include "llvm/Analysis/LoopAnalysisManager.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/LoopIterator.h"
	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
	#include "llvm/Analysis/ScalarEvolution.h"
	#include "llvm/Analysis/ScalarEvolutionExpander.h"
	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/IR/Verifier.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
	#include "llvm/Transforms/Utils/LoopSimplify.h"
	#include "llvm/Transforms/Utils/LoopUtils.h"
	#include "llvm/Transforms/Utils/LoopVersioning.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstdlib>
	#include <functional>
	#include <iterator>
	#include <limits>
	#include <memory>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define LV_NAME "loop-vectorize"
	#define DEBUG_TYPE LV_NAME

	STATISTIC(LoopsVectorized, "Number of loops vectorized");
	STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

	// Hidden command-line knobs for the loop vectorizer. Each option's name,
	// default value and description are given by its cl::opt initializer below.

	/// Toggles if-conversion during vectorization (on by default).
	static cl::opt<bool>
	EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
	cl::desc("Enable if-conversion during vectorization."));

	/// Loops with a known constant trip count below this number are vectorized only
	/// if no scalar iteration overheads are incurred.
	static cl::opt<unsigned> TinyTripCountVectorThreshold(
	"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
	cl::desc("Loops with a constant trip count that is smaller than this "
	"value are vectorized only if no scalar iteration overheads "
	"are incurred."));

	/// When set, the vectorization factor is chosen to maximize bandwidth, as
	/// determined by the smallest type in the loop (off by default).
	static cl::opt<bool> MaximizeBandwidth(
	"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
	cl::desc("Maximize bandwidth when selecting vectorization factor which "
	"will be determined by the smallest type in loop."));

	/// Toggles vectorization of interleaved memory accesses (off by default).
	static cl::opt<bool> EnableInterleavedMemAccesses(
	"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
	cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

	/// Maximum factor for an interleaved memory access.
	static cl::opt<unsigned> MaxInterleaveGroupFactor(
	"max-interleave-group-factor", cl::Hidden,
	cl::desc("Maximum factor for an interleaved access group (default = 8)"),
	cl::init(8));

	/// We don't interleave loops with a known constant trip count below this
	/// number.
	static const unsigned TinyTripCountInterleaveThreshold = 128;

	/// Overrides the target's number of scalar registers; defaults to 0.
	/// NOTE(review): 0 presumably means "no override" — confirm at the use sites.
	static cl::opt<unsigned> ForceTargetNumScalarRegs(
	"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
	cl::desc("A flag that overrides the target's number of scalar registers."));

	/// Overrides the target's number of vector registers; defaults to 0.
	static cl::opt<unsigned> ForceTargetNumVectorRegs(
	"force-target-num-vector-regs", cl::init(0), cl::Hidden,
	cl::desc("A flag that overrides the target's number of vector registers."));

	/// Maximum vectorization interleave count.
	static const unsigned MaxInterleaveFactor = 16;

	/// Overrides the target's max interleave factor for scalar loops; defaults
	/// to 0.
	static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
	"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
	cl::desc("A flag that overrides the target's max interleave factor for "
	"scalar loops."));

	/// Overrides the target's max interleave factor for vectorized loops;
	/// defaults to 0.
	static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
	"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
	cl::desc("A flag that overrides the target's max interleave factor for "
	"vectorized loops."));

	/// Overrides the target's expected per-instruction cost with one constant
	/// value; intended mainly for consistent testing. Defaults to 0.
	static cl::opt<unsigned> ForceTargetInstructionCost(
	"force-target-instruction-cost", cl::init(0), cl::Hidden,
	cl::desc("A flag that overrides the target's expected cost for "
	"an instruction to a single constant value. Mostly "
	"useful for getting consistent testing."));

	/// Cost of a loop that the interleaver considers 'small' (default 20).
	static cl::opt<unsigned> SmallLoopCost(
	"small-loop-cost", cl::init(20), cl::Hidden,
	cl::desc(
	"The cost of a loop that is considered 'small' by the interleaver."));

	/// Toggles use of block frequency analysis for PGO-driven heuristics (off by
	/// default).
	static cl::opt<bool> LoopVectorizeWithBlockFrequency(
	"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
	cl::desc("Enable the use of the block frequency analysis to access PGO "
	"heuristics minimizing code growth in cold regions and being more "
	"aggressive in hot regions."));

	// Runtime interleave loops for load/store throughput.
	static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
	"enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
	cl::desc(
	"Enable runtime interleaving until load/store ports are saturated"));

	/// The number of stores in a loop that are allowed to need predication.
	static cl::opt<unsigned> NumberOfStoresToPredicate(
	"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
	cl::desc("Max number of stores to be predicated behind an if."));

	/// When set, the induction variable is counted only once when interleaving
	/// (on by default).
	static cl::opt<bool> EnableIndVarRegisterHeur(
	"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
	cl::desc("Count the induction variable only once when interleaving"));

	/// Toggles predication of conditional stores during vectorization (on by
	/// default).
	static cl::opt<bool> EnableCondStoresVectorization(
	"enable-cond-stores-vec", cl::init(true), cl::Hidden,
	cl::desc("Enable if predication of stores during vectorization."));

	/// Maximum interleave count used when interleaving a scalar reduction in a
	/// nested loop (default 2).
	static cl::opt<unsigned> MaxNestedScalarReductionIC(
	"max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
	cl::desc("The maximum interleave count to use when interleaving a scalar "
	"reduction in a nested loop."));

	/// Maximum number of runtime memory checks allowed under a vectorize(enable)
	/// pragma (default 128).
	static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
	"pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
	cl::desc("The maximum allowed number of runtime memory checks with a "
	"vectorize(enable) pragma."));

	/// Maximum number of SCEV checks allowed (default 16).
	static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
	"vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
	cl::desc("The maximum number of SCEV checks allowed."));

	/// Maximum number of SCEV checks allowed under a vectorize(enable) pragma
	/// (default 128).
	static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
	"pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
	cl::desc("The maximum number of SCEV checks allowed with a "
	"vectorize(enable) pragma"));

	/// Create an analysis remark that explains why vectorization failed.
	///
	/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
	/// RemarkName is the identifier for the remark. If \p I is passed it is an
	/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
	/// the location of the remark. \return the remark object that can be
	/// streamed to; it is pre-seeded with the "loop not vectorized: " prefix.
	static OptimizationRemarkAnalysis
	createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
	                     Instruction *I = nullptr) {
	  // Default to the loop itself: its header block and its start location.
	  Value *CodeRegion = TheLoop->getHeader();
	  DebugLoc DL = TheLoop->getStartLoc();

	  if (I) {
	    CodeRegion = I->getParent();
	    // If there is no debug location attached to the instruction, revert back to
	    // using the loop's.
	    if (I->getDebugLoc())
	      DL = I->getDebugLoc();
	  }

	  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
	  R << "loop not vectorized: ";
	  return R;
	}

	namespace {

	class LoopVectorizationLegality;
	class LoopVectorizationCostModel;
	class LoopVectorizationRequirements;

	} // end anonymous namespace

	/// Returns true if the given loop body has a cycle, excluding the loop
	/// itself.
	static bool hasCyclesInLoopBody(const Loop &L) {
	if (!L.empty())
	return true;

	for (const auto &SCC :
	make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
	scc_iterator<Loop, LoopBodyTraits>::end(L))) {
	if (SCC.size() > 1) {
	DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
	DEBUG(L.dump());
	return true;
	}
	}
	return false;
	}

	/// A helper function for converting Scalar types to vector types.
	/// If the incoming type is void, we return void. If the VF is 1, we return
	/// the scalar type. Otherwise the result is VectorType::get(Scalar, VF).
	static Type *ToVectorTy(Type *Scalar, unsigned VF) {
	  if (Scalar->isVoidTy() || VF == 1)
	    return Scalar;
	  return VectorType::get(Scalar, VF);
	}

	// FIXME: The following helper functions have multiple implementations
	// in the project. They can be effectively organized in a common Load/Store
	// utilities unit.

	/// A helper function that returns the pointer operand of a load or store
	/// instruction. \return nullptr when \p I is neither a LoadInst nor a
	/// StoreInst.
	static Value *getPointerOperand(Value *I) {
	  if (auto *LI = dyn_cast<LoadInst>(I))
	    return LI->getPointerOperand();
	  if (auto *SI = dyn_cast<StoreInst>(I))
	    return SI->getPointerOperand();
	  return nullptr;
	}

	/// A helper function that returns the type of loaded or stored value.
	/// \p I must be a LoadInst or a StoreInst (asserted). For a load this is the
	/// load's own type; for a store it is the type of the stored value operand.
	static Type *getMemInstValueType(Value *I) {
	  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
	         "Expected Load or Store instruction");
	  if (auto *LI = dyn_cast<LoadInst>(I))
	    return LI->getType();
	  return cast<StoreInst>(I)->getValueOperand()->getType();
	}

	/// A helper function that returns the alignment of load or store instruction.
	/// \p I must be a LoadInst or a StoreInst (asserted); the value comes from
	/// the instruction's getAlignment().
	static unsigned getMemInstAlignment(Value *I) {
	  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
	         "Expected Load or Store instruction");
	  if (auto *LI = dyn_cast<LoadInst>(I))
	    return LI->getAlignment();
	  return cast<StoreInst>(I)->getAlignment();
	}

	/// A helper function that returns the address space of the pointer operand of
	/// load or store instruction. \p I must be a LoadInst or a StoreInst
	/// (asserted).
	static unsigned getMemInstAddressSpace(Value *I) {
	  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
	         "Expected Load or Store instruction");
	  if (auto *LI = dyn_cast<LoadInst>(I))
	    return LI->getPointerAddressSpace();
	  return cast<StoreInst>(I)->getPointerAddressSpace();
	}

	/// Decides whether \p Ty is "irregular" at vectorization factor \p VF.
	///
	/// For VF > 1 the type is irregular when VF times its alloc size differs
	/// from the store size of the <VF x Ty> vector, i.e. an array of VF elements
	/// is not "bitcast compatible" with the vector. For VF <= 1 it is irregular
	/// when its alloc size in bits differs from its size in bits, i.e. an array
	/// of Ty would need padding between elements.
	static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
	  // Scalar case: only padding between array elements matters.
	  if (VF <= 1)
	    return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);

	  // Vector case: compare the array footprint with the vector store size.
	  auto *WideTy = VectorType::get(Ty, VF);
	  const auto ArrayBytes = VF * DL.getTypeAllocSize(Ty);
	  const auto VectorBytes = DL.getTypeStoreSize(WideTy);
	  return ArrayBytes != VectorBytes;
	}

	/// Returns the reciprocal of the assumed execution probability of predicated
	/// blocks. A result of X means a predicated block is assumed to execute once
	/// for every X iterations of the loop header.
	///
	/// TODO: We should use actual block probability here, if available. Currently,
	/// we always assume predicated blocks have a 50% chance of executing.
	static unsigned getReciprocalPredBlockProb() {
	  const unsigned ReciprocalProb = 2; // 1 / 50%
	  return ReciprocalProb;
	}

	/// A helper function that adds a 'fast' flag to floating-point operations.
	/// If \p V is an FPMathOperator, its fast-math flags are replaced by a flag
	/// set with "fast" enabled; any other value is left untouched.
	/// \return \p V itself, so the call can be chained.
	static Value *addFastMathFlag(Value *V) {
	  if (isa<FPMathOperator>(V)) {
	    FastMathFlags Flags;
	    Flags.setFast();
	    cast<Instruction>(V)->setFastMathFlags(Flags);
	  }
	  return V;
	}

	/// A helper function that returns an integer or floating-point constant with
	/// value C. Integer types go through ConstantInt::getSigned; every other type
	/// is handed to ConstantFP::get.
	static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
	  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
	                           : ConstantFP::get(Ty, C);
	}

	namespace llvm {

	/// InnerLoopVectorizer vectorizes loops which contain only one basic
	/// block to a specified vectorization factor (VF).
	/// This class performs the widening of scalars into vectors, or multiple
	/// scalars. This class also implements the following features:
	/// * It inserts an epilogue loop for handling loops that don't have iteration
	///   counts that are known to be a multiple of the vectorization factor.
	/// * It handles the code generation for reduction variables.
	/// * Scalarization (implementation using scalars) of un-vectorizable
	///   instructions.
	/// InnerLoopVectorizer does not perform any vectorization-legality
	/// checks, and relies on the caller to check for the different legality
	/// aspects. The InnerLoopVectorizer relies on the
	/// LoopVectorizationLegality class to provide information about the induction
	/// and reduction variables that were found for a given vectorization factor.
	class InnerLoopVectorizer {
	public:
	  /// Stores the analyses and factors it is given; \p VecWidth becomes VF and
	  /// \p UnrollFactor becomes UF. The IR builder is created on the context of
	  /// the ScalarEvolution wrapped by \p PSE.
	  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
	                      LoopInfo *LI, DominatorTree *DT,
	                      const TargetLibraryInfo *TLI,
	                      const TargetTransformInfo *TTI, AssumptionCache *AC,
	                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
	                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
	                      LoopVectorizationCostModel *CM)
	      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
	        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
	        Builder(PSE.getSE()->getContext()),
	        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
	  virtual ~InnerLoopVectorizer() = default;

	  /// Create a new empty loop. Unlink the old loop and connect the new one.
	  /// Return the pre-header block of the new loop.
	  BasicBlock *createVectorizedLoopSkeleton();

	  /// Widen a single instruction within the innermost loop.
	  void widenInstruction(Instruction &I);

	  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
	  void fixVectorizedLoop();

	  // Return true if any runtime check is added.
	  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

	  /// A type for vectorized values in the new loop. Each value from the
	  /// original loop, when vectorized, is represented by UF vector values in the
	  /// new unrolled loop, where UF is the unroll factor.
	  using VectorParts = SmallVector<Value *, 2>;

	  /// Vectorize a single PHINode in a block. This method handles the induction
	  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
	  /// arbitrary length vectors.
	  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

	  /// A helper function to scalarize a single Instruction in the innermost loop.
	  /// NOTE(review): the previous wording described lane ranges (MinLane..MaxLane)
	  /// and part ranges (MinPart..MaxPart), none of which are parameters of this
	  /// function. Presumably a scalar instance is generated for the part/lane
	  /// identified by \p Instance — confirm against the definition.
	  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
	                            bool IfPredicateInstr);

	  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
	  /// is provided, the integer induction variable will first be truncated to
	  /// the corresponding type.
	  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

	  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
	  /// vector or scalar value on-demand if one is not yet available. When
	  /// vectorizing a loop, we visit the definition of an instruction before its
	  /// uses. When visiting the definition, we either vectorize or scalarize the
	  /// instruction, creating an entry for it in the corresponding map. (In some
	  /// cases, such as induction variables, we will create both vector and scalar
	  /// entries.) Then, as we encounter uses of the definition, we derive values
	  /// for each scalar or vector use unless such a value is already available.
	  /// For example, if we scalarize a definition and one of its uses is vector,
	  /// we build the required vector on-demand with an insertelement sequence
	  /// when visiting the use. Otherwise, if the use is scalar, we can use the
	  /// existing scalar definition.
	  ///
	  /// Return a value in the new loop corresponding to \p V from the original
	  /// loop at unroll index \p Part. If the value has already been vectorized,
	  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
	  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
	  /// a new vector value on-demand by inserting the scalar values into a vector
	  /// with an insertelement sequence. If the value has been neither vectorized
	  /// nor scalarized, it must be loop invariant, so we simply broadcast the
	  /// value into a vector.
	  Value *getOrCreateVectorValue(Value *V, unsigned Part);

	  /// Return a value in the new loop corresponding to \p V from the original
	  /// loop at unroll and vector indices \p Instance. If the value has been
	  /// vectorized but not scalarized, the necessary extractelement instruction
	  /// will be generated.
	  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

	  /// Construct the vector value of a scalarized value \p V one lane at a time.
	  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

	  /// Try to vectorize the interleaved access group that \p Instr belongs to.
	  void vectorizeInterleaveGroup(Instruction *Instr);

	  /// Vectorize Load and Store instructions, optionally masking the vector
	  /// operations if \p BlockInMask is non-null.
	  void vectorizeMemoryInstruction(Instruction *Instr,
	                                  VectorParts *BlockInMask = nullptr);

	  /// \brief Set the debug location in the builder using the debug location in
	  /// the instruction.
	  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

	protected:
	  friend class LoopVectorizationPlanner;

	  /// A small list of PHINodes.
	  using PhiVector = SmallVector<PHINode *, 4>;

	  /// A type for scalarized values in the new loop. Each value from the
	  /// original loop, when scalarized, is represented by UF x VF scalar values
	  /// in the new unrolled loop, where UF is the unroll factor and VF is the
	  /// vectorization factor.
	  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

	  /// Set up the values of the IVs correctly when exiting the vector loop.
	  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
	                    Value *CountRoundDown, Value *EndValue,
	                    BasicBlock *MiddleBlock);

	  /// Create a new induction variable inside L.
	  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
	                                   Value *Step, Instruction *DL);

	  /// Handle all cross-iteration phis in the header.
	  void fixCrossIterationPHIs();

	  /// Fix a first-order recurrence. This is the second phase of vectorizing
	  /// this phi node.
	  void fixFirstOrderRecurrence(PHINode *Phi);

	  /// Fix a reduction cross-iteration phi. This is the second phase of
	  /// vectorizing this phi node.
	  void fixReduction(PHINode *Phi);

	  /// \brief The Loop exit block may have single value PHI nodes with some
	  /// incoming value. While vectorizing we only handled real values
	  /// that were defined inside the loop and we should have one value for
	  /// each predecessor of its parent basic block. See PR14725.
	  void fixLCSSAPHIs();

	  /// Iteratively sink the scalarized operands of a predicated instruction into
	  /// the block that was created for it.
	  void sinkScalarOperands(Instruction *PredInst);

	  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
	  /// represented as.
	  void truncateToMinimalBitwidths();

	  /// Insert the new loop to the loop hierarchy and pass manager
	  /// and update the analysis passes.
	  void updateAnalysis();

	  /// Create a broadcast instruction. This method generates a broadcast
	  /// instruction (shuffle) for loop invariant values and for the induction
	  /// value. If this is the induction variable then we extend it to N, N+1, ...
	  /// this is needed because each iteration in the loop corresponds to a SIMD
	  /// element.
	  virtual Value *getBroadcastInstrs(Value *V);

	  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
	  /// to each vector element of Val. The sequence starts at StartIndex.
	  /// \p Opcode is relevant for FP induction variable.
	  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
	                               Instruction::BinaryOps Opcode =
	                                   Instruction::BinaryOpsEnd);

	  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
	  /// variable on which to base the steps, \p Step is the size of the step, and
	  /// \p EntryVal is the value from the original loop that maps to the steps.
	  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
	  /// can be a truncate instruction).
	  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
	                        const InductionDescriptor &ID);

	  /// Create a vector induction phi node based on an existing scalar one. \p
	  /// EntryVal is the value from the original loop that maps to the vector phi
	  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
	  /// truncate instruction, instead of widening the original IV, we widen a
	  /// version of the IV truncated to \p EntryVal's type.
	  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
	                                       Value *Step, Instruction *EntryVal);

	  /// Returns true if an instruction \p I should be scalarized instead of
	  /// vectorized for the chosen vectorization factor.
	  bool shouldScalarizeInstruction(Instruction *I) const;

	  /// Returns true if we should generate a scalar version of \p IV.
	  bool needsScalarInduction(Instruction *IV) const;

	  /// If there is a cast involved in the induction variable \p ID, which should
	  /// be ignored in the vectorized loop body, this function records the
	  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
	  /// cast. We had already proved that the casted Phi is equal to the uncasted
	  /// Phi in the vectorized loop (under a runtime guard), and therefore
	  /// there is no need to vectorize the cast - the same value can be used in the
	  /// vector loop for both the Phi and the cast.
	  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
	  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
	  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
	                                             Value *VectorLoopValue,
	                                             unsigned Part,
	                                             unsigned Lane = UINT_MAX);

	  /// Generate a shuffle sequence that will reverse the vector Vec.
	  virtual Value *reverseVector(Value *Vec);

	  /// Returns (and creates if needed) the original loop trip count.
	  Value *getOrCreateTripCount(Loop *NewLoop);

	  /// Returns (and creates if needed) the trip count of the widened loop.
	  Value *getOrCreateVectorTripCount(Loop *NewLoop);

	  /// Returns a bitcasted value to the requested vector type.
	  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
	  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
	                                const DataLayout &DL);

	  /// Emit a bypass check to see if the vector trip count is zero, including if
	  /// it overflows.
	  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

	  /// Emit a bypass check to see if all of the SCEV assumptions we've
	  /// had to make are correct.
	  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

	  /// Emit bypass checks to check any memory assumptions we may have made.
	  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

	  /// Add additional metadata to \p To that was not present on \p Orig.
	  ///
	  /// Currently this is used to add the noalias annotations based on the
	  /// inserted memchecks. Use this for instructions that are cloned into the
	  /// vector loop.
	  void addNewMetadata(Instruction *To, const Instruction *Orig);

	  /// Add metadata from one instruction to another.
	  ///
	  /// This includes both the original MDs from \p From and additional ones (\see
	  /// addNewMetadata). Use this for newly created instructions in the vector
	  /// loop.
	  void addMetadata(Instruction *To, Instruction *From);

	  /// \brief Similar to the previous function but it adds the metadata to a
	  /// vector of instructions.
	  void addMetadata(ArrayRef<Value *> To, Instruction *From);

	  /// The original loop.
	  Loop *OrigLoop;

	  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
	  /// dynamic knowledge to simplify SCEV expressions and converts them to a
	  /// more usable form.
	  PredicatedScalarEvolution &PSE;

	  /// Loop Info.
	  LoopInfo *LI;

	  /// Dominator Tree.
	  DominatorTree *DT;

	  /// Alias Analysis.
	  /// NOTE(review): not set by the constructor's initializer list and has no
	  /// default initializer, so it is indeterminate unless assigned elsewhere.
	  AliasAnalysis *AA;

	  /// Target Library Info.
	  const TargetLibraryInfo *TLI;

	  /// Target Transform Info.
	  const TargetTransformInfo *TTI;

	  /// Assumption Cache.
	  AssumptionCache *AC;

	  /// Interface to emit optimization remarks.
	  OptimizationRemarkEmitter *ORE;

	  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
	  /// used.
	  ///
	  /// This is currently only used to add no-alias metadata based on the
	  /// memchecks. The actual versioning is performed manually.
	  std::unique_ptr<LoopVersioning> LVer;

	  /// The vectorization SIMD factor to use. Each vector will have this many
	  /// vector elements.
	  unsigned VF;

	  /// The vectorization unroll factor to use. Each scalar is vectorized to this
	  /// many different vector instructions.
	  unsigned UF;

	  /// The builder that we use
	  IRBuilder<> Builder;

	  // --- Vectorization state ---

	  /// The vector-loop preheader.
	  BasicBlock *LoopVectorPreHeader;

	  /// The scalar-loop preheader.
	  BasicBlock *LoopScalarPreHeader;

	  /// Middle Block between the vector and the scalar.
	  BasicBlock *LoopMiddleBlock;

	  /// The ExitBlock of the scalar loop.
	  BasicBlock *LoopExitBlock;

	  /// The vector loop body.
	  BasicBlock *LoopVectorBody;

	  /// The scalar loop body.
	  BasicBlock *LoopScalarBody;

	  /// A list of all bypass blocks. The first block is the entry of the loop.
	  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

	  /// The new Induction variable which was added to the new block.
	  PHINode *Induction = nullptr;

	  /// The induction variable of the old basic block.
	  PHINode *OldInduction = nullptr;

	  /// Maps values from the original loop to their corresponding values in the
	  /// vectorized loop. A key value can map to either vector values, scalar
	  /// values or both kinds of values, depending on whether the key was
	  /// vectorized and scalarized.
	  VectorizerValueMap VectorLoopValueMap;

	  /// Store instructions that were predicated.
	  SmallVector<Instruction *, 4> PredicatedInstructions;

	  /// Trip count of the original loop.
	  Value *TripCount = nullptr;

	  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
	  Value *VectorTripCount = nullptr;

	  /// The legality analysis.
	  LoopVectorizationLegality *Legal;

	  /// The profitability analysis.
	  LoopVectorizationCostModel *Cost;

	  // Record whether runtime checks are added.
	  bool AddedSafetyChecks = false;

	  // Holds the end values for each induction variable. We save the end values
	  // so we can later fix-up the external users of the induction variables.
	  DenseMap<PHINode *, Value *> IVEndValues;
	};

	/// An InnerLoopVectorizer whose vector width is fixed at 1: the constructor
	/// forwards every argument to the base class and passes 1 as VecWidth, so
	/// only \p UnrollFactor varies. It overrides the three virtual hooks
	/// getBroadcastInstrs, getStepVector and reverseVector.
	class InnerLoopUnroller : public InnerLoopVectorizer {
	public:
	  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
	                    LoopInfo *LI, DominatorTree *DT,
	                    const TargetLibraryInfo *TLI,
	                    const TargetTransformInfo *TTI, AssumptionCache *AC,
	                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
	                    LoopVectorizationLegality *LVL,
	                    LoopVectorizationCostModel *CM)
	      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
	                            UnrollFactor, LVL, CM) {}

	private:
	  Value *getBroadcastInstrs(Value *V) override;
	  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
	                       Instruction::BinaryOps Opcode =
	                           Instruction::BinaryOpsEnd) override;
	  Value *reverseVector(Value *Vec) override;
	};

	} // end namespace llvm

	/// \brief Look for a meaningful debug location on the instruction or its
	/// operands.
	///
	/// \return \p I itself when it is null or already carries a non-empty debug
	/// location; otherwise the first operand that is an Instruction with a
	/// non-empty debug location; otherwise \p I.
	static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
	  if (!I)
	    return I;

	  DebugLoc Empty;
	  if (I->getDebugLoc() != Empty)
	    return I;

	  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
	    // Only operands that are themselves instructions can carry a location.
	    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
	      if (OpInst->getDebugLoc() != Empty)
	        return OpInst;
	  }

	  return I;
	}

	// Point the builder \p B at the debug location of \p Ptr. When \p Ptr is null
	// or not an Instruction, the builder gets an empty DebugLoc. When the
	// instruction has a location, its function has debug info for profiling, and
	// it is not a debug-info intrinsic, the location is cloned with a duplication
	// factor of UF * VF; in every other case the location is used as-is.
	void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
	  const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr);
	  if (!Inst) {
	    B.SetCurrentDebugLocation(DebugLoc());
	    return;
	  }

	  const DILocation *DIL = Inst->getDebugLoc();
	  const bool ScaleForProfiling =
	      DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
	      !isa<DbgInfoIntrinsic>(Inst);
	  if (ScaleForProfiling)
	    B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
	  else
	    B.SetCurrentDebugLocation(DIL);
	}

	#ifndef NDEBUG
	/// \return string containing a file name and a line # for the given loop.
	static std::string getDebugLocString(const Loop *L) {
	std::string Result;
	if (L) {
	raw_string_ostream OS(Result);
	if (const DebugLoc LoopDbgLoc = L->getStartLoc())
	LoopDbgLoc.print(OS);
	else
	// Just print the module name.
	OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
	OS.flush();
	}
	return Result;
	}
	#endif

	// Annotate \p To with no-alias metadata derived from \p Orig. This happens
	// only when LVer is set and \p Orig is a load or a store; otherwise the call
	// does nothing.
	void InnerLoopVectorizer::addNewMetadata(Instruction *To,
	                                         const Instruction *Orig) {
	  // If the loop was versioned with memchecks, add the corresponding no-alias
	  // metadata.
	  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
	    LVer->annotateInstWithNoAlias(To, Orig);
	}

	/// Give \p To the metadata of \p From: first the metadata propagated by
	/// propagateMetadata(), then whatever addNewMetadata() adds on top.
	void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
	  // Order is kept as in the original: propagate first, then add new data.
	  propagateMetadata(To, From);
	  addNewMetadata(To, From);
	}

	/// Apply addMetadata(Instruction *, Instruction *) to every element of \p To
	/// that is an Instruction; other values are left untouched.
	void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, Instruction *From) {
	  for (Value *Candidate : To) {
	    auto *AsInst = dyn_cast<Instruction>(Candidate);
	    if (!AsInst)
	      continue;
	    addMetadata(AsInst, From);
	  }
	}

	namespace llvm {

	/// \brief The group of interleaved loads/stores sharing the same stride and
	/// close to each other.
	///
	/// Each member in this group has an index starting from 0, and the largest
	/// index should be less than interleaved factor, which is equal to the absolute
	/// value of the access's stride.
	///
	/// E.g. An interleaved load group of factor 4:
	/// for (unsigned i = 0; i < 1024; i+=4) {
	/// a = A[i]; // Member of index 0
	/// b = A[i+1]; // Member of index 1
	/// d = A[i+3]; // Member of index 3
	/// ...
	/// }
	///
	/// An interleaved store group of factor 4:
	/// for (unsigned i = 0; i < 1024; i+=4) {
	/// ...
	/// A[i] = a; // Member of index 0
	/// A[i+1] = b; // Member of index 1
	/// A[i+2] = c; // Member of index 2
	/// A[i+3] = d; // Member of index 3
	/// }
	///
	/// Note: the interleaved load group could have gaps (missing members), but
	/// the interleaved store group doesn't allow gaps.
	class InterleaveGroup {
	public:
	InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
	: Align(Align), InsertPos(Instr) {
	assert(Align && "The alignment should be non-zero");

	Factor = std::abs(Stride);
	assert(Factor > 1 && "Invalid interleave factor");

	Reverse = Stride < 0;
	Members[0] = Instr;
	}

	bool isReverse() const { return Reverse; }
	unsigned getFactor() const { return Factor; }
	unsigned getAlignment() const { return Align; }
	unsigned getNumMembers() const { return Members.size(); }

	/// \brief Try to insert a new member \p Instr with index \p Index and
	/// alignment \p NewAlign. The index is related to the leader and it could be
	/// negative if it is the new leader.
	///
	/// \returns false if the instruction doesn't belong to the group.
	bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
	assert(NewAlign && "The new member's alignment should be non-zero");

	int Key = Index + SmallestKey;

	// Skip if there is already a member with the same index.
	if (Members.count(Key))
	return false;

	if (Key > LargestKey) {
	// The largest index is always less than the interleave factor.
	if (Index >= static_cast<int>(Factor))
	return false;

	LargestKey = Key;
	} else if (Key < SmallestKey) {
	// The largest index is always less than the interleave factor.
	if (LargestKey - Key >= static_cast<int>(Factor))
	return false;

	SmallestKey = Key;
	}

	// It's always safe to select the minimum alignment.
	Align = std::min(Align, NewAlign);
	Members[Key] = Instr;
	return true;
	}

	/// \brief Get the member with the given index \p Index
	///
	/// \returns nullptr if contains no such member.
	Instruction *getMember(unsigned Index) const {
	int Key = SmallestKey + Index;
	if (!Members.count(Key))
	return nullptr;

	return Members.find(Key)->second;
	}

	/// \brief Get the index for the given member. Unlike the key in the member
	/// map, the index starts from 0.
	unsigned getIndex(Instruction *Instr) const {
	for (auto I : Members)
	if (I.second == Instr)
	return I.first - SmallestKey;

	llvm_unreachable("InterleaveGroup contains no such member");
	}

	Instruction *getInsertPos() const { return InsertPos; }
	void setInsertPos(Instruction *Inst) { InsertPos = Inst; }

	/// Add metadata (e.g. alias info) from the instructions in this group to \p
	/// NewInst.
	///
	/// FIXME: this function currently does not add noalias metadata a'la
	/// addNewMedata. To do that we need to compute the intersection of the
	/// noalias info from all members.
	void addMetadata(Instruction *NewInst) const {
	SmallVector<Value *, 4> VL;
	std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
	[](std::pair<int, Instruction *> p) { return p.second; });
	propagateMetadata(NewInst, VL);
	}

	private:
	unsigned Factor; // Interleave Factor.
	bool Reverse;
	unsigned Align;
	DenseMap<int, Instruction *> Members;
	int SmallestKey = 0;
	int LargestKey = 0;

	// To avoid breaking dependences, vectorized instructions of an interleave
	// group should be inserted at either the first load or the last store in
	// program order.
	//
	// E.g. %even = load i32 // Insert Position
	// %add = add i32 %even // Use of %even
	// %odd = load i32
	//
	// store i32 %even
	// %odd = add i32 // Def of %odd
	// store i32 %odd // Insert Position
	Instruction *InsertPos;
	};
	} // end namespace llvm

	namespace {

	/// \brief Drive the analysis of interleaved memory accesses in the loop.
	///
	/// Use this class to analyze interleaved accesses only when we can vectorize
	/// a loop. Otherwise it's meaningless to do analysis as the vectorization
	/// on interleaved accesses is unsafe.
	///
	/// The analysis collects interleave groups and records the relationships
	/// between the member and the group in a map. The groups are heap-allocated
	/// and owned by this object; they are freed by releaseGroup() or by the
	/// destructor.
	class InterleavedAccessInfo {
	public:
	  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
	                        DominatorTree *DT, LoopInfo *LI)
	      : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}

	  ~InterleavedAccessInfo() {
	    SmallSet<InterleaveGroup *, 4> DelSet;
	    // Avoid releasing a pointer twice: several members map to one group.
	    for (auto &I : InterleaveGroupMap)
	      DelSet.insert(I.second);
	    for (auto *Ptr : DelSet)
	      delete Ptr;
	  }

	  /// \brief Analyze the interleaved accesses and collect them in interleave
	  /// groups. Substitute symbolic strides using \p Strides.
	  void analyzeInterleaving(const ValueToValueMap &Strides);

	  /// \brief Check if \p Instr belongs to any interleave group.
	  bool isInterleaved(Instruction *Instr) const {
	    return InterleaveGroupMap.count(Instr);
	  }

	  /// \brief Get the interleave group that \p Instr belongs to.
	  ///
	  /// \returns nullptr if doesn't have such group.
	  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
	    // lookup() returns a null pointer when the key is absent.
	    return InterleaveGroupMap.lookup(Instr);
	  }

	  /// \brief Returns true if an interleaved group that may access memory
	  /// out-of-bounds requires a scalar epilogue iteration for correctness.
	  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

	  /// \brief Initialize the LoopAccessInfo used for dependence checking.
	  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }

	private:
	  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
	  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
	  /// The interleaved access analysis can also add new predicates (for example
	  /// by versioning strides of pointers).
	  PredicatedScalarEvolution &PSE;

	  Loop *TheLoop;
	  DominatorTree *DT;
	  LoopInfo *LI;
	  const LoopAccessInfo *LAI = nullptr;

	  /// True if the loop may contain non-reversed interleaved groups with
	  /// out-of-bounds accesses. We ensure we don't speculatively access memory
	  /// out-of-bounds by executing at least one scalar epilogue iteration.
	  bool RequiresScalarEpilogue = false;

	  /// Holds the relationships between the members and the interleave group.
	  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;

	  /// Holds dependences among the memory accesses in the loop. It maps a source
	  /// access to a set of dependent sink accesses.
	  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

	  /// \brief The descriptor for a strided memory access.
	  struct StrideDescriptor {
	    StrideDescriptor() = default;
	    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
	                     unsigned Align)
	        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}

	    // The access's stride. It is negative for a reverse access.
	    int64_t Stride = 0;

	    // The scalar expression of this access.
	    const SCEV *Scev = nullptr;

	    // The size of the memory object.
	    uint64_t Size = 0;

	    // The alignment of this access.
	    unsigned Align = 0;
	  };

	  /// \brief A type for holding instructions and their stride descriptors.
	  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;

	  /// \brief Create a new interleave group with the given instruction \p Instr,
	  /// stride \p Stride and alignment \p Align.
	  ///
	  /// \returns the newly created interleave group.
	  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
	                                         unsigned Align) {
	    assert(!InterleaveGroupMap.count(Instr) &&
	           "Already in an interleaved access group");
	    InterleaveGroup *Group = new InterleaveGroup(Instr, Stride, Align);
	    InterleaveGroupMap[Instr] = Group;
	    return Group;
	  }

	  /// \brief Release the group and remove all the relationships.
	  void releaseGroup(InterleaveGroup *Group) {
	    for (unsigned i = 0; i < Group->getFactor(); i++)
	      if (Instruction *Member = Group->getMember(i))
	        InterleaveGroupMap.erase(Member);

	    delete Group;
	  }

	  /// \brief Collect all the accesses with a constant stride in program order.
	  void collectConstStrideAccesses(
	      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
	      const ValueToValueMap &Strides);

	  /// \brief Returns true if \p Stride is allowed in an interleaved group.
	  static bool isStrided(int Stride) {
	    unsigned Factor = std::abs(Stride);
	    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
	  }

	  /// \brief Returns true if \p BB is a predicated block.
	  bool isPredicated(BasicBlock *BB) const {
	    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
	  }

	  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
	  bool areDependencesValid() const {
	    return LAI && LAI->getDepChecker().getDependences();
	  }

	  /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
	  /// necessary, when constructing interleaved groups.
	  ///
	  /// \p A must precede \p B in program order. We return false if reordering is
	  /// not necessary or is prevented because \p A and \p B may be dependent.
	  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
	                                                 StrideEntry *B) const {
	    // Code motion for interleaved accesses can potentially hoist strided loads
	    // and sink strided stores. The code below checks the legality of the
	    // following two conditions:
	    //
	    // 1. Potentially moving a strided load (B) before any store (A) that
	    // precedes B, or
	    //
	    // 2. Potentially moving a strided store (A) after any load or store (B)
	    // that A precedes.
	    //
	    // It's legal to reorder A and B if we know there isn't a dependence from A
	    // to B. Note that this determination is conservative since some
	    // dependences could potentially be reordered safely.

	    // A is potentially the source of a dependence.
	    auto *Src = A->first;
	    auto SrcDes = A->second;

	    // B is potentially the sink of a dependence.
	    auto *Sink = B->first;
	    auto SinkDes = B->second;

	    // Code motion for interleaved accesses can't violate WAR dependences.
	    // Thus, reordering is legal if the source isn't a write.
	    if (!Src->mayWriteToMemory())
	      return true;

	    // At least one of the accesses must be strided.
	    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
	      return true;

	    // If dependence information is not available from LoopAccessInfo,
	    // conservatively assume the instructions can't be reordered.
	    if (!areDependencesValid())
	      return false;

	    // If we know there is a dependence from source to sink, assume the
	    // instructions can't be reordered. Otherwise, reordering is legal.
	    return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
	  }

	  /// \brief Collect the dependences from LoopAccessInfo.
	  ///
	  /// We process the dependences once during the interleaved access analysis to
	  /// enable constant-time dependence queries.
	  void collectDependences() {
	    if (!areDependencesValid())
	      return;
	    auto *Deps = LAI->getDepChecker().getDependences();
	    for (auto Dep : *Deps)
	      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
	  }
	};

	/// Utility class for getting and setting loop vectorizer hints in the form
	/// of loop metadata.
	/// This class keeps a number of loop annotations locally (as member variables)
	/// and can, upon request, write them back as metadata on the loop. It will
	/// initially scan the loop for existing metadata, and will update the local
	/// values based on information in the loop.
	/// We cannot write all values to metadata, as the mere presence of some info,
	/// for example 'force', means a decision has been made. So, we need to be
	/// careful NOT to add them if the user hasn't specifically asked so.
	class LoopVectorizeHints {
	enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };

	/// Hint - associates name and validation with the hint value.
	struct Hint {
	const char *Name;
	unsigned Value; // This may have to change for non-numeric values.
	HintKind Kind;

	Hint(const char *Name, unsigned Value, HintKind Kind)
	: Name(Name), Value(Value), Kind(Kind) {}

	bool validate(unsigned Val) {
	switch (Kind) {
	case HK_WIDTH:
	return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
	case HK_UNROLL:
	return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
	case HK_FORCE:
	return (Val <= 1);
	case HK_ISVECTORIZED:
	return (Val==0 \|\| Val==1);
	}
	return false;
	}
	};

	/// Vectorization width.
	Hint Width;

	/// Vectorization interleave factor.
	Hint Interleave;

	/// Vectorization forced
	Hint Force;

	/// Already Vectorized
	Hint IsVectorized;

	/// Return the loop metadata prefix.
	static StringRef Prefix() { return "llvm.loop."; }

	/// True if there is any unsafe math in the loop.
	bool PotentiallyUnsafe = false;

	public:
	enum ForceKind {
	FK_Undefined = -1, ///< Not selected.
	FK_Disabled = 0, ///< Forcing disabled.
	FK_Enabled = 1, ///< Forcing enabled.
	};

	LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
	OptimizationRemarkEmitter &ORE)
	: Width("vectorize.width", VectorizerParams::VectorizationFactor,
	HK_WIDTH),
	Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
	Force("vectorize.enable", FK_Undefined, HK_FORCE),
	IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
	// Populate values with existing loop metadata.
	getHintsFromMetadata();

	// force-vector-interleave overrides DisableInterleaving.
	if (VectorizerParams::isInterleaveForced())
	Interleave.Value = VectorizerParams::VectorizationInterleave;

	if (IsVectorized.Value != 1)
	// If the vectorization width and interleaving count are both 1 then
	// consider the loop to have been already vectorized because there's
	// nothing more that we can do.
	IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
	DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
	<< "LV: Interleaving disabled by the pass manager\n");
	}

	/// Mark the loop L as already vectorized by setting the width to 1.
	void setAlreadyVectorized() {
	IsVectorized.Value = 1;
	Hint Hints[] = {IsVectorized};
	writeHintsToMetadata(Hints);
	}

	bool allowVectorization(Function F, Loop L, bool AlwaysVectorize) const {
	if (getForce() == LoopVectorizeHints::FK_Disabled) {
	DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
	emitRemarkWithHints();
	return false;
	}

	if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
	DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
	emitRemarkWithHints();
	return false;
	}

	if (getIsVectorized() == 1) {
	DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
	// FIXME: Add interleave.disable metadata. This will allow
	// vectorize.disable to be used without disabling the pass and errors
	// to differentiate between disabled vectorization and a width of 1.
	ORE.emit([&]() {
	return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
	"AllDisabled", L->getStartLoc(),
	L->getHeader())
	<< "loop not vectorized: vectorization and interleaving are "
	"explicitly disabled, or the loop has already been "
	"vectorized";
	});
	return false;
	}

	return true;
	}

	/// Dumps all the hint information.
	void emitRemarkWithHints() const {
	using namespace ore;

	ORE.emit([&]() {
	if (Force.Value == LoopVectorizeHints::FK_Disabled)
	return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
	TheLoop->getStartLoc(),
	TheLoop->getHeader())
	<< "loop not vectorized: vectorization is explicitly disabled";
	else {
	OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
	TheLoop->getStartLoc(),
	TheLoop->getHeader());
	R << "loop not vectorized";
	if (Force.Value == LoopVectorizeHints::FK_Enabled) {
	R << " (Force=" << NV("Force", true);
	if (Width.Value != 0)
	R << ", Vector Width=" << NV("VectorWidth", Width.Value);
	if (Interleave.Value != 0)
	R << ", Interleave Count="
	<< NV("InterleaveCount", Interleave.Value);
	R << ")";
	}
	return R;
	}
	});
	}

	unsigned getWidth() const { return Width.Value; }
	unsigned getInterleave() const { return Interleave.Value; }
	unsigned getIsVectorized() const { return IsVectorized.Value; }
	enum ForceKind getForce() const { return (ForceKind)Force.Value; }

	/// \brief If hints are provided that force vectorization, use the AlwaysPrint
	/// pass name to force the frontend to print the diagnostic.
	const char *vectorizeAnalysisPassName() const {
	if (getWidth() == 1)
	return LV_NAME;
	if (getForce() == LoopVectorizeHints::FK_Disabled)
	return LV_NAME;
	if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
	return LV_NAME;
	return OptimizationRemarkAnalysis::AlwaysPrint;
	}

	bool allowReordering() const {
	// When enabling loop hints are provided we allow the vectorizer to change
	// the order of operations that is given by the scalar loop. This is not
	// enabled by default because can be unsafe or inefficient. For example,
	// reordering floating-point operations will change the way round-off
	// error accumulates in the loop.
	return getForce() == LoopVectorizeHints::FK_Enabled \|\| getWidth() > 1;
	}

	bool isPotentiallyUnsafe() const {
	// Avoid FP vectorization if the target is unsure about proper support.
	// This may be related to the SIMD unit in the target not handling
	// IEEE 754 FP ops properly, or bad single-to-double promotions.
	// Otherwise, a sequence of vectorized loops, even without reduction,
	// could lead to different end results on the destination vectors.
	return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
	}

	void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }

	private:
	/// Find hints specified in the loop metadata and update local values.
	void getHintsFromMetadata() {
	MDNode *LoopID = TheLoop->getLoopID();
	if (!LoopID)
	return;

	// First operand should refer to the loop id itself.
	assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
	assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

	for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
	const MDString *S = nullptr;
	SmallVector<Metadata *, 4> Args;

	// The expected hint is either a MDString or a MDNode with the first
	// operand a MDString.
	if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
	if (!MD \|\| MD->getNumOperands() == 0)
	continue;
	S = dyn_cast<MDString>(MD->getOperand(0));
	for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
	Args.push_back(MD->getOperand(i));
	} else {
	S = dyn_cast<MDString>(LoopID->getOperand(i));
	assert(Args.size() == 0 && "too many arguments for MDString");
	}

	if (!S)
	continue;

	// Check if the hint starts with the loop metadata prefix.
	StringRef Name = S->getString();
	if (Args.size() == 1)
	setHint(Name, Args[0]);
	}
	}

	/// Checks string hint with one operand and set value if valid.
	void setHint(StringRef Name, Metadata *Arg) {
	if (!Name.startswith(Prefix()))
	return;
	Name = Name.substr(Prefix().size(), StringRef::npos);

	const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
	if (!C)
	return;
	unsigned Val = C->getZExtValue();

	Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
	for (auto H : Hints) {
	if (Name == H->Name) {
	if (H->validate(Val))
	H->Value = Val;
	else
	DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
	break;
	}
	}
	}

	/// Create a new hint from name / value pair.
	MDNode *createHintMetadata(StringRef Name, unsigned V) const {
	LLVMContext &Context = TheLoop->getHeader()->getContext();
	Metadata *MDs[] = {MDString::get(Context, Name),
	ConstantAsMetadata::get(
	ConstantInt::get(Type::getInt32Ty(Context), V))};
	return MDNode::get(Context, MDs);
	}

	/// Matches metadata with hint name.
	bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
	MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
	if (!Name)
	return false;

	for (auto H : HintTypes)
	if (Name->getString().endswith(H.Name))
	return true;
	return false;
	}

	/// Sets current hints into loop metadata, keeping other values intact.
	void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
	if (HintTypes.empty())
	return;

	// Reserve the first element to LoopID (see below).
	SmallVector<Metadata *, 4> MDs(1);
	// If the loop already has metadata, then ignore the existing operands.
	MDNode *LoopID = TheLoop->getLoopID();
	if (LoopID) {
	for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
	MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
	// If node in update list, ignore old value.
	if (!matchesHintMetadataName(Node, HintTypes))
	MDs.push_back(Node);
	}
	}

	// Now, add the missing hints.
	for (auto H : HintTypes)
	MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

	// Replace current metadata node with new one.
	LLVMContext &Context = TheLoop->getHeader()->getContext();
	MDNode *NewLoopID = MDNode::get(Context, MDs);
	// Set operand 0 to refer to the loop id itself.
	NewLoopID->replaceOperandWith(0, NewLoopID);

	TheLoop->setLoopID(NewLoopID);
	}

	/// The loop these hints belong to.
	const Loop *TheLoop;

	/// Interface to emit optimization remarks.
	OptimizationRemarkEmitter &ORE;
	};

	} // end anonymous namespace

	static void emitMissedWarning(Function F, Loop L,
	const LoopVectorizeHints &LH,
	OptimizationRemarkEmitter *ORE) {
	LH.emitRemarkWithHints();

	if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
	if (LH.getWidth() != 1)
	ORE->emit(DiagnosticInfoOptimizationFailure(
	DEBUG_TYPE, "FailedRequestedVectorization",
	L->getStartLoc(), L->getHeader())
	<< "loop not vectorized: "
	<< "failed explicitly specified loop vectorization");
	else if (LH.getInterleave() != 1)
	ORE->emit(DiagnosticInfoOptimizationFailure(
	DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
	L->getHeader())
	<< "loop not interleaved: "
	<< "failed explicitly specified loop interleaving");
	}
	}

	namespace {

	/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
	/// to what vectorization factor.
	/// This class does not look at the profitability of vectorization, only the
	/// legality. This class has two main kinds of checks:
	/// * Memory checks - The code in canVectorizeMemory checks if vectorization
	/// will change the order of memory accesses in a way that will change the
	/// correctness of the program.
	/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
	/// checks for a number of different conditions, such as the availability of a
	/// single induction variable, that all types are supported and vectorize-able,
	/// etc. This code reflects the capabilities of InnerLoopVectorizer.
	/// This class is also used by InnerLoopVectorizer for identifying
	/// induction variable and the different reduction variables.
	class LoopVectorizationLegality {
	public:
	/// Store the analyses and helpers used by the legality checks and build the
	/// interleaved-access analysis for \p L. \p AA and \p F are accepted but not
	/// stored by this constructor.
	LoopVectorizationLegality(
	    Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
	    TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
	    const TargetTransformInfo *TTI,
	    std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
	    OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
	    LoopVectorizeHints *H)
	    : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
	      ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}

	/// ReductionList contains the reduction descriptors for all
	/// of the reductions that were found in the loop.
	using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;

	/// InductionList saves induction variables and maps them to the
	/// induction descriptor.
	using InductionList = MapVector<PHINode *, InductionDescriptor>;

	/// RecurrenceSet contains the phi nodes that are recurrences other than
	/// inductions and reductions.
	using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;

	/// Returns true if it is legal to vectorize this loop.
	/// This does not mean that it is profitable to vectorize this
	/// loop, only that it is legal to do so.
	bool canVectorize();

	/// Returns the primary induction variable.
	PHINode *getPrimaryInduction() { return PrimaryInduction; }

	/// Returns the reduction variables found in the loop.
	ReductionList *getReductionVars() { return &Reductions; }

	/// Returns the induction variables found in the loop.
	InductionList *getInductionVars() { return &Inductions; }

	/// Return the first-order recurrences found in the loop.
	RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }

	/// Return the set of instructions to sink to handle first-order recurrences.
	/// The map is returned by reference, so callers can modify it.
	DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }

	/// Returns the widest induction type.
	Type *getWidestInductionType() { return WidestIndTy; }

	/// Returns True if V is a Phi node of an induction variable in this loop.
	bool isInductionPhi(const Value *V);

	/// Returns True if V is a cast that is part of an induction def-use chain,
	/// and had been proven to be redundant under a runtime guard (in other
	/// words, the cast has the same SCEV expression as the induction phi).
	bool isCastedInductionVariable(const Value *V);

	/// Returns True if V can be considered as an induction variable in this
	/// loop. V can be the induction phi, or some redundant cast in the def-use
	/// chain of the induction phi.
	bool isInductionVariable(const Value *V);

	/// Returns True if PN is a reduction variable in this loop.
	bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }

	/// Returns True if Phi is a first-order recurrence in this loop.
	bool isFirstOrderRecurrence(const PHINode *Phi);

	/// Return true if the block BB needs to be predicated in order for the loop
	/// to be vectorized.
	bool blockNeedsPredication(BasicBlock *BB);

	/// Check if this pointer is consecutive when vectorizing. This happens
	/// when the last index of the GEP is the induction variable, or that the
	/// pointer itself is an induction variable.
	/// This check allows us to vectorize A[idx] into a wide load/store.
	/// Returns:
	/// 0 - Stride is unknown or non-consecutive.
	/// 1 - Address is consecutive.
	/// -1 - Address is consecutive, and decreasing.
	/// NOTE: This method must only be used before modifying the original scalar
	/// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
	int isConsecutivePtr(Value *Ptr);

	/// Returns true if the value V is uniform within the loop.
	bool isUniform(Value *V);

	/// Returns the information that we collected about runtime memory check.
	const RuntimePointerChecking *getRuntimePointerChecking() const {
	return LAI->getRuntimePointerChecking();
	}

	const LoopAccessInfo *getLAI() const { return LAI; }

	/// \brief Check if \p Instr belongs to any interleaved access group.
	bool isAccessInterleaved(Instruction *Instr) {
	return InterleaveInfo.isInterleaved(Instr);
	}

	/// \brief Get the interleaved access group that \p Instr belongs to.
	///
	/// \returns whatever InterleavedAccessInfo::getInterleaveGroup() returns for
	/// \p Instr, as a pointer to const.
	const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
	  return InterleaveInfo.getInterleaveGroup(Instr);
	}

	/// \brief Returns true if an interleaved group requires a scalar iteration
	/// to handle accesses with gaps.
	bool requiresScalarEpilogue() const {
	return InterleaveInfo.requiresScalarEpilogue();
	}

	unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

	uint64_t getMaxSafeRegisterWidth() const {
	return LAI->getDepChecker().getMaxSafeRegisterWidth();
	}

	bool hasStride(Value *V) { return LAI->hasStride(V); }

	/// Returns true if the target machine supports masked store operation
	/// for the given \p DataType and kind of access to \p Ptr.
	bool isLegalMaskedStore(Type DataType, Value Ptr) {
	return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
	}

	/// Returns true if the target machine supports masked load operation
	/// for the given \p DataType and kind of access to \p Ptr.
	bool isLegalMaskedLoad(Type DataType, Value Ptr) {
	return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
	}

	/// Returns true if the target machine supports masked scatter operation
	/// for the given \p DataType.
	bool isLegalMaskedScatter(Type *DataType) {
	return TTI->isLegalMaskedScatter(DataType);
	}

	/// Returns true if the target machine supports masked gather operation
	/// for the given \p DataType.
	bool isLegalMaskedGather(Type *DataType) {
	return TTI->isLegalMaskedGather(DataType);
	}

	/// Returns true if the target machine can represent \p V as a masked gather
	/// or scatter operation: \p V must be a load whose element type is legal for
	/// a masked gather, or a store whose element type is legal for a masked
	/// scatter. Any other value yields false.
	bool isLegalGatherOrScatter(Value *V) {
	  auto *LI = dyn_cast<LoadInst>(V);
	  auto *SI = dyn_cast<StoreInst>(V);
	  if (!LI && !SI)
	    return false;
	  // The element type is taken from the pointee type of the accessed pointer.
	  auto *Ptr = getPointerOperand(V);
	  auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
	  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
	}

	/// Returns true if vector representation of the instruction \p I
	/// requires mask.
	bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }

	unsigned getNumStores() const { return LAI->getNumStores(); }
	unsigned getNumLoads() const { return LAI->getNumLoads(); }
	unsigned getNumPredStores() const { return NumPredStores; }

	/// Returns true if \p I is an instruction that will be scalarized with
	/// predication. Such instructions include conditional stores and
	/// instructions that may divide by zero.
	bool isScalarWithPredication(Instruction *I);

	/// Returns true if \p I is a memory instruction with consecutive memory
	/// access that can be widened.
	bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

	// Returns true if the NoNaN attribute is set on the function.
	bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }

	private:
	/// Check if a single basic block loop is vectorizable.
	/// At this point we know that this is a loop with a constant trip count
	/// and we only need to check individual instructions.
	bool canVectorizeInstrs();

	/// When we vectorize loops we may change the order in which
	/// we read and write from memory. This method checks if it is
	/// legal to vectorize the code, considering only memory constraints.
	/// Returns true if the loop is vectorizable.
	bool canVectorizeMemory();

	/// Return true if we can vectorize this loop using the IF-conversion
	/// transformation.
	bool canVectorizeWithIfConvert();

	/// Return true if all of the instructions in the block can be speculatively
	/// executed. \p SafePtrs is a list of addresses that are known to be legal
	/// and we know that we can read from them without segfault.
	bool blockCanBePredicated(BasicBlock BB, SmallPtrSetImpl<Value > &SafePtrs);

	/// Updates the vectorization state by adding \p Phi to the inductions list.
	/// This can set \p Phi as the main induction of the loop if \p Phi is a
	/// better choice for the main induction than the existing one.
	void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
	SmallPtrSetImpl<Value *> &AllowedExit);

	/// Build the analysis remark that reports why vectorization failed.
	///
	/// \p RemarkName identifies the remark. When \p I is given it is the
	/// instruction that blocks vectorization; when it is null the loop itself
	/// supplies the remark location. The pass name comes from the hints, and the
	/// work is forwarded to the file-level ::createMissedAnalysis helper.
	/// \return the remark object, ready to be streamed to.
	OptimizationRemarkAnalysis
	createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
	  const char *PassName = Hints->vectorizeAnalysisPassName();
	  return ::createMissedAnalysis(PassName, RemarkName, TheLoop, I);
	}

	/// \brief If an access has a symbolic stride, this maps the pointer value to
	/// the stride symbol.
	/// \return the map owned by the loop-access info, or nullptr while that
	/// info has not been set up yet.
	const ValueToValueMap *getSymbolicStrides() {
	  // FIXME: Currently, the set of symbolic strides is sometimes queried before
	  // it's collected. This happens from canVectorizeWithIfConvert, when the
	  // pointer is checked to reference consecutive elements suitable for a
	  // masked access.
	  if (!LAI)
	    return nullptr;
	  return &LAI->getSymbolicStrides();
	}

	/// NOTE(review): presumably counts the predicated (conditional) stores seen
	/// while checking the loop -- confirm against its users.
	unsigned NumPredStores = 0;

	/// The loop that we evaluate.
	Loop *TheLoop;

	/// A wrapper around ScalarEvolution used to add runtime SCEV checks.
	/// Applies dynamic knowledge to simplify SCEV expressions in the context
	/// of existing SCEV assumptions. The analysis will also add a minimal set
	/// of new predicates if this is required to enable vectorization and
	/// unrolling.
	PredicatedScalarEvolution &PSE;

	/// Target Library Info.
	TargetLibraryInfo *TLI;

	/// Target Transform Info.
	const TargetTransformInfo *TTI;

	/// Dominator Tree.
	DominatorTree *DT;

	/// LoopAccess analysis: a callback that yields the LoopAccessInfo of a loop.
	std::function<const LoopAccessInfo &(Loop &)> *GetLAA;

	/// The loop-accesses info corresponding to this loop. This pointer is
	/// null until canVectorizeMemory sets it up.
	const LoopAccessInfo *LAI = nullptr;

	/// Interface to emit optimization remarks.
	OptimizationRemarkEmitter *ORE;

	/// The interleave access information contains groups of interleaved accesses
	/// with the same stride and close to each other.
	InterleavedAccessInfo InterleaveInfo;

	// --- vectorization state --- //

	/// Holds the primary induction variable. This is the counter of the
	/// loop.
	PHINode *PrimaryInduction = nullptr;

	/// Holds the reduction variables.
	ReductionList Reductions;

	/// Holds all of the induction variables that we found in the loop.
	/// Notice that inductions don't need to start at zero and that induction
	/// variables can be pointers.
	InductionList Inductions;

	/// Holds all the casts that participate in the update chain of the induction
	/// variables, and that have been proven to be redundant (possibly under a
	/// runtime guard). These casts can be ignored when creating the vectorized
	/// loop body.
	SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;

	/// Holds the phi nodes that are first-order recurrences.
	RecurrenceSet FirstOrderRecurrences;

	/// Holds instructions that need to sink past other instructions to handle
	/// first-order recurrences. Both key and value are instruction pointers.
	DenseMap<Instruction *, Instruction *> SinkAfter;

	/// Holds the widest induction type encountered.
	Type *WidestIndTy = nullptr;

	/// Allowed outside users. This holds the induction and reduction
	/// vars which can be accessed from outside the loop.
	SmallPtrSet<Value *, 4> AllowedExit;

	/// Can we assume the absence of NaNs.
	bool HasFunNoNaNAttr = false;

	/// Vectorization requirements that will go through late-evaluation.
	LoopVectorizationRequirements *Requirements;

	/// Used to emit an analysis of any legality issues.
	LoopVectorizeHints *Hints;

	/// While vectorizing these instructions we have to generate a
	/// call to the appropriate masked intrinsic.
	SmallPtrSet<const Instruction *, 8> MaskedOp;
	};

	/// LoopVectorizationCostModel - estimates the expected speedups due to
	/// vectorization.
	/// In many cases vectorization is not profitable. This can happen because of
	/// a number of reasons. In this class we mainly attempt to predict the
	/// expected speedup/slowdowns due to the supported instruction set. We use the
	/// TargetTransformInfo to query the different backends for the cost of
	/// different operations.
	class LoopVectorizationCostModel {
	public:
	  /// Stores the loop, the analyses and the hints; no analysis is run here.
	  /// Every pointer argument is kept as-is in the member of the same role.
	  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
	                             LoopInfo *LI, LoopVectorizationLegality *Legal,
	                             const TargetTransformInfo &TTI,
	                             const TargetLibraryInfo *TLI, DemandedBits *DB,
	                             AssumptionCache *AC,
	                             OptimizationRemarkEmitter *ORE, const Function *F,
	                             const LoopVectorizeHints *Hints)
	      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
	        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}

	  /// \return An upper bound for the vectorization factor, or None if
	  /// vectorization should be avoided up front.
	  Optional<unsigned> computeMaxVF(bool OptForSize);

	  /// Information about vectorization costs
	  struct VectorizationFactor {
	    // Vector width with best cost
	    unsigned Width;

	    // Cost of the loop with that width
	    unsigned Cost;
	  };

	  /// \return The most profitable vectorization factor and the cost of that VF.
	  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
	  /// then this vectorization factor will be selected if vectorization is
	  /// possible.
	  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

	  /// Setup cost-based decisions for user vectorization factor.
	  void selectUserVectorizationFactor(unsigned UserVF) {
	    collectUniformsAndScalars(UserVF);
	    collectInstsToScalarize(UserVF);
	  }

	  /// \return The size (in bits) of the smallest and widest types in the code
	  /// that needs to be vectorized. We ignore values that remain scalar such as
	  /// 64 bit loop indices.
	  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

	  /// \return The desired interleave count.
	  /// If interleave count has been specified by metadata it will be returned.
	  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
	  /// are the selected vectorization factor and the cost of the selected VF.
	  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
	                                 unsigned LoopCost);

	  /// Memory access instruction may be vectorized in more than one way.
	  /// Form of instruction after vectorization depends on cost.
	  /// This function takes cost-based decisions for Load/Store instructions
	  /// and collects them in a map. This decisions map is used for building
	  /// the lists of loop-uniform and loop-scalar instructions.
	  /// The calculated cost is saved with widening decision in order to
	  /// avoid redundant calculations.
	  void setCostBasedWideningDecision(unsigned VF);

	  /// \brief A struct that represents some properties of the register usage
	  /// of a loop.
	  struct RegisterUsage {
	    /// Holds the number of loop invariant values that are used in the loop.
	    unsigned LoopInvariantRegs;

	    /// Holds the maximum number of concurrent live intervals in the loop.
	    unsigned MaxLocalUsers;

	    /// Holds the number of instructions in the loop.
	    unsigned NumInstructions;
	  };

	  /// \return Returns information about the register usages of the loop for the
	  /// given vectorization factors.
	  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

	  /// Collect values we want to ignore in the cost model.
	  void collectValuesToIgnore();

	  /// \returns The smallest bitwidth each instruction can be represented with.
	  /// The vector equivalents of these instructions should be truncated to this
	  /// type.
	  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
	    return MinBWs;
	  }

	  /// \returns True if it is more profitable to scalarize instruction \p I for
	  /// vectorization factor \p VF.
	  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
	    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
	    // Named so that it does not shadow the Scalars member.
	    auto ScalarizedIt = InstsToScalarize.find(VF);
	    assert(ScalarizedIt != InstsToScalarize.end() &&
	           "VF not yet analyzed for scalarization profitability");
	    return ScalarizedIt->second.count(I);
	  }

	  /// Returns true if \p I is known to be uniform after vectorization.
	  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
	    if (VF == 1)
	      return true;
	    assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
	    auto UniformsPerVF = Uniforms.find(VF);
	    return UniformsPerVF->second.count(I);
	  }

	  /// Returns true if \p I is known to be scalar after vectorization.
	  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
	    if (VF == 1)
	      return true;
	    assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
	    auto ScalarsPerVF = Scalars.find(VF);
	    return ScalarsPerVF->second.count(I);
	  }

	  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
	  /// for vectorization factor \p VF.
	  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
	    return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
	           !isScalarAfterVectorization(I, VF);
	  }

	  /// Decision that was taken during cost calculation for memory instruction.
	  enum InstWidening {
	    CM_Unknown,
	    CM_Widen,         // For consecutive accesses with stride +1.
	    CM_Widen_Reverse, // For consecutive accesses with stride -1.
	    CM_Interleave,
	    CM_GatherScatter,
	    CM_Scalarize
	  };

	  /// Save vectorization decision \p W and \p Cost taken by the cost model for
	  /// instruction \p I and vector width \p VF.
	  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
	                           unsigned Cost) {
	    assert(VF >= 2 && "Expected VF >=2");
	    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
	  }

	  /// Save vectorization decision \p W and \p Cost taken by the cost model for
	  /// interleaving group \p Grp and vector width \p VF.
	  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
	                           InstWidening W, unsigned Cost) {
	    assert(VF >= 2 && "Expected VF >=2");
	    // Broadcast this decision to all instructions inside the group.
	    // But the cost will be assigned to one instruction only.
	    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
	      if (auto *I = Grp->getMember(i)) {
	        if (Grp->getInsertPos() == I)
	          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
	        else
	          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
	      }
	    }
	  }

	  /// Return the cost model decision for the given instruction \p I and vector
	  /// width \p VF. Return CM_Unknown if this instruction did not pass
	  /// through the cost modeling.
	  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
	    assert(VF >= 2 && "Expected VF >=2");
	    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
	    auto Itr = WideningDecisions.find(InstOnVF);
	    if (Itr == WideningDecisions.end())
	      return CM_Unknown;
	    return Itr->second.first;
	  }

	  /// Return the vectorization cost for the given instruction \p I and vector
	  /// width \p VF.
	  unsigned getWideningCost(Instruction *I, unsigned VF) {
	    assert(VF >= 2 && "Expected VF >=2");
	    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
	    assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
	    return WideningDecisions[InstOnVF].second;
	  }

	  /// Return True if instruction \p I is an optimizable truncate whose operand
	  /// is an induction variable. Such a truncate will be removed by adding a new
	  /// induction variable with the destination type.
	  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
	    // If the instruction is not a truncate, return false.
	    auto *Trunc = dyn_cast<TruncInst>(I);
	    if (!Trunc)
	      return false;

	    // Get the source and destination types of the truncate.
	    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
	    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

	    // If the truncate is free for the given types, return false. Replacing a
	    // free truncate with an induction variable would add an induction variable
	    // update instruction to each iteration of the loop. We exclude from this
	    // check the primary induction variable since it will need an update
	    // instruction regardless.
	    Value *Op = Trunc->getOperand(0);
	    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
	      return false;

	    // If the truncated value is not an induction variable, return false.
	    return Legal->isInductionPhi(Op);
	  }

	  /// Collects the instructions to scalarize for each predicated instruction in
	  /// the loop.
	  void collectInstsToScalarize(unsigned VF);

	  /// Collect Uniform and Scalar values for the given \p VF.
	  /// The sets depend on CM decision for Load/Store instructions
	  /// that may be vectorized as interleave, gather-scatter or scalarized.
	  void collectUniformsAndScalars(unsigned VF) {
	    // Do the analysis once.
	    if (VF == 1 || Uniforms.count(VF))
	      return;
	    setCostBasedWideningDecision(VF);
	    collectLoopUniforms(VF);
	    collectLoopScalars(VF);
	  }

	private:
	  /// \return An upper bound for the vectorization factor, larger than zero.
	  /// One is returned if vectorization should best be avoided due to cost.
	  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);

	  /// The vectorization cost is a combination of the cost itself and a boolean
	  /// indicating whether any of the contributing operations will actually
	  /// operate on vector values after type legalization in the backend. If this
	  /// latter value is false, then all operations will be scalarized (i.e. no
	  /// vectorization has actually taken place).
	  using VectorizationCostTy = std::pair<unsigned, bool>;

	  /// Returns the expected execution cost. The unit of the cost does
	  /// not matter because we use the 'cost' units to compare different
	  /// vector widths. The cost that is returned is not normalized by
	  /// the factor width.
	  VectorizationCostTy expectedCost(unsigned VF);

	  /// Returns the execution time cost of an instruction for a given vector
	  /// width. Vector width of one means scalar.
	  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

	  /// The cost-computation logic from getInstructionCost which provides
	  /// the vector type as an output parameter.
	  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

	  /// Calculate vectorization cost of memory instruction \p I.
	  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

	  /// The cost computation for scalarized memory instruction.
	  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

	  /// The cost computation for interleaving group of memory instructions.
	  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

	  /// The cost computation for Gather/Scatter instruction.
	  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

	  /// The cost computation for widening instruction \p I with consecutive
	  /// memory access.
	  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

	  /// The cost calculation for Load instruction \p I with uniform pointer -
	  /// scalar load + broadcast.
	  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

	  /// Returns whether the instruction is a load or store and will be emitted
	  /// as a vector operation.
	  bool isConsecutiveLoadOrStore(Instruction *I);

	  /// Create an analysis remark that explains why vectorization failed
	  ///
	  /// \p RemarkName is the identifier for the remark. \return the remark object
	  /// that can be streamed to.
	  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
	    return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
	                                  RemarkName, TheLoop);
	  }

	  /// Map of scalar integer values to the smallest bitwidth they can be legally
	  /// represented as. The vector equivalents of these values should be truncated
	  /// to this type.
	  MapVector<Instruction *, uint64_t> MinBWs;

	  /// A type representing the costs for instructions if they were to be
	  /// scalarized rather than vectorized. The entries are Instruction-Cost
	  /// pairs.
	  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

	  /// A set containing all BasicBlocks that are known to be present after
	  /// vectorization as a predicated block.
	  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

	  /// A map holding scalar costs for different vectorization factors. The
	  /// presence of a cost for an instruction in the mapping indicates that the
	  /// instruction will be scalarized when vectorizing with the associated
	  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
	  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

	  /// Holds the instructions known to be uniform after vectorization.
	  /// The data is collected per VF.
	  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

	  /// Holds the instructions known to be scalar after vectorization.
	  /// The data is collected per VF.
	  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

	  /// Holds the instructions (address computations) that are forced to be
	  /// scalarized.
	  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

	  /// Returns the expected difference in cost from scalarizing the expression
	  /// feeding a predicated instruction \p PredInst. The instructions to
	  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
	  /// non-negative return value implies the expression will be scalarized.
	  /// Currently, only single-use chains are considered for scalarization.
	  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
	                              unsigned VF);

	  /// Collect the instructions that are uniform after vectorization. An
	  /// instruction is uniform if we represent it with a single scalar value in
	  /// the vectorized loop corresponding to each vector iteration. Examples of
	  /// uniform instructions include pointer operands of consecutive or
	  /// interleaved memory accesses. Note that although uniformity implies an
	  /// instruction will be scalar, the reverse is not true. In general, a
	  /// scalarized instruction will be represented by VF scalar values in the
	  /// vectorized loop, each corresponding to an iteration of the original
	  /// scalar loop.
	  void collectLoopUniforms(unsigned VF);

	  /// Collect the instructions that are scalar after vectorization. An
	  /// instruction is scalar if it is known to be uniform or will be scalarized
	  /// during vectorization. Non-uniform scalarized instructions will be
	  /// represented by VF values in the vectorized loop, each corresponding to an
	  /// iteration of the original scalar loop.
	  void collectLoopScalars(unsigned VF);

	  /// Keeps cost model vectorization decision and cost for instructions.
	  /// Right now it is used for memory instructions only.
	  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
	                                std::pair<InstWidening, unsigned>>;

	  DecisionList WideningDecisions;

	public:
	  /// The loop that we evaluate.
	  Loop *TheLoop;

	  /// Predicated scalar evolution analysis.
	  PredicatedScalarEvolution &PSE;

	  /// Loop Info analysis.
	  LoopInfo *LI;

	  /// Vectorization legality.
	  LoopVectorizationLegality *Legal;

	  /// Vector target information.
	  const TargetTransformInfo &TTI;

	  /// Target Library Info.
	  const TargetLibraryInfo *TLI;

	  /// Demanded bits analysis.
	  DemandedBits *DB;

	  /// Assumption cache.
	  AssumptionCache *AC;

	  /// Interface to emit optimization remarks.
	  OptimizationRemarkEmitter *ORE;

	  /// The function handed to the constructor as \p F.
	  const Function *TheFunction;

	  /// Loop Vectorize Hint.
	  const LoopVectorizeHints *Hints;

	  /// Values to ignore in the cost model.
	  SmallPtrSet<const Value *, 16> ValuesToIgnore;

	  /// Values to ignore in the cost model when VF > 1.
	  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
	};

	} // end anonymous namespace

	namespace llvm {

	/// InnerLoopVectorizer vectorizes loops which contain only one basic
	/// LoopVectorizationPlanner - drives the vectorization process after having
	/// passed Legality checks.
	/// The planner builds and optimizes the Vectorization Plans which record the
	/// decisions how to vectorize the given loop. In particular, represent the
	/// control-flow of the vectorized version, the replication of instructions that
	/// are to be scalarized, and interleave access groups.
	class LoopVectorizationPlanner {
	/// The loop that we evaluate.
	Loop *OrigLoop;

	/// Loop Info analysis.
	LoopInfo *LI;

	/// Target Library Info.
	const TargetLibraryInfo *TLI;

	/// Target Transform Info.
	const TargetTransformInfo *TTI;

	/// The legality analysis.
	LoopVectorizationLegality *Legal;

	/// The profitablity analysis.
	LoopVectorizationCostModel &CM;

	using VPlanPtr = std::unique_ptr<VPlan>;

	SmallVector<VPlanPtr, 4> VPlans;

	/// This class is used to enable the VPlan to invoke a method of ILV. This is
	/// needed until the method is refactored out of ILV and becomes reusable.
	struct VPCallbackILV : public VPCallback {
	InnerLoopVectorizer &ILV;

	VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}

	Value getOrCreateVectorValues(Value V, unsigned Part) override {
	return ILV.getOrCreateVectorValue(V, Part);
	}
	};

	/// A builder used to construct the current plan.
	VPBuilder Builder;

	/// When we if-convert we need to create edge masks. We have to cache values
	/// so that we don't end up with exponential recursion/IR. Note that
	/// if-conversion currently takes place during VPlan-construction, so these
	/// caches are only used at that stage.
	using EdgeMaskCacheTy =
	DenseMap<std::pair<BasicBlock , BasicBlock >, VPValue *>;
	using BlockMaskCacheTy = DenseMap<BasicBlock , VPValue >;
	EdgeMaskCacheTy EdgeMaskCache;
	BlockMaskCacheTy BlockMaskCache;

	unsigned BestVF = 0;
	unsigned BestUF = 0;

	public:
	LoopVectorizationPlanner(Loop L, LoopInfo LI, const TargetLibraryInfo *TLI,
	const TargetTransformInfo *TTI,
	LoopVectorizationLegality *Legal,
	LoopVectorizationCostModel &CM)
	: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}

	/// Plan how to best vectorize, return the best VF and its cost.
	LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
	unsigned UserVF);

	/// Finalize the best decision and dispose of all other VPlans.
	void setBestPlan(unsigned VF, unsigned UF);

	/// Generate the IR code for the body of the vectorized loop according to the
	/// best selected VPlan.
	void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);

	void printPlans(raw_ostream &O) {
	for (const auto &Plan : VPlans)
	O << *Plan;
	}

	protected:
	/// Collect the instructions from the original loop that would be trivially
	/// dead in the vectorized loop if generated.
	void collectTriviallyDeadInstructions(
	SmallPtrSetImpl<Instruction *> &DeadInstructions);

	/// A range of powers-of-2 vectorization factors with fixed start and
	/// adjustable end. The range includes start and excludes end, e.g.,:
	/// [1, 9) = {1, 2, 4, 8}
	struct VFRange {
	// A power of 2.
	const unsigned Start;

	// Need not be a power of 2. If End <= Start range is empty.
	unsigned End;
	};

	/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
	/// \p Predicate on Range.Start, possibly decreasing Range.End such that the
	/// returned value holds for the entire \p Range.
	bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
	VFRange &Range);

	/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
	/// according to the information gathered by Legal when it checked if it is
	/// legal to vectorize the loop.
	void buildVPlans(unsigned MinVF, unsigned MaxVF);

	private:
	/// A helper function that computes the predicate of the block BB, assuming
	/// that the header block of the loop is set to True. It returns the entry
	/// mask for the block BB.
	VPValue createBlockInMask(BasicBlock BB, VPlanPtr &Plan);

	/// A helper function that computes the predicate of the edge between SRC
	/// and DST.
	VPValue createEdgeMask(BasicBlock Src, BasicBlock *Dst, VPlanPtr &Plan);

	/// Check if \I belongs to an Interleave Group within the given VF \p Range,
	/// \return true in the first returned value if so and false otherwise.
	/// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG
	/// for \p Range.Start, and provide it as the second returned value.
	/// Note that if \I is an adjunct member of an IG for \p Range.Start, the
	/// \return value is <true, nullptr>, as it is handled by another recipe.
	/// \p Range.End may be decreased to ensure same decision from \p Range.Start
	/// to \p Range.End.
	VPInterleaveRecipe tryToInterleaveMemory(Instruction I, VFRange &Range);

	// Check if \I is a memory instruction to be widened for \p Range.Start and
	// potentially masked. Such instructions are handled by a recipe that takes an
	// additional VPInstruction for the mask.
	VPWidenMemoryInstructionRecipe tryToWidenMemory(Instruction I,
	VFRange &Range,
	VPlanPtr &Plan);

	/// Check if an induction recipe should be constructed for \I within the given
	/// VF \p Range. If so build and return it. If not, return null. \p Range.End
	/// may be decreased to ensure same decision from \p Range.Start to
	/// \p Range.End.
	VPWidenIntOrFpInductionRecipe tryToOptimizeInduction(Instruction I,
	VFRange &Range);

	/// Handle non-loop phi nodes. Currently all such phi nodes are turned into
	/// a sequence of select instructions as the vectorizer currently performs
	/// full if-conversion.
	VPBlendRecipe tryToBlend(Instruction I, VPlanPtr &Plan);

	/// Check if \p I can be widened within the given VF \p Range. If \p I can be
	/// widened for \p Range.Start, check if the last recipe of \p VPBB can be
	/// extended to include \p I or else build a new VPWidenRecipe for it and
	/// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
	/// false otherwise. Range.End may be decreased to ensure same decision from
	/// \p Range.Start to \p Range.End.
	bool tryToWiden(Instruction I, VPBasicBlock VPBB, VFRange &Range);

	/// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
	/// is predicated. \return \p VPBB augmented with this new recipe if \p I is
	/// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
	/// Region. Update the packing decision of predicated instructions if they
	/// feed \p I. Range.End may be decreased to ensure same recipe behavior from
	/// \p Range.Start to \p Range.End.
	VPBasicBlock *handleReplication(
	Instruction I, VFRange &Range, VPBasicBlock VPBB,
	DenseMap<Instruction , VPReplicateRecipe > &PredInst2Recipe,
	VPlanPtr &Plan);

	/// Create a replicating region for instruction \p I that requires
	/// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
	VPRegionBlock createReplicateRegion(Instruction I, VPRecipeBase *PredRecipe,
	VPlanPtr &Plan);

	/// Build a VPlan according to the information gathered by Legal. \return a
	/// VPlan for vectorization factors \p Range.Start and up to \p Range.End
	/// exclusive, possibly decreasing \p Range.End.
	VPlanPtr buildVPlan(VFRange &Range,
	const SmallPtrSetImpl<Value *> &NeedDef);
	};

	} // end namespace llvm

	namespace {

	/// \brief This holds vectorization requirements that must be verified late in
	/// the process. The requirements are set by legalize and costmodel. Once
	/// vectorization has been determined to be possible and profitable the
	/// requirements can be verified by looking for metadata or compiler options.
	/// For example, some loops require FP commutativity which is only allowed if
	/// vectorization is explicitly specified or if the fast-math compiler option
	/// has been provided.
	/// Late evaluation of these requirements allows helpful diagnostics to be
	/// composed that tells the user what need to be done to vectorize the loop. For
	/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
	/// evaluation should be used only when diagnostics can generated that can be
	/// followed by a non-expert user.
	class LoopVectorizationRequirements {
	public:
	LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}

	void addUnsafeAlgebraInst(Instruction *I) {
	// First unsafe algebra instruction.
	if (!UnsafeAlgebraInst)
	UnsafeAlgebraInst = I;
	}

	void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }

	bool doesNotMeet(Function F, Loop L, const LoopVectorizeHints &Hints) {
	const char *PassName = Hints.vectorizeAnalysisPassName();
	bool Failed = false;
	if (UnsafeAlgebraInst && !Hints.allowReordering()) {
	ORE.emit([&]() {
	return OptimizationRemarkAnalysisFPCommute(
	PassName, "CantReorderFPOps",
	UnsafeAlgebraInst->getDebugLoc(),
	UnsafeAlgebraInst->getParent())
	<< "loop not vectorized: cannot prove it is safe to reorder "
	"floating-point operations";
	});
	Failed = true;
	}

	// Test if runtime memcheck thresholds are exceeded.
	bool PragmaThresholdReached =
	NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
	bool ThresholdReached =
	NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
	if ((ThresholdReached && !Hints.allowReordering()) \|\|
	PragmaThresholdReached) {
	ORE.emit([&]() {
	return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
	L->getStartLoc(),
	L->getHeader())
	<< "loop not vectorized: cannot prove it is safe to reorder "
	"memory operations";
	});
	DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
	Failed = true;
	}

	return Failed;
	}

	private:
	unsigned NumRuntimePointerChecks = 0;
	Instruction *UnsafeAlgebraInst = nullptr;

	/// Interface to emit optimization remarks.
	OptimizationRemarkEmitter &ORE;
	};

	} // end anonymous namespace

	/// Append to \p V every loop nested in \p L (or \p L itself) that contains
	/// no sub-loops and for which hasCyclesInLoopBody() returns false.
	static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
	  // A loop with sub-loops is never added itself; only its sub-loops are
	  // visited, recursively.
	  if (!L.empty()) {
	    for (Loop *SubLoop : L)
	      addAcyclicInnerLoop(*SubLoop, V);
	    return;
	  }

	  // Innermost loop: keep it only when its body is free of cycles.
	  if (hasCyclesInLoopBody(L))
	    return;
	  V.push_back(&L);
	}

	namespace {

	/// The LoopVectorize Pass.
	struct LoopVectorize : public FunctionPass {
	/// Pass identification, replacement for typeid
	static char ID;

	LoopVectorizePass Impl;

	explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
	: FunctionPass(ID) {
	Impl.DisableUnrolling = NoUnrolling;
	Impl.AlwaysVectorize = AlwaysVectorize;
	initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
	}

	bool runOnFunction(Function &F) override {
	if (skipFunction(F))
	return false;

	auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
	auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
	auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
	auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
	auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
	auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
	auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
	auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
	auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
	auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

	std::function<const LoopAccessInfo &(Loop &)> GetLAA =
	[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

	return Impl.runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC,
	GetLAA, *ORE);
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<AssumptionCacheTracker>();
	AU.addRequired<BlockFrequencyInfoWrapperPass>();
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<LoopInfoWrapperPass>();
	AU.addRequired<ScalarEvolutionWrapperPass>();
	AU.addRequired<TargetTransformInfoWrapperPass>();
	AU.addRequired<AAResultsWrapperPass>();
	AU.addRequired<LoopAccessLegacyAnalysis>();
	AU.addRequired<DemandedBitsWrapperPass>();
	AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
	AU.addPreserved<LoopInfoWrapperPass>();
	AU.addPreserved<DominatorTreeWrapperPass>();
	AU.addPreserved<BasicAAWrapperPass>();
	AU.addPreserved<GlobalsAAWrapperPass>();
	}
	};

	} // end anonymous namespace

	//===----------------------------------------------------------------------===//
	// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
	// LoopVectorizationCostModel and LoopVectorizationPlanner.
	//===----------------------------------------------------------------------===//

	/// Create a vector of VF lanes that all hold the scalar \p V and return it.
	/// When \p V is loop invariant and is not an instruction of the vector loop
	/// body, the splat is emitted before the terminator of the vector loop
	/// preheader; otherwise it is emitted at the builder's current position.
	Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
	  // We need to place the broadcast of invariant variables outside the loop.
	  Instruction *Instr = dyn_cast<Instruction>(V);
	  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
	  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

	  // Place the code for broadcasting invariant variables in the new preheader.
	  // The guard restores the builder's insertion point on return.
	  IRBuilder<>::InsertPointGuard Guard(Builder);
	  if (Invariant)
	    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

	  // Broadcast the scalar into all locations in the vector.
	  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

	  return Shuf;
	}

	void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
	const InductionDescriptor &II, Value Step, Instruction EntryVal) {
	Value *Start = II.getStartValue();

	// Construct the initial value of the vector IV in the vector loop preheader
	auto CurrIP = Builder.saveIP();
	Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
	if (isa<TruncInst>(EntryVal)) {
	assert(Start->getType()->isIntegerTy() &&
	"Truncation requires an integer type");
	auto *TruncType = cast<IntegerType>(EntryVal->getType());
	Step = Builder.CreateTrunc(Step, TruncType);
	Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
	}
	Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
	Value *SteppedStart =
	getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

	// We create vector phi nodes for both integer and floating-point induction
	// variables. Here, we determine the kind of arithmetic we will perform.
	Instruction::BinaryOps AddOp;
	Instruction::BinaryOps MulOp;
	if (Step->getType()->isIntegerTy()) {
	AddOp = Instruction::Add;
	MulOp = Instruction::Mul;
	} else {
	AddOp = II.getInductionOpcode();
	MulOp = Instruction::FMul;
	}

	// Multiply the vectorization factor by the step using integer or
	// floating-point arithmetic as appropriate.
	Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
	Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

	// Create a vector splat to use in the induction update.
	//
	// FIXME: If the step is non-constant, we create the vector splat with
	// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
	// handle a constant vector splat.
	Value *SplatVF = isa<Constant>(Mul)
	? ConstantVector::getSplat(VF, cast<Constant>(Mul))
	: Builder.CreateVectorSplat(VF, Mul);
	Builder.restoreIP(CurrIP);

	// We may need to add the step a number of times, depending on the unroll
	// factor. The last of those goes into the PHI.
	PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
	&*LoopVectorBody->getFirstInsertionPt());
	Instruction *LastInduction = VecInd;
	for (unsigned Part = 0; Part < UF; ++Part) {
	VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
	- recordVectorLoopValueForInductionCast(II, LastInduction, Part);
	+
	if (isa<TruncInst>(EntryVal))
	addMetadata(LastInduction, EntryVal);
	+ else
	+ recordVectorLoopValueForInductionCast(II, LastInduction, Part);
	+
	LastInduction = cast<Instruction>(addFastMathFlag(
	Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
	}

	// Move the last step to the end of the latch block. This ensures consistent
	// placement of all induction updates.
	auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
	auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
	auto *ICmp = cast<Instruction>(Br->getCondition());
	LastInduction->moveBefore(ICmp);
	LastInduction->setName("vec.ind.next");

	VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
	VecInd->addIncoming(LastInduction, LoopVectorLatch);
	}

	// Returns true when the cost model reports \p I, at the current VF, either as
	// scalar after vectorization or as profitable to scalarize.
	bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
	return Cost->isScalarAfterVectorization(I, VF) ||
	Cost->isProfitableToScalarize(I, VF);
	}

	// Returns true when a scalar form of the induction \p IV is needed: either
	// \p IV itself should be scalarized, or at least one of its users inside the
	// original loop should be.
	bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
	  if (shouldScalarizeInstruction(IV))
	    return true;

	  // Walk the users and stop at the first one that lives in the original loop
	  // and is going to be scalarized. cast<> requires each user to be an
	  // Instruction.
	  for (User *IVUser : IV->users()) {
	    auto *UserInst = cast<Instruction>(IVUser);
	    if (OrigLoop->contains(UserInst) && shouldScalarizeInstruction(UserInst))
	      return true;
	  }
	  return false;
	}

	// Maps the first cast instruction attached to induction \p ID onto
	// \p VectorLoopVal in VectorLoopValueMap; does nothing when \p ID has no
	// casts.
	//
	// \p Part  unroll part the value belongs to.
	// \p Lane  when below UINT_MAX the value is stored as the scalar for
	//          {Part, Lane}; otherwise it is stored as the vector value of
	//          \p Part.
	void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
	const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
	unsigned Lane) {
	const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
	if (Casts.empty())
	return;
	// Only the first Cast instruction in the Casts vector is of interest.
	// The rest of the Casts (if exist) have no uses outside the
	// induction update chain itself.
	Instruction *CastInst = *Casts.begin();
	if (Lane < UINT_MAX)
	VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
	else
	VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
	}

	void InnerLoopVectorizer::widenIntOrFpInduction(PHINode IV, TruncInst Trunc) {
	assert((IV->getType()->isIntegerTy() \|\| IV != OldInduction) &&
	"Primary induction variable must have an integer type");

	auto II = Legal->getInductionVars()->find(IV);
	assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

	auto ID = II->second;
	assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

	// The scalar value to broadcast. This will be derived from the canonical
	// induction variable.
	Value *ScalarIV = nullptr;

	// The value from the original loop to which we are mapping the new induction
	// variable.
	Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

	// True if we have vectorized the induction variable.
	auto VectorizedIV = false;

	// Determine if we want a scalar version of the induction variable. This is
	// true if the induction variable itself is not widened, or if it has at
	// least one user in the loop that is not widened.
	auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

	// Generate code for the induction step. Note that induction steps are
	// required to be loop-invariant
	assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
	"Induction step should be loop invariant");
	auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
	Value *Step = nullptr;
	if (PSE.getSE()->isSCEVable(IV->getType())) {
	SCEVExpander Exp(*PSE.getSE(), DL, "induction");
	Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
	LoopVectorPreHeader->getTerminator());
	} else {
	Step = cast<SCEVUnknown>(ID.getStep())->getValue();
	}

	// Try to create a new independent vector induction variable. If we can't
	// create the phi node, we will splat the scalar induction variable in each
	// loop iteration.
	if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
	createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
	VectorizedIV = true;
	}

	// If we haven't yet vectorized the induction variable, or if we will create
	// a scalar one, we need to define the scalar induction variable and step
	// values. If we were given a truncation type, truncate the canonical
	// induction variable and step. Otherwise, derive these values from the
	// induction descriptor.
	if (!VectorizedIV \|\| NeedsScalarIV) {
	ScalarIV = Induction;
	if (IV != OldInduction) {
	ScalarIV = IV->getType()->isIntegerTy()
	? Builder.CreateSExtOrTrunc(Induction, IV->getType())
	: Builder.CreateCast(Instruction::SIToFP, Induction,
	IV->getType());
	ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
	ScalarIV->setName("offset.idx");
	}
	if (Trunc) {
	auto *TruncType = cast<IntegerType>(Trunc->getType());
	assert(Step->getType()->isIntegerTy() &&
	"Truncation requires an integer step");
	ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
	Step = Builder.CreateTrunc(Step, TruncType);
	}
	}

	// If we haven't yet vectorized the induction variable, splat the scalar
	// induction variable, and build the necessary step vectors.
	+ // TODO: Don't do it unless the vectorized IV is really required.
	if (!VectorizedIV) {
	Value *Broadcasted = getBroadcastInstrs(ScalarIV);
	for (unsigned Part = 0; Part < UF; ++Part) {
	Value *EntryPart =
	getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
	VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
	- recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
	if (Trunc)
	addMetadata(EntryPart, Trunc);
	+ else
	+ recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
	}
	}

	// If an induction variable is only used for counting loop iterations or
	// calculating addresses, it doesn't need to be widened. Create scalar steps
	// that can be used by instructions we will later scalarize. Note that the
	// addition of the scalar steps will not increase the number of instructions
	// in the loop in the common case prior to InstCombine. We will be trading
	// one vector extract for each scalar step.
	if (NeedsScalarIV)
	buildScalarSteps(ScalarIV, Step, EntryVal, ID);
	}

	// Returns \p Val combined lane-wise with (StartIdx + lane) * Step.
	//
	// \p Val      vector value to offset; its element type must be integer or
	//             floating point.
	// \p StartIdx index assigned to lane 0; lane i gets StartIdx + i.
	// \p Step     scalar step; must have the element type of \p Val.
	// \p BinOp    for floating-point inductions, FAdd or FSub; not used on the
	//             integer path, which always multiplies and adds.
	//
	// On the floating-point path, every instruction that is created gets the
	// 'fast' fast-math flags.
	Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
	Instruction::BinaryOps BinOp) {
	// Create and check the types.
	assert(Val->getType()->isVectorTy() && "Must be a vector");
	int VLen = Val->getType()->getVectorNumElements();

	Type *STy = Val->getType()->getScalarType();
	assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
	"Induction Step must be an integer or FP");
	assert(Step->getType() == STy && "Step has wrong type");

	SmallVector<Constant *, 8> Indices;

	if (STy->isIntegerTy()) {
	// Create a vector of consecutive numbers from zero to VF.
	for (int i = 0; i < VLen; ++i)
	Indices.push_back(ConstantInt::get(STy, StartIdx + i));

	// Add the consecutive indices to the vector value.
	Constant *Cv = ConstantVector::get(Indices);
	assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
	Step = Builder.CreateVectorSplat(VLen, Step);
	assert(Step->getType() == Val->getType() && "Invalid step vec");
	// FIXME: The newly created binary instructions should contain nsw/nuw flags,
	// which can be found from the original scalar operations.
	Step = Builder.CreateMul(Cv, Step);
	return Builder.CreateAdd(Val, Step, "induction");
	}

	// Floating point induction.
	assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
	"Binary Opcode should be specified for FP induction");
	// Create a vector of consecutive numbers from zero to VF.
	for (int i = 0; i < VLen; ++i)
	Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

	// Add the consecutive indices to the vector value.
	Constant *Cv = ConstantVector::get(Indices);

	Step = Builder.CreateVectorSplat(VLen, Step);

	// Floating point operations had to be 'fast' to enable the induction.
	FastMathFlags Flags;
	Flags.setFast();

	Value *MulOp = Builder.CreateFMul(Cv, Step);
	if (isa<Instruction>(MulOp))
	// Have to check, MulOp may be a constant
	cast<Instruction>(MulOp)->setFastMathFlags(Flags);

	Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
	if (isa<Instruction>(BOp))
	cast<Instruction>(BOp)->setFastMathFlags(Flags);
	return BOp;
	}

	void InnerLoopVectorizer::buildScalarSteps(Value ScalarIV, Value Step,
	Value *EntryVal,
	const InductionDescriptor &ID) {
	// We shouldn't have to build scalar steps if we aren't vectorizing.
	assert(VF > 1 && "VF should be greater than one");

	// Get the value type and ensure it and the step have the same integer type.
	Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
	assert(ScalarIVTy == Step->getType() &&
	"Val and Step should have the same type");

	// We build scalar steps for both integer and floating-point induction
	// variables. Here, we determine the kind of arithmetic we will perform.
	Instruction::BinaryOps AddOp;
	Instruction::BinaryOps MulOp;
	if (ScalarIVTy->isIntegerTy()) {
	AddOp = Instruction::Add;
	MulOp = Instruction::Mul;
	} else {
	AddOp = ID.getInductionOpcode();
	MulOp = Instruction::FMul;
	}

	// Determine the number of scalars we need to generate for each unroll
	// iteration. If EntryVal is uniform, we only need to generate the first
	// lane. Otherwise, we generate all VF values.
	unsigned Lanes =
	Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
	: VF;
	// Compute the scalar steps and save the results in VectorLoopValueMap.
	for (unsigned Part = 0; Part < UF; ++Part) {
	for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
	auto StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF Part + Lane);
	auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
	auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
	VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
	recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
	}
	}
	}

	// Returns the stride that getPtrStride reports for \p Ptr in TheLoop when it
	// is exactly 1 or -1, and 0 for every other result. The symbolic strides are
	// passed along when available; otherwise an empty map is used.
	int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
	const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
	ValueToValueMap();

	int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
	if (Stride == 1 || Stride == -1)
	return Stride;
	return 0;
	}

	// Forwards the uniformity query for \p V to LAI and returns its answer.
	bool LoopVectorizationLegality::isUniform(Value *V) {
	  const bool UniformPerLAI = LAI->isUniform(V);
	  return UniformPerLAI;
	}

	// Returns the vector value that represents the original-loop value \p V in
	// unroll part \p Part, creating and caching it in VectorLoopValueMap when it
	// does not exist yet.
	//
	// Lookup order:
	//  1. an existing vector entry for (V, Part);
	//  2. scalar entries for V: copied over when VF == 1, broadcast from lane 0
	//     when V is uniform after vectorization, otherwise packed lane by lane
	//     with insertelement, emitted right after the last scalar definition;
	//  3. anything else is treated as constant/loop-invariant and broadcast.
	//
	// A value for which Legal->hasStride() holds is replaced by the constant 1
	// before the lookup.
	Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
	assert(V != Induction && "The new induction variable should not be used.");
	assert(!V->getType()->isVectorTy() && "Can't widen a vector");
	assert(!V->getType()->isVoidTy() && "Type does not produce a value");

	// If we have a stride that is replaced by one, do it here.
	if (Legal->hasStride(V))
	V = ConstantInt::get(V->getType(), 1);

	// If we have a vector mapped to this value, return it.
	if (VectorLoopValueMap.hasVectorValue(V, Part))
	return VectorLoopValueMap.getVectorValue(V, Part);

	// If the value has not been vectorized, check if it has been scalarized
	// instead. If it has been scalarized, and we actually need the value in
	// vector form, we will construct the vector values on demand.
	if (VectorLoopValueMap.hasAnyScalarValue(V)) {
	Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

	// If we've scalarized a value, that value should be an instruction.
	auto *I = cast<Instruction>(V);

	// If we aren't vectorizing, we can just copy the scalar map values over to
	// the vector map.
	if (VF == 1) {
	VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
	return ScalarValue;
	}

	// Get the last scalar instruction we generated for V and Part. If the value
	// is known to be uniform after vectorization, this corresponds to lane zero
	// of the Part unroll iteration. Otherwise, the last instruction is the one
	// we created for the last vector lane of the Part unroll iteration.
	unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
	auto *LastInst = cast<Instruction>(
	VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

	// Set the insert point after the last scalarized instruction. This ensures
	// the insertelement sequence will directly follow the scalar definitions.
	auto OldIP = Builder.saveIP();
	auto NewIP = std::next(BasicBlock::iterator(LastInst));
	Builder.SetInsertPoint(&*NewIP);

	// However, if we are vectorizing, we need to construct the vector values.
	// If the value is known to be uniform after vectorization, we can just
	// broadcast the scalar value corresponding to lane zero for each unroll
	// iteration. Otherwise, we construct the vector values using insertelement
	// instructions. Since the resulting vectors are stored in
	// VectorLoopValueMap, we will only generate the insertelements once.
	Value *VectorValue = nullptr;
	if (Cost->isUniformAfterVectorization(I, VF)) {
	VectorValue = getBroadcastInstrs(ScalarValue);
	VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
	} else {
	// Initialize packing with insertelements to start from undef.
	Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
	VectorLoopValueMap.setVectorValue(V, Part, Undef);
	for (unsigned Lane = 0; Lane < VF; ++Lane)
	packScalarIntoVectorValue(V, {Part, Lane});
	VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
	}
	Builder.restoreIP(OldIP);
	return VectorValue;
	}

	// If this scalar is unknown, assume that it is a constant or that it is
	// loop invariant. Broadcast V and save the value for future uses.
	Value *B = getBroadcastInstrs(V);
	VectorLoopValueMap.setVectorValue(V, Part, B);
	return B;
	}

	// Returns the scalar that represents the original-loop value \p V for the
	// unroll part and lane given by \p Instance. Values that are invariant in
	// the original loop are returned unchanged; a value that only exists in
	// vector form gets an extractelement for the requested lane.
	Value *
	InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
	                                            const VPIteration &Instance) {
	  // Loop-invariant values are used as they are.
	  if (OrigLoop->isLoopInvariant(V))
	    return V;

	  assert(Instance.Lane > 0
	             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
	             : true && "Uniform values only have lane zero");

	  // A scalar already recorded for exactly this part and lane takes priority.
	  const bool HasRecordedScalar = VectorLoopValueMap.hasScalarValue(V, Instance);
	  if (HasRecordedScalar)
	    return VectorLoopValueMap.getScalarValue(V, Instance);

	  // Otherwise use the entry recorded (or created on demand) for the whole
	  // unroll part.
	  Value *PartValue = getOrCreateVectorValue(V, Instance.Part);
	  if (PartValue->getType()->isVectorTy()) {
	    // The value was vectorized: extract the requested lane.
	    return Builder.CreateExtractElement(PartValue,
	                                        Builder.getInt32(Instance.Lane));
	  }

	  // A non-vector entry is only expected when the vectorization factor is one,
	  // in which case no extractelement is needed.
	  assert(VF == 1 && "Value not scalarized has non-vector type");
	  return PartValue;
	}

	// Inserts the scalar recorded for \p V at \p Instance into lane
	// Instance.Lane of the vector currently recorded for (V, Instance.Part), and
	// replaces that vector entry with the result.
	void InnerLoopVectorizer::packScalarIntoVectorValue(
	    Value *V, const VPIteration &Instance) {
	  assert(V != Induction && "The new induction variable should not be used.");
	  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
	  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

	  Value *LaneScalar = VectorLoopValueMap.getScalarValue(V, Instance);
	  Value *PartialVec = VectorLoopValueMap.getVectorValue(V, Instance.Part);
	  Value *PackedVec = Builder.CreateInsertElement(
	      PartialVec, LaneScalar, Builder.getInt32(Instance.Lane));
	  // resetVectorValue, not setVectorValue: the entry for this part already
	  // exists and is being overwritten.
	  VectorLoopValueMap.resetVectorValue(V, Instance.Part, PackedVec);
	}

	// Returns a shuffle of \p Vec, named "reverse", whose mask is
	// <VF-1, VF-2, ..., 0>, i.e. the first VF lanes of \p Vec in reverse order.
	// \p Vec must have vector type.
	Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
	assert(Vec->getType()->isVectorTy() && "Invalid type");
	SmallVector<Constant *, 8> ShuffleMask;
	for (unsigned i = 0; i < VF; ++i)
	ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

	return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
	ConstantVector::get(ShuffleMask),
	"reverse");
	}

	// Try to vectorize the interleave group that \p Instr belongs to.
	//
	// E.g. Translate following interleaved load group (factor = 3):
	// for (i = 0; i < N; i+=3) {
	// R = Pic[i]; // Member of index 0
	// G = Pic[i+1]; // Member of index 1
	// B = Pic[i+2]; // Member of index 2
	// ... // do something to R, G, B
	// }
	// To:
	// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
	// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
	// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
	// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
	//
	// Or translate following interleaved store group (factor = 3):
	// for (i = 0; i < N; i+=3) {
	// ... do something to R, G, B
	// Pic[i] = R; // Member of index 0
	// Pic[i+1] = G; // Member of index 1
	// Pic[i+2] = B; // Member of index 2
	// }
	// To:
	// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
	// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
	// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
	// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
	// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
	//
	// Nothing is emitted unless \p Instr is the group's insert position. For
	// loads, each member's strided vector is recorded in VectorLoopValueMap per
	// unroll part; for stores, one wide store is emitted per unroll part.
	void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
	const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
	assert(Group && "Fail to get an interleaved access group.");

	// Skip if current instruction is not the insert position.
	if (Instr != Group->getInsertPos())
	return;

	const DataLayout &DL = Instr->getModule()->getDataLayout();
	Value *Ptr = getPointerOperand(Instr);

	// Prepare for the vector type of the interleaved load/store.
	Type *ScalarTy = getMemInstValueType(Instr);
	unsigned InterleaveFactor = Group->getFactor();
	Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
	Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));

	// Prepare for the new pointers.
	setDebugLocFromInst(Builder, Ptr);
	SmallVector<Value *, 2> NewPtrs;
	unsigned Index = Group->getIndex(Instr);

	// If the group is reverse, adjust the index to refer to the last vector lane
	// instead of the first. We adjust the index from the first vector lane,
	// rather than directly getting the pointer for lane VF - 1, because the
	// pointer operand of the interleaved access is supposed to be uniform. For
	// uniform instructions, we're only required to generate a value for the
	// first vector lane in each unroll iteration.
	if (Group->isReverse())
	Index += (VF - 1) * Group->getFactor();

	for (unsigned Part = 0; Part < UF; Part++) {
	Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

	// Notice current instruction could be any index. Need to adjust the address
	// to the member of index 0.
	//
	// E.g. a = A[i+1]; // Member of index 1 (Current instruction)
	// b = A[i]; // Member of index 0
	// Current pointer is pointed to A[i+1], adjust it to A[i].
	//
	// E.g. A[i+1] = a; // Member of index 1
	// A[i] = b; // Member of index 0
	// A[i+2] = c; // Member of index 2 (Current instruction)
	// Current pointer is pointed to A[i+2], adjust it to A[i].
	NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));

	// Cast to the vector pointer type.
	NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
	}

	setDebugLocFromInst(Builder, Instr);
	Value *UndefVec = UndefValue::get(VecTy);

	// Vectorize the interleaved load group.
	if (isa<LoadInst>(Instr)) {
	// For each unroll part, create a wide load for the group.
	SmallVector<Value *, 2> NewLoads;
	for (unsigned Part = 0; Part < UF; Part++) {
	auto *NewLoad = Builder.CreateAlignedLoad(
	NewPtrs[Part], Group->getAlignment(), "wide.vec");
	Group->addMetadata(NewLoad);
	NewLoads.push_back(NewLoad);
	}

	// For each member in the group, shuffle out the appropriate data from the
	// wide loads.
	for (unsigned I = 0; I < InterleaveFactor; ++I) {
	Instruction *Member = Group->getMember(I);

	// Skip the gaps in the group.
	if (!Member)
	continue;

	Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
	for (unsigned Part = 0; Part < UF; Part++) {
	Value *StridedVec = Builder.CreateShuffleVector(
	NewLoads[Part], UndefVec, StrideMask, "strided.vec");

	// If this member has different type, cast the result type.
	if (Member->getType() != ScalarTy) {
	VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
	StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
	}

	if (Group->isReverse())
	StridedVec = reverseVector(StridedVec);

	VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
	}
	}
	return;
	}

	// The sub vector type for current instruction.
	VectorType *SubVT = VectorType::get(ScalarTy, VF);

	// Vectorize the interleaved store group.
	for (unsigned Part = 0; Part < UF; Part++) {
	// Collect the stored vector from each member.
	SmallVector<Value *, 4> StoredVecs;
	for (unsigned i = 0; i < InterleaveFactor; i++) {
	// Interleaved store group doesn't allow a gap, so each index has a member
	Instruction *Member = Group->getMember(i);
	assert(Member && "Fail to get a member from an interleaved store group");

	Value *StoredVec = getOrCreateVectorValue(
	cast<StoreInst>(Member)->getValueOperand(), Part);
	if (Group->isReverse())
	StoredVec = reverseVector(StoredVec);

	// If this member has different type, cast it to a unified type.

	if (StoredVec->getType() != SubVT)
	StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

	StoredVecs.push_back(StoredVec);
	}

	// Concatenate all vectors into a wide vector.
	Value *WideVec = concatenateVectors(Builder, StoredVecs);

	// Interleave the elements in the wide vector.
	Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
	Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
	"interleaved.vec");

	Instruction *NewStoreInstr =
	Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());

	Group->addMetadata(NewStoreInstr);
	}
	}

	// Emits the widened form of the load or store \p Instr according to the cost
	// model's widening decision for the current VF.
	//
	// \p Instr        a LoadInst or StoreInst of the original loop.
	// \p BlockInMask  when non-null, per-part masks; masked load/store (or the
	//                 mask operand of gather/scatter) is used in that case.
	//
	// CM_Interleave is forwarded to vectorizeInterleaveGroup. CM_Widen and
	// CM_Widen_Reverse produce one wide (optionally masked) access per unroll
	// part; CM_GatherScatter produces masked gather/scatter on a vector of
	// pointers. Loaded values are recorded in VectorLoopValueMap; stores record
	// nothing.
	void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
	VectorParts *BlockInMask) {
	// Attempt to issue a wide load.
	LoadInst *LI = dyn_cast<LoadInst>(Instr);
	StoreInst *SI = dyn_cast<StoreInst>(Instr);

	assert((LI || SI) && "Invalid Load/Store instruction");

	LoopVectorizationCostModel::InstWidening Decision =
	Cost->getWideningDecision(Instr, VF);
	assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
	"CM decision should be taken at this point");
	if (Decision == LoopVectorizationCostModel::CM_Interleave)
	return vectorizeInterleaveGroup(Instr);

	Type *ScalarDataTy = getMemInstValueType(Instr);
	Type *DataTy = VectorType::get(ScalarDataTy, VF);
	Value *Ptr = getPointerOperand(Instr);
	unsigned Alignment = getMemInstAlignment(Instr);
	// An alignment of 0 means target abi alignment. We need to use the scalar's
	// target abi alignment in such a case.
	const DataLayout &DL = Instr->getModule()->getDataLayout();
	if (!Alignment)
	Alignment = DL.getABITypeAlignment(ScalarDataTy);
	unsigned AddressSpace = getMemInstAddressSpace(Instr);

	// Determine if the pointer operand of the access is either consecutive or
	// reverse consecutive.
	bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
	bool ConsecutiveStride =
	Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
	bool CreateGatherScatter =
	(Decision == LoopVectorizationCostModel::CM_GatherScatter);

	// Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
	// gather/scatter. Otherwise Decision should have been to Scalarize.
	assert((ConsecutiveStride || CreateGatherScatter) &&
	"The instruction should be scalarized");

	// Handle consecutive loads/stores.
	if (ConsecutiveStride)
	Ptr = getOrCreateScalarValue(Ptr, {0, 0});

	VectorParts Mask;
	bool isMaskRequired = BlockInMask;
	if (isMaskRequired)
	Mask = *BlockInMask;

	// Handle Stores:
	if (SI) {
	assert(!Legal->isUniform(SI->getPointerOperand()) &&
	"We do not allow storing to uniform addresses");
	setDebugLocFromInst(Builder, SI);

	for (unsigned Part = 0; Part < UF; ++Part) {
	Instruction *NewSI = nullptr;
	Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
	if (CreateGatherScatter) {
	Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
	Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
	NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
	MaskPart);
	} else {
	// Calculate the pointer for the specific unroll-part.
	Value *PartPtr =
	Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

	if (Reverse) {
	// If we store to reverse consecutive memory locations, then we need
	// to reverse the order of elements in the stored value.
	StoredVal = reverseVector(StoredVal);
	// We don't want to update the value in the map as it might be used in
	// another expression. So don't call resetVectorValue(StoredVal).

	// If the address is consecutive but reversed, then the
	// wide store needs to start at the last vector element.
	PartPtr =
	Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
	PartPtr =
	Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
	if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
	Mask[Part] = reverseVector(Mask[Part]);
	}

	Value *VecPtr =
	Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));

	if (isMaskRequired)
	NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
	Mask[Part]);
	else
	NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
	}
	addMetadata(NewSI, SI);
	}
	return;
	}

	// Handle loads.
	assert(LI && "Must have a load instruction");
	setDebugLocFromInst(Builder, LI);
	for (unsigned Part = 0; Part < UF; ++Part) {
	Value *NewLI;
	if (CreateGatherScatter) {
	Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
	Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
	NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
	nullptr, "wide.masked.gather");
	addMetadata(NewLI, LI);
	} else {
	// Calculate the pointer for the specific unroll-part.
	Value *PartPtr =
	Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

	if (Reverse) {
	// If the address is consecutive but reversed, then the
	// wide load needs to start at the last vector element.
	PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
	PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
	if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
	Mask[Part] = reverseVector(Mask[Part]);
	}

	Value *VecPtr =
	Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
	if (isMaskRequired)
	NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
	UndefValue::get(DataTy),
	"wide.masked.load");
	else
	NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");

	// Add metadata to the load, but setVectorValue to the reverse shuffle.
	addMetadata(NewLI, LI);
	if (Reverse)
	NewLI = reverseVector(NewLI);
	}
	VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
	}
	}

	// Emits one scalar clone of \p Instr for the unroll part / lane given by
	// \p Instance: operands are replaced by their scalar equivalents in the new
	// loop, the clone is inserted at the builder's position and recorded in
	// VectorLoopValueMap. When \p IfPredicateInstr is set, the clone is also
	// appended to PredicatedInstructions.
	void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
	                                               const VPIteration &Instance,
	                                               bool IfPredicateInstr) {
	  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

	  setDebugLocFromInst(Builder, Instr);

	  // Only instructions that produce a value get a name for their clone.
	  const bool ProducesValue = !Instr->getType()->isVoidTy();

	  Instruction *Cloned = Instr->clone();
	  if (ProducesValue)
	    Cloned->setName(Instr->getName() + ".cloned");

	  // Rewire every operand of the clone to the scalar that stands for it in
	  // the new loop.
	  const unsigned NumOperands = Instr->getNumOperands();
	  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
	    Value *ScalarOp = getOrCreateScalarValue(Instr->getOperand(OpIdx), Instance);
	    Cloned->setOperand(OpIdx, ScalarOp);
	  }
	  addNewMetadata(Cloned, Instr);

	  // Insert the clone into the new loop, then publish it in the scalar map.
	  Builder.Insert(Cloned);
	  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

	  // A freshly cloned llvm.assume has to be registered with the assumption
	  // cache.
	  auto *AsIntrinsic = dyn_cast<IntrinsicInst>(Cloned);
	  if (AsIntrinsic && AsIntrinsic->getIntrinsicID() == Intrinsic::assume)
	    AC->registerAssumption(AsIntrinsic);

	  // Remember clones that need predication.
	  if (IfPredicateInstr)
	    PredicatedInstructions.push_back(Cloned);
	}

	// Creates the induction phi "index" in the header of \p L, its increment
	// "index.next" in the latch, and a new latch terminator that leaves the loop
	// when index.next == \p End. Returns the phi.
	//
	// \p Start  incoming value from the loop preheader.
	// \p End    value the incremented index is compared against.
	// \p Step   amount added on every iteration.
	// \p DL     not read by this function; the debug location is derived from
	//           OldInduction instead.
	//
	// The latch's previous terminator is erased once the new conditional branch
	// has been inserted before it.
	PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
	Value *End, Value *Step,
	Instruction *DL) {
	BasicBlock *Header = L->getHeader();
	BasicBlock *Latch = L->getLoopLatch();
	// As we're just creating this loop, it's possible no latch exists
	// yet. If so, use the header as this will be a single block loop.
	if (!Latch)
	Latch = Header;

	IRBuilder<> Builder(&*Header->getFirstInsertionPt());
	Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
	setDebugLocFromInst(Builder, OldInst);
	auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

	Builder.SetInsertPoint(Latch->getTerminator());
	setDebugLocFromInst(Builder, OldInst);

	// Create i+1 and fill the PHINode.
	Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
	Induction->addIncoming(Start, L->getLoopPreheader());
	Induction->addIncoming(Next, Latch);
	// Create the compare.
	Value *ICmp = Builder.CreateICmpEQ(Next, End);
	Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

	// Now we have two terminators. Remove the old one from the block.
	Latch->getTerminator()->eraseFromParent();

	return Induction;
	}

	// Returns the trip count of the loop as an IR value, computing it on first
	// use and caching it in TripCount.
	//
	// The value is the backedge-taken count, brought to the widest induction
	// type, plus one. It is expanded in front of the terminator of \p L's
	// preheader; a pointer-typed result is cast to the index type there as well.
	Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
	if (TripCount)
	return TripCount;

	IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
	// Find the loop boundaries.
	ScalarEvolution *SE = PSE.getSE();
	const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
	assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
	"Invalid loop count");

	Type *IdxTy = Legal->getWidestInductionType();

	// The exit count might have the type of i64 while the phi is i32. This can
	// happen if we have an induction variable that is sign extended before the
	// compare. The only way that we get a backedge taken count is that the
	// induction variable was signed and as such will not overflow. In such a case
	// truncation is legal.
	if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
	IdxTy->getPrimitiveSizeInBits())
	BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
	BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

	// Get the total trip count from the count by adding 1.
	const SCEV *ExitCount = SE->getAddExpr(
	BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

	const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

	// Expand the trip count and place the new instructions in the preheader.
	// Notice that the pre-header does not change, only the loop body.
	SCEVExpander Exp(*SE, DL, "induction");

	// Count holds the overall loop count (N).
	TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
	L->getLoopPreheader()->getTerminator());

	if (TripCount->getType()->isPointerTy())
	TripCount =
	CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
	L->getLoopPreheader()->getTerminator());

	return TripCount;
	}

	/// Return the number of original iterations covered by the vector loop,
	/// materializing it in the preheader of \p L on first use.
	///
	/// The result ("n.vec") is cached in the VectorTripCount member. It equals
	/// the trip count minus its remainder modulo VF * UF; when a scalar epilogue
	/// is required and the remainder would be zero, a full step is subtracted
	/// instead so that at least one scalar iteration remains.
	Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
	  if (VectorTripCount)
	    return VectorTripCount;

	  Value *TC = getOrCreateTripCount(L);
	  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

	  // Now we need to generate the expression for the part of the loop that the
	  // vectorized body will execute. This is equal to N - (N % Step) if scalar
	  // iterations are not required for correctness, or N - Step, otherwise. Step
	  // is equal to the vectorization factor (number of SIMD elements) times the
	  // unroll factor (number of SIMD instructions).
	  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
	  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

	  // If there is a non-reversed interleaved group that may speculatively access
	  // memory out-of-bounds, we need to ensure that there will be at least one
	  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
	  // the trip count, we set the remainder to be equal to the step. If the step
	  // does not evenly divide the trip count, no adjustment is necessary since
	  // there will already be scalar iterations. Note that the minimum iterations
	  // check ensures that N >= Step.
	  if (VF > 1 && Legal->requiresScalarEpilogue()) {
	    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
	    R = Builder.CreateSelect(IsZero, Step, R);
	  }

	  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

	  return VectorTripCount;
	}

	/// Cast the vector value \p V to the vector type \p DstVTy, which must have
	/// the same number of elements and the same element size (per \p DL).
	///
	/// If the element types are directly bit-or-pointer castable a single cast is
	/// emitted. Otherwise the cast goes through an intermediate integer vector of
	/// the same element width (Ptr <-> Int <-> Float).
	///
	/// \returns the cast value, created with the member Builder.
	Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
	                                                   const DataLayout &DL) {
	  // Verify that V is a vector type with same number of elements as DstVTy.
	  unsigned VF = DstVTy->getNumElements();
	  VectorType *SrcVecTy = cast<VectorType>(V->getType());
	  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
	  Type *SrcElemTy = SrcVecTy->getElementType();
	  Type *DstElemTy = DstVTy->getElementType();
	  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
	         "Vector elements must have same size");

	  // Do a direct cast if element types are castable.
	  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
	    return Builder.CreateBitOrPointerCast(V, DstVTy);
	  }
	  // V cannot be directly casted to desired vector type.
	  // May happen when V is a floating point vector but DstVTy is a vector of
	  // pointers or vice-versa. Handle this using a two-step bitcast using an
	  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
	  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
	         "Only one type should be a pointer type");
	  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
	         "Only one type should be a floating point type");
	  Type *IntTy =
	      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
	  VectorType *VecIntTy = VectorType::get(IntTy, VF);
	  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
	  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
	}

	/// Emit the "min.iters.check" guard in the preheader of \p L and branch to
	/// \p Bypass when the trip count is too small for the vector loop to run.
	///
	/// The preheader is split; the new "vector.ph" block is registered with the
	/// dominator tree (and with the parent loop, if any), and the old preheader
	/// is recorded as a loop bypass block.
	void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
	                                                         BasicBlock *Bypass) {
	  Value *TripCnt = getOrCreateTripCount(L);
	  BasicBlock *CheckBlock = L->getLoopPreheader();
	  IRBuilder<> Builder(CheckBlock->getTerminator());

	  // The vector trip count is zero when the trip count is below VF * UF, or
	  // equal to it if a scalar epilogue must run. The unsigned comparison also
	  // catches a trip count that wrapped to zero when one was added to the
	  // backedge-taken count; that case is sent to the scalar loop as well.
	  ICmpInst::Predicate Pred = ICmpInst::ICMP_ULT;
	  if (Legal->requiresScalarEpilogue())
	    Pred = ICmpInst::ICMP_ULE;
	  Constant *MinIters = ConstantInt::get(TripCnt->getType(), VF * UF);
	  Value *TooFewIters =
	      Builder.CreateICmp(Pred, TripCnt, MinIters, "min.iters.check");

	  BasicBlock *VectorPH =
	      CheckBlock->splitBasicBlock(CheckBlock->getTerminator(), "vector.ph");
	  // The dominator tree is updated right away: SCEV expansions that build
	  // later bypass checks may query it before this function is finished.
	  DT->addNewBlock(VectorPH, CheckBlock);
	  if (Loop *Parent = L->getParentLoop())
	    Parent->addBasicBlockToLoop(VectorPH, *LI);

	  BranchInst *GuardBr = BranchInst::Create(Bypass, VectorPH, TooFewIters);
	  ReplaceInstWithInst(CheckBlock->getTerminator(), GuardBr);
	  LoopBypassBlocks.push_back(CheckBlock);
	}

	void InnerLoopVectorizer::emitSCEVChecks(Loop L, BasicBlock Bypass) {
	BasicBlock *BB = L->getLoopPreheader();

	// Generate the code to check that the SCEV assumptions that we made.
	// We want the new basic block to start at the first instruction in a
	// sequence of instructions that form a check.
	SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
	"scev.check");
	Value *SCEVCheck =
	Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

	if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
	if (C->isZero())
	return;

	// Create a new block containing the stride check.
	BB->setName("vector.scevcheck");
	auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
	// Update dominator tree immediately if the generated block is a
	// LoopBypassBlock because SCEV expansions to generate loop bypass
	// checks may query it before the current function is finished.
	DT->addNewBlock(NewBB, BB);
	if (L->getParentLoop())
	L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
	ReplaceInstWithInst(BB->getTerminator(),
	BranchInst::Create(Bypass, NewBB, SCEVCheck));
	LoopBypassBlocks.push_back(BB);
	AddedSafetyChecks = true;
	}

	void InnerLoopVectorizer::emitMemRuntimeChecks(Loop L, BasicBlock Bypass) {
	BasicBlock *BB = L->getLoopPreheader();

	// Generate the code that checks in runtime if arrays overlap. We put the
	// checks into a separate block to make the more common case of few elements
	// faster.
	Instruction *FirstCheckInst;
	Instruction *MemRuntimeCheck;
	std::tie(FirstCheckInst, MemRuntimeCheck) =
	Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
	if (!MemRuntimeCheck)
	return;

	// Create a new block containing the memory check.
	BB->setName("vector.memcheck");
	auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
	// Update dominator tree immediately if the generated block is a
	// LoopBypassBlock because SCEV expansions to generate loop bypass
	// checks may query it before the current function is finished.
	DT->addNewBlock(NewBB, BB);
	if (L->getParentLoop())
	L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
	ReplaceInstWithInst(BB->getTerminator(),
	BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
	LoopBypassBlocks.push_back(BB);
	AddedSafetyChecks = true;

	// We currently don't use LoopVersioning for the actual loop cloning but we
	// still use it to add the noalias metadata.
	LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
	PSE.getSE());
	LVer->prepareNoAliasMetadata();
	}

	/// Build the control-flow skeleton of the vectorized loop around OrigLoop.
	///
	/// Splits the original preheader into the vector body, middle block and
	/// scalar preheader, registers the new loop in LoopInfo, emits the bypass
	/// checks, creates the new induction variable, and wires up the resume
	/// values of the scalar loop's induction PHIs. The resulting blocks are
	/// saved in the Loop* members.
	///
	/// \returns the preheader of the new vector loop.
	BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
	  /*
	   In this function we generate a new loop. The new loop will contain
	   the vectorized instructions while the old loop will continue to run the
	   scalar remainder.

	       [ ] <-- loop iteration number check.
	    /   |
	   /    v
	  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
	  |  /  |
	  | /   v
	  ||   [ ]     <-- vector pre header.
	  |/    |
	  |     v
	  |    [  ] \
	  |    [  ]_|   <-- vector loop.
	  |     |
	  |     v
	  |   -[ ]   <--- middle-block.
	  |  /  |
	  | /   v
	  -|- >[ ]     <--- new preheader.
	   |    |
	   |    v
	   |   [ ] \
	   |   [ ]_|   <-- old scalar loop to handle remainder.
	    \   |
	     \  v
	      >[ ]     <-- exit block.
	   ...
	   */

	  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
	  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
	  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
	  assert(VectorPH && "Invalid loop structure");
	  assert(ExitBlock && "Must have an exit block");

	  // Some loops have a single integer induction variable, while other loops
	  // don't. One example is c++ iterators that often have multiple pointer
	  // induction variables. In the code below we also support a case where we
	  // don't have a single induction variable.
	  //
	  // We try to obtain an induction variable from the original loop as hard
	  // as possible. However if we don't find one that:
	  //   - is an integer
	  //   - counts from zero, stepping by one
	  //   - is the size of the widest induction variable type
	  // then we create a new one.
	  OldInduction = Legal->getPrimaryInduction();
	  Type *IdxTy = Legal->getWidestInductionType();

	  // Split the single block loop into the two loop structure described above.
	  BasicBlock *VecBody =
	      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
	  BasicBlock *MiddleBlock =
	      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
	  BasicBlock *ScalarPH =
	      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

	  // Create and register the new vector loop.
	  Loop *Lp = LI->AllocateLoop();
	  Loop *ParentLoop = OrigLoop->getParentLoop();

	  // Insert the new loop into the loop nest and register the new basic blocks
	  // before calling any utilities such as SCEV that require valid LoopInfo.
	  if (ParentLoop) {
	    ParentLoop->addChildLoop(Lp);
	    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
	    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
	  } else {
	    LI->addTopLevelLoop(Lp);
	  }
	  Lp->addBasicBlockToLoop(VecBody, *LI);

	  // Find the loop boundaries.
	  Value *Count = getOrCreateTripCount(Lp);

	  Value *StartIdx = ConstantInt::get(IdxTy, 0);

	  // Now, compare the new count to zero. If it is zero skip the vector loop and
	  // jump to the scalar loop. This check also covers the case where the
	  // backedge-taken count is uint##_max: adding one to it will overflow leading
	  // to an incorrect trip count of zero. In this (rare) case we will also jump
	  // to the scalar loop.
	  emitMinimumIterationCountCheck(Lp, ScalarPH);

	  // Generate the code to check any assumptions that we've made for SCEV
	  // expressions.
	  emitSCEVChecks(Lp, ScalarPH);

	  // Generate the code that checks in runtime if arrays overlap. We put the
	  // checks into a separate block to make the more common case of few elements
	  // faster.
	  emitMemRuntimeChecks(Lp, ScalarPH);

	  // Generate the induction variable.
	  // The loop step is equal to the vectorization factor (num of SIMD elements)
	  // times the unroll factor (num of SIMD instructions).
	  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
	  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
	  Induction =
	      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
	                              getDebugLocFromInstOrOperands(OldInduction));

	  // We are going to resume the execution of the scalar loop.
	  // Go over all of the induction variables that we found and fix the
	  // PHIs that are left in the scalar version of the loop.
	  // The starting values of PHI nodes depend on the counter of the last
	  // iteration in the vectorized loop.
	  // If we come from a bypass edge then we need to start from the original
	  // start value.

	  // For every induction, the end value reached by the vector loop is recorded
	  // in IVEndValues and becomes the resume value of the scalar loop when it is
	  // entered from the middle block.
	  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
	  for (auto &InductionEntry : *List) {
	    PHINode *OrigPhi = InductionEntry.first;
	    InductionDescriptor II = InductionEntry.second;

	    // Create phi nodes to merge from the backedge-taken check block.
	    PHINode *BCResumeVal = PHINode::Create(
	        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
	    Value *&EndValue = IVEndValues[OrigPhi];
	    if (OrigPhi == OldInduction) {
	      // We know what the end value is.
	      EndValue = CountRoundDown;
	    } else {
	      // Derive the end value from the vector trip count, cast to the type of
	      // the induction step, via the induction descriptor.
	      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
	      Type *StepType = II.getStep()->getType();
	      Instruction::CastOps CastOp =
	          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
	      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
	      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
	      EndValue = II.transform(B, CRD, PSE.getSE(), DL);
	      EndValue->setName("ind.end");
	    }

	    // The new PHI merges the original incoming value, in case of a bypass,
	    // or the value at the end of the vectorized loop.
	    BCResumeVal->addIncoming(EndValue, MiddleBlock);

	    // Fix the scalar body counter (PHI node).
	    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

	    // Coming from any bypass block, the scalar loop starts from the original
	    // start value of the induction.
	    for (BasicBlock *BB : LoopBypassBlocks)
	      BCResumeVal->addIncoming(II.getStartValue(), BB);
	    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
	  }

	  // Add a check in the middle block to see if we have completed
	  // all of the iterations in the first vector loop.
	  // If (N - N%VF) == N, then we don't need to run the remainder.
	  Value *CmpN =
	      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
	                      CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
	  ReplaceInstWithInst(MiddleBlock->getTerminator(),
	                      BranchInst::Create(ExitBlock, ScalarPH, CmpN));

	  // Get ready to start creating new instructions into the vectorized body.
	  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

	  // Save the state.
	  LoopVectorPreHeader = Lp->getLoopPreheader();
	  LoopScalarPreHeader = ScalarPH;
	  LoopMiddleBlock = MiddleBlock;
	  LoopExitBlock = ExitBlock;
	  LoopVectorBody = VecBody;
	  LoopScalarBody = OldBasicBlock;

	  // Keep all loop hints from the original loop on the vector loop (we'll
	  // replace the vectorizer-specific hints below).
	  if (MDNode *LID = OrigLoop->getLoopID())
	    Lp->setLoopID(LID);

	  LoopVectorizeHints Hints(Lp, true, *ORE);
	  Hints.setAlreadyVectorized();

	  return LoopVectorPreHeader;
	}

	// Fix up external users of the induction variable. At this point, we are
	// in LCSSA form, with all external PHIs that use the IV having one input value,
	// coming from the remainder loop. We need those PHIs to also have a correct
	// value for the IV when arriving directly from the middle block.
	void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
	const InductionDescriptor &II,
	Value CountRoundDown, Value EndValue,
	BasicBlock *MiddleBlock) {
	// There are two kinds of external IV usages - those that use the value
	// computed in the last iteration (the PHI) and those that use the penultimate
	// value (the value that feeds into the phi from the loop latch).
	// We allow both, but they, obviously, have different values.

	assert(OrigLoop->getExitBlock() && "Expected a single exit block");

	DenseMap<Value , Value > MissingVals;

	// An external user of the last iteration's value should see the value that
	// the remainder loop uses to initialize its own IV.
	Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
	for (User *U : PostInc->users()) {
	Instruction *UI = cast<Instruction>(U);
	if (!OrigLoop->contains(UI)) {
	assert(isa<PHINode>(UI) && "Expected LCSSA form");
	MissingVals[UI] = EndValue;
	}
	}

	// An external user of the penultimate value need to see EndValue - Step.
	// The simplest way to get this is to recompute it from the constituent SCEVs,
	// that is Start + (Step * (CRD - 1)).
	for (User *U : OrigPhi->users()) {
	auto *UI = cast<Instruction>(U);
	if (!OrigLoop->contains(UI)) {
	const DataLayout &DL =
	OrigLoop->getHeader()->getModule()->getDataLayout();
	assert(isa<PHINode>(UI) && "Expected LCSSA form");

	IRBuilder<> B(MiddleBlock->getTerminator());
	Value *CountMinusOne = B.CreateSub(
	CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
	Value *CMO =
	!II.getStep()->getType()->isIntegerTy()
	? B.CreateCast(Instruction::SIToFP, CountMinusOne,
	II.getStep()->getType())
	: B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
	CMO->setName("cast.cmo");
	Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
	Escape->setName("ind.escape");
	MissingVals[UI] = Escape;
	}
	}

	for (auto &I : MissingVals) {
	PHINode *PHI = cast<PHINode>(I.first);
	// One corner case we have to handle is two IVs "chasing" each-other,
	// that is %IV2 = phi [...], [ %IV1, %latch ]
	// In this case, if IV1 has an external use, we need to avoid adding both
	// "last value of IV1" and "penultimate value of IV2". So, verify that we
	// don't already have an incoming value for the middle block.
	if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
	PHI->addIncoming(I.second, MiddleBlock);
	}
	}

	namespace {

	struct CSEDenseMapInfo {
	static bool canHandle(const Instruction *I) {
	return isa<InsertElementInst>(I) \|\| isa<ExtractElementInst>(I) \|\|
	isa<ShuffleVectorInst>(I) \|\| isa<GetElementPtrInst>(I);
	}

	static inline Instruction *getEmptyKey() {
	return DenseMapInfo<Instruction *>::getEmptyKey();
	}

	static inline Instruction *getTombstoneKey() {
	return DenseMapInfo<Instruction *>::getTombstoneKey();
	}

	static unsigned getHashValue(const Instruction *I) {
	assert(canHandle(I) && "Unknown instruction!");
	return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
	I->value_op_end()));
	}

	static bool isEqual(const Instruction LHS, const Instruction RHS) {
	if (LHS == getEmptyKey() \|\| RHS == getEmptyKey() \|\|
	LHS == getTombstoneKey() \|\| RHS == getTombstoneKey())
	return LHS == RHS;
	return LHS->isIdenticalTo(RHS);
	}
	};

	} // end anonymous namespace

	/// \brief Perform cse of induction variable instructions.
	///
	/// Walks \p BB once in order. Each instruction accepted by
	/// CSEDenseMapInfo::canHandle() that is identical to an earlier one in the
	/// block has its uses redirected to the earlier instruction and is erased.
	static void cse(BasicBlock *BB) {
	  // Perform simple cse.
	  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
	  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
	    // Advance the iterator before a possible erase so it stays valid.
	    Instruction *In = &*I++;

	    if (!CSEDenseMapInfo::canHandle(In))
	      continue;

	    // Check if we can replace this instruction with any of the
	    // visited instructions.
	    if (Instruction *V = CSEMap.lookup(In)) {
	      In->replaceAllUsesWith(V);
	      In->eraseFromParent();
	      continue;
	    }

	    CSEMap[In] = In;
	  }
	}

	/// \brief Estimate the overhead of scalarizing an instruction. This is a
	/// convenience wrapper for the type-based getScalarizationOverhead API.
	///
	/// \returns 0 when \p VF is 1. Otherwise the sum of the result-insertion
	/// cost (skipped for void results, and for loads when the target supports
	/// efficient vector element load/store) and the operand-extraction cost
	/// (call arguments for calls; all operands otherwise, skipped for stores
	/// under that same target condition).
	static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
	                                         const TargetTransformInfo &TTI) {
	  if (VF == 1)
	    return 0;

	  unsigned Cost = 0;
	  Type *RetTy = ToVectorTy(I->getType(), VF);
	  if (!RetTy->isVoidTy() &&
	      (!isa<LoadInst>(I) ||
	       !TTI.supportsEfficientVectorElementLoadStore()))
	    Cost += TTI.getScalarizationOverhead(RetTy, true, false);

	  if (CallInst *CI = dyn_cast<CallInst>(I)) {
	    SmallVector<const Value *, 4> Operands(CI->arg_operands());
	    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
	  }
	  else if (!isa<StoreInst>(I) ||
	           !TTI.supportsEfficientVectorElementLoadStore()) {
	    SmallVector<const Value *, 4> Operands(I->operand_values());
	    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
	  }

	  return Cost;
	}

	// Estimate cost of a call instruction CI if it were vectorized with factor VF.
	// Return the cost of the instruction, including scalarization overhead if it's
	// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
	// i.e. either vector version isn't available, or is too expensive.
	//
	// Note: NeedToScalarize is not written when VF == 1; the scalar call cost is
	// returned before the flag is assigned.
	static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
	                                  const TargetTransformInfo &TTI,
	                                  const TargetLibraryInfo *TLI,
	                                  bool &NeedToScalarize) {
	  Function *F = CI->getCalledFunction();
	  StringRef FnName = CI->getCalledFunction()->getName();
	  Type *ScalarRetTy = CI->getType();
	  SmallVector<Type *, 4> Tys, ScalarTys;
	  for (auto &ArgOp : CI->arg_operands())
	    ScalarTys.push_back(ArgOp->getType());

	  // Estimate cost of scalarized vector call. The source operands are assumed
	  // to be vectors, so we need to extract individual elements from there,
	  // execute VF scalar calls, and then gather the result into the vector return
	  // value.
	  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
	  if (VF == 1)
	    return ScalarCallCost;

	  // Compute corresponding vector type for return value and arguments.
	  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
	  for (Type *ScalarTy : ScalarTys)
	    Tys.push_back(ToVectorTy(ScalarTy, VF));

	  // Compute costs of unpacking argument values for the scalar calls and
	  // packing the return values to a vector.
	  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);

	  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

	  // If we can't emit a vector call for this function, then the currently found
	  // cost is the cost we need to return.
	  NeedToScalarize = true;
	  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
	    return Cost;

	  // If the corresponding vector cost is cheaper, return its cost.
	  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
	  if (VectorCallCost < Cost) {
	    NeedToScalarize = false;
	    return VectorCallCost;
	  }
	  return Cost;
	}

	// Cost of the call CI when it is emitted as a vector intrinsic with
	// vectorization factor VF. The intrinsic is looked up from CI and TLI; any
	// fast-math flags on the call are forwarded to the target cost query, which
	// also accounts for scalarization overhead where needed.
	static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
	                                       const TargetTransformInfo &TTI,
	                                       const TargetLibraryInfo *TLI) {
	  const Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
	  assert(IID && "Expected intrinsic call!");

	  SmallVector<Value *, 4> Args(CI->arg_operands());

	  // Default-constructed flags are used when CI is not an FP math operator.
	  auto *FPOp = dyn_cast<FPMathOperator>(CI);
	  FastMathFlags Flags = FPOp ? FPOp->getFastMathFlags() : FastMathFlags();

	  return TTI.getIntrinsicInstrCost(IID, CI->getType(), Args, Flags, VF);
	}

	/// Return whichever of the two integer vector types has the narrower element
	/// type. \p T2 is returned when the element widths are equal.
	static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
	  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
	  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
	  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
	}
	/// Return whichever of the two integer vector types has the wider element
	/// type. \p T2 is returned when the element widths are equal.
	static Type *largestIntegerVectorType(Type *T1, Type *T2) {
	  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
	  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
	  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
	}

	void InnerLoopVectorizer::truncateToMinimalBitwidths() {
	// For every instruction `I` in MinBWs, truncate the operands, create a
	// truncated version of `I` and reextend its result. InstCombine runs
	// later and will remove any ext/trunc pairs.
	SmallPtrSet<Value *, 4> Erased;
	for (const auto &KV : Cost->getMinimalBitwidths()) {
	// If the value wasn't vectorized, we must maintain the original scalar
	// type. The absence of the value from VectorLoopValueMap indicates that it
	// wasn't vectorized.
	if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
	continue;
	for (unsigned Part = 0; Part < UF; ++Part) {
	Value *I = getOrCreateVectorValue(KV.first, Part);
	if (Erased.count(I) \|\| I->use_empty() \|\| !isa<Instruction>(I))
	continue;
	Type *OriginalTy = I->getType();
	Type *ScalarTruncatedTy =
	IntegerType::get(OriginalTy->getContext(), KV.second);
	Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
	OriginalTy->getVectorNumElements());
	if (TruncatedTy == OriginalTy)
	continue;

	IRBuilder<> B(cast<Instruction>(I));
	auto ShrinkOperand = [&](Value V) -> Value {
	if (auto *ZI = dyn_cast<ZExtInst>(V))
	if (ZI->getSrcTy() == TruncatedTy)
	return ZI->getOperand(0);
	return B.CreateZExtOrTrunc(V, TruncatedTy);
	};

	// The actual instruction modification depends on the instruction type,
	// unfortunately.
	Value *NewI = nullptr;
	if (auto *BO = dyn_cast<BinaryOperator>(I)) {
	NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
	ShrinkOperand(BO->getOperand(1)));

	// Any wrapping introduced by shrinking this operation shouldn't be
	// considered undefined behavior. So, we can't unconditionally copy
	// arithmetic wrapping flags to NewI.
	cast<BinaryOperator>(NewI)->copyIRFlags(I, /IncludeWrapFlags=/false);
	} else if (auto *CI = dyn_cast<ICmpInst>(I)) {
	NewI =
	B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
	ShrinkOperand(CI->getOperand(1)));
	} else if (auto *SI = dyn_cast<SelectInst>(I)) {
	NewI = B.CreateSelect(SI->getCondition(),
	ShrinkOperand(SI->getTrueValue()),
	ShrinkOperand(SI->getFalseValue()));
	} else if (auto *CI = dyn_cast<CastInst>(I)) {
	switch (CI->getOpcode()) {
	default:
	llvm_unreachable("Unhandled cast!");
	case Instruction::Trunc:
	NewI = ShrinkOperand(CI->getOperand(0));
	break;
	case Instruction::SExt:
	NewI = B.CreateSExtOrTrunc(
	CI->getOperand(0),
	smallestIntegerVectorType(OriginalTy, TruncatedTy));
	break;
	case Instruction::ZExt:
	NewI = B.CreateZExtOrTrunc(
	CI->getOperand(0),
	smallestIntegerVectorType(OriginalTy, TruncatedTy));
	break;
	}
	} else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
	auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
	auto *O0 = B.CreateZExtOrTrunc(
	SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
	auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
	auto *O1 = B.CreateZExtOrTrunc(
	SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

	NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
	} else if (isa<LoadInst>(I)) {
	// Don't do anything with the operands, just extend the result.
	continue;
	} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
	auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
	auto *O0 = B.CreateZExtOrTrunc(
	IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
	auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
	NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
	} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
	auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
	auto *O0 = B.CreateZExtOrTrunc(
	EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
	NewI = B.CreateExtractElement(O0, EE->getOperand(2));
	} else {
	llvm_unreachable("Unhandled instruction type!");
	}

	// Lastly, extend the result.
	NewI->takeName(cast<Instruction>(I));
	Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
	I->replaceAllUsesWith(Res);
	cast<Instruction>(I)->eraseFromParent();
	Erased.insert(I);
	VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
	}
	}

	// We'll have created a bunch of ZExts that are now parentless. Clean up.
	for (const auto &KV : Cost->getMinimalBitwidths()) {
	// If the value wasn't vectorized, we must maintain the original scalar
	// type. The absence of the value from VectorLoopValueMap indicates that it
	// wasn't vectorized.
	if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
	continue;
	for (unsigned Part = 0; Part < UF; ++Part) {
	Value *I = getOrCreateVectorValue(KV.first, Part);
	ZExtInst *Inst = dyn_cast<ZExtInst>(I);
	if (Inst && Inst->use_empty()) {
	Value *NewI = Inst->getOperand(0);
	Inst->eraseFromParent();
	VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
	}
	}
	}
	}

	void InnerLoopVectorizer::fixVectorizedLoop() {
	// Insert truncates and extends for any truncated instructions as hints to
	// InstCombine.
	if (VF > 1)
	truncateToMinimalBitwidths();

	// At this point every instruction in the original loop is widened to a
	// vector form. Now we need to fix the recurrences in the loop. These PHI
	// nodes are currently empty because we did not want to introduce cycles.
	// This is the second stage of vectorizing recurrences.
	fixCrossIterationPHIs();

	// Update the dominator tree.
	//
	// FIXME: After creating the structure of the new loop, the dominator tree is
	// no longer up-to-date, and it remains that way until we update it
	// here. An out-of-date dominator tree is problematic for SCEV,
	// because SCEVExpander uses it to guide code generation. The
	// vectorizer use SCEVExpanders in several places. Instead, we should
	// keep the dominator tree up-to-date as we go.
	updateAnalysis();

	// Fix-up external users of the induction variables.
	for (auto &Entry : *Legal->getInductionVars())
	fixupIVUsers(Entry.first, Entry.second,
	getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
	IVEndValues[Entry.first], LoopMiddleBlock);

	fixLCSSAPHIs();
	for (Instruction *PI : PredicatedInstructions)
	sinkScalarOperands(&*PI);

	// Remove redundant induction instructions.
	cse(LoopVectorBody);
	}

	void InnerLoopVectorizer::fixCrossIterationPHIs() {
	// In order to support recurrences we need to be able to vectorize Phi nodes.
	// Phi nodes have cycles, so we need to vectorize them in two stages. This is
	// stage #2: We now need to fix the recurrences by adding incoming edges to
	// the currently empty PHI nodes. At this point every instruction in the
	// original loop is widened to a vector form so we can use them to construct
	// the incoming edges.
	for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
	// Handle first-order recurrences and reductions that need to be fixed.
	if (Legal->isFirstOrderRecurrence(&Phi))
	fixFirstOrderRecurrence(&Phi);
	else if (Legal->isReductionVariable(&Phi))
	fixReduction(&Phi);
	}
	}

	void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
	  // This is the second phase of vectorizing first-order recurrences. An
	  // overview of the transformation is described below. Suppose we have the
	  // following loop.
	  //
	  //   for (int i = 0; i < n; ++i)
	  //     b[i] = a[i] - a[i - 1];
	  //
	  // There is a first-order recurrence on "a". For this loop, the shorthand
	  // scalar IR looks like:
	  //
	  //   scalar.ph:
	  //     s_init = a[-1]
	  //     br scalar.body
	  //
	  //   scalar.body:
	  //     i = phi [0, scalar.ph], [i+1, scalar.body]
	  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
	  //     s2 = a[i]
	  //     b[i] = s2 - s1
	  //     br cond, scalar.body, ...
	  //
	  // In this example, s1 is a recurrence because its value depends on the
	  // previous iteration. In the first phase of vectorization, we created a
	  // temporary value for s1. We now complete the vectorization and produce the
	  // shorthand vector IR shown below (for VF = 4, UF = 1).
	  //
	  //   vector.ph:
	  //     v_init = vector(..., ..., ..., a[-1])
	  //     br vector.body
	  //
	  //   vector.body
	  //     i = phi [0, vector.ph], [i+4, vector.body]
	  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
	  //     v2 = a[i, i+1, i+2, i+3];
	  //     v3 = vector(v1(3), v2(0, 1, 2))
	  //     b[i, i+1, i+2, i+3] = v2 - v3
	  //     br cond, vector.body, middle.block
	  //
	  //   middle.block:
	  //     x = v2(3)
	  //     br scalar.ph
	  //
	  //   scalar.ph:
	  //     s_init = phi [x, middle.block], [a[-1], otherwise]
	  //     br scalar.body
	  //
	  // After execution completes the vector loop, we extract the next value of
	  // the recurrence (x) to use as the initial value in the scalar loop.

	  // Get the original loop preheader and single loop latch.
	  auto *Preheader = OrigLoop->getLoopPreheader();
	  auto *Latch = OrigLoop->getLoopLatch();

	  // Get the initial and previous values of the scalar recurrence.
	  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
	  auto *Previous = Phi->getIncomingValueForBlock(Latch);

	  // Create a vector from the initial value. The scalar is placed in the last
	  // lane (VF - 1); the remaining lanes are left undefined.
	  auto *VectorInit = ScalarInit;
	  if (VF > 1) {
	    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
	    VectorInit = Builder.CreateInsertElement(
	        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
	        Builder.getInt32(VF - 1), "vector.recur.init");
	  }

	  // We constructed a temporary phi node in the first phase of vectorization.
	  // This phi node will eventually be deleted.
	  Builder.SetInsertPoint(
	      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

	  // Create a phi node for the new recurrence. The current value will either be
	  // the initial value inserted into a vector or loop-varying vector value.
	  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
	  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

	  // Get the vectorized previous value of the last part UF - 1. It appears last
	  // among all unrolled iterations, due to the order of their construction.
	  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

	  // Set the insertion point after the previous value if it is an instruction.
	  // Note that the previous value may have been constant-folded so it is not
	  // guaranteed to be an instruction in the vector loop. Also, if the previous
	  // value is a phi node, we should insert after all the phi nodes to avoid
	  // breaking basic block verification.
	  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
	      isa<PHINode>(PreviousLastPart))
	    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
	  else
	    Builder.SetInsertPoint(
	        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

	  // We will construct a vector for the recurrence by combining the values for
	  // the current and previous iterations. This is the required shuffle mask:
	  // lane 0 takes the last lane of the first operand, lanes 1..VF-1 take lanes
	  // 0..VF-2 of the second operand.
	  SmallVector<Constant *, 8> ShuffleMask(VF);
	  ShuffleMask[0] = Builder.getInt32(VF - 1);
	  for (unsigned I = 1; I < VF; ++I)
	    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

	  // The vector from which to take the initial value for the current iteration
	  // (actual or unrolled). Initially, this is the vector phi node.
	  Value *Incoming = VecPhi;

	  // Shuffle the current and previous vector and update the vector parts.
	  for (unsigned Part = 0; Part < UF; ++Part) {
	    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
	    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
	    auto *Shuffle =
	        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
	                                             ConstantVector::get(ShuffleMask))
	               : Incoming;
	    // Replace the temporary phase-one phi with the real value and drop it.
	    PhiPart->replaceAllUsesWith(Shuffle);
	    cast<Instruction>(PhiPart)->eraseFromParent();
	    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
	    Incoming = PreviousPart;
	  }

	  // Fix the latch value of the new recurrence in the vector loop.
	  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

	  // Extract the last vector element in the middle block. This will be the
	  // initial value for the recurrence when jumping to the scalar loop.
	  auto *ExtractForScalar = Incoming;
	  if (VF > 1) {
	    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
	    ExtractForScalar = Builder.CreateExtractElement(
	        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
	  }
	  // Extract the second last element in the middle block if the
	  // Phi is used outside the loop. We need to extract the phi itself
	  // and not the last element (the phi update in the current iteration). This
	  // will be the value when jumping to the exit block from the LoopMiddleBlock,
	  // when the scalar loop is not run at all.
	  Value *ExtractForPhiUsedOutsideLoop = nullptr;
	  if (VF > 1)
	    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
	        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
	  // When loop is unrolled without vectorizing, initialize
	  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
	  // `Incoming`. This is analogous to the vectorized case above: extracting the
	  // second last element when VF > 1.
	  else if (UF > 1)
	    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

	  // Fix the initial value of the original recurrence in the scalar loop.
	  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
	  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
	  for (auto *BB : predecessors(LoopScalarPreHeader)) {
	    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
	    Start->addIncoming(Incoming, BB);
	  }

	  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
	  Phi->setName("scalar.recur");

	  // Finally, fix users of the recurrence outside the loop. The users will need
	  // either the last value of the scalar recurrence or the last value of the
	  // vector recurrence we extracted in the middle block. Since the loop is in
	  // LCSSA form, we just need to find all the phi nodes for the original scalar
	  // recurrence in the exit block, and then add an edge for the middle block.
	  // Every such phi must be updated (not only the first one found): the middle
	  // block is a predecessor of the exit block, so each exit phi needs an
	  // incoming value for it.
	  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
	    if (LCSSAPhi.getIncomingValue(0) == Phi)
	      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
	  }
	}

	void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
	  // Completes the vectorization of the reduction phi \p Phi: fills in the
	  // incoming values of the vector phis created earlier, combines the unrolled
	  // parts in the middle block, and rewires the scalar loop and the exit-block
	  // phis to use the reduced value.
	  Constant *Zero = Builder.getInt32(0);

	  // Get its reduction variable descriptor.
	  assert(Legal->isReductionVariable(Phi) &&
	         "Unable to find the reduction variable");
	  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

	  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
	  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
	  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
	  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
	      RdxDesc.getMinMaxRecurrenceKind();
	  setDebugLocFromInst(Builder, ReductionStartValue);

	  // We need to generate a reduction vector from the incoming scalar.
	  // To do so, we need to generate the 'identity' vector and override
	  // one of the elements with the incoming scalar reduction. We need
	  // to do it in the vector-loop preheader.
	  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

	  // This is the vector-clone of the value that leaves the loop.
	  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

	  // Find the reduction identity variable. Zero for addition, or, xor,
	  // one for multiplication, -1 for And.
	  Value *Identity;
	  Value *VectorStart;
	  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
	      RK == RecurrenceDescriptor::RK_FloatMinMax) {
	    // MinMax reductions have the start value as their identity.
	    if (VF == 1) {
	      VectorStart = Identity = ReductionStartValue;
	    } else {
	      VectorStart = Identity =
	          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
	    }
	  } else {
	    // Handle other reduction kinds:
	    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
	        RK, VecTy->getScalarType());
	    if (VF == 1) {
	      Identity = Iden;
	      // This vector is the Identity vector where the first element is the
	      // incoming scalar reduction.
	      VectorStart = ReductionStartValue;
	    } else {
	      Identity = ConstantVector::getSplat(VF, Iden);

	      // This vector is the Identity vector where the first element is the
	      // incoming scalar reduction.
	      VectorStart =
	          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
	    }
	  }

	  // Fix the vector-loop phi.

	  // Reductions do not have to start at zero. They can start with
	  // any loop invariant values.
	  BasicBlock *Latch = OrigLoop->getLoopLatch();
	  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
	  for (unsigned Part = 0; Part < UF; ++Part) {
	    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
	    Value *Val = getOrCreateVectorValue(LoopVal, Part);
	    // Make sure to add the reduction start value only to the
	    // first unroll part.
	    Value *StartVal = (Part == 0) ? VectorStart : Identity;
	    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
	    cast<PHINode>(VecRdxPhi)
	        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
	  }

	  // Before each round, move the insertion point right between
	  // the PHIs and the values we are going to write.
	  // This allows us to write both PHINodes and the extractelement
	  // instructions.
	  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

	  setDebugLocFromInst(Builder, LoopExitInst);

	  // If the vector reduction can be performed in a smaller type, we truncate
	  // then extend the loop exit value to enable InstCombine to evaluate the
	  // entire expression in the smaller type.
	  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
	    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
	    Builder.SetInsertPoint(
	        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
	    VectorParts RdxParts(UF);
	    for (unsigned Part = 0; Part < UF; ++Part) {
	      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
	      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
	      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
	                                        : Builder.CreateZExt(Trunc, VecTy);
	      // Redirect every user except the truncate itself to the extended
	      // value. The iterator is advanced before the use is rewritten.
	      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
	           UI != RdxParts[Part]->user_end();)
	        if (*UI != Trunc) {
	          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
	          RdxParts[Part] = Extnd;
	        } else {
	          ++UI;
	        }
	    }
	    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
	    for (unsigned Part = 0; Part < UF; ++Part) {
	      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
	      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
	    }
	  }

	  // Reduce all of the unrolled parts into a single vector.
	  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
	  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
	  setDebugLocFromInst(Builder, ReducedPartRdx);
	  for (unsigned Part = 1; Part < UF; ++Part) {
	    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
	    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
	      // Floating point operations had to be 'fast' to enable the reduction.
	      ReducedPartRdx = addFastMathFlag(
	          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
	                              ReducedPartRdx, "bin.rdx"));
	    else
	      ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
	          Builder, MinMaxKind, ReducedPartRdx, RdxPart);
	  }

	  if (VF > 1) {
	    bool NoNaN = Legal->hasFunNoNaNAttr();
	    ReducedPartRdx =
	        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
	    // If the reduction can be performed in a smaller type, we need to extend
	    // the reduction to the wider type before we branch to the original loop.
	    if (Phi->getType() != RdxDesc.getRecurrenceType())
	      ReducedPartRdx =
	          RdxDesc.isSigned()
	              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
	              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
	  }

	  // Create a phi node that merges control-flow from the backedge-taken check
	  // block and the middle block.
	  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
	                                        LoopScalarPreHeader->getTerminator());
	  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
	    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
	  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

	  // Now, we need to fix the users of the reduction variable
	  // inside and outside of the scalar remainder loop.
	  // We know that the loop is in LCSSA form. We need to update the
	  // PHI nodes in the exit blocks.
	  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
	    // All PHINodes need to have a single entry edge, or two if
	    // we already fixed them.
	    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

	    // We found a reduction value exit-PHI. Update it with the
	    // incoming bypass edge.
	    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
	      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
	  } // end of the LCSSA phi scan.

	  // Fix the scalar loop reduction variable with the incoming reduction sum
	  // from the vector body and from the backedge value.
	  int IncomingEdgeBlockIdx =
	      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
	  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
	  // Pick the other block.
	  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
	  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
	  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
	}

	void InnerLoopVectorizer::fixLCSSAPHIs() {
	for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
	if (LCSSAPhi.getNumIncomingValues() == 1) {
	assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
	"Incoming value isn't loop invariant");
	LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
	}
	}
	}

	/// Sink the operands of the predicated instruction \p PredInst into its
	/// block when all of their uses occur there, repeating until a full pass
	/// over the worklist sinks nothing.
	void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
	  // The basic block and loop containing the predicated instruction.
	  auto *PredBB = PredInst->getParent();
	  auto *VectorLoop = LI->getLoopFor(PredBB);

	  // Initialize a worklist with the operands of the predicated instruction.
	  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

	  // Holds instructions that we need to analyze again. An instruction may be
	  // reanalyzed if we don't yet know if we can sink it or not.
	  SmallVector<Instruction *, 8> InstsToReanalyze;

	  // Returns true if a given use occurs in the predicated block. Phi nodes use
	  // their operands in their corresponding predecessor blocks.
	  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
	    auto *I = cast<Instruction>(U.getUser());
	    BasicBlock *BB = I->getParent();
	    if (auto *Phi = dyn_cast<PHINode>(I))
	      BB = Phi->getIncomingBlock(
	          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
	    return BB == PredBB;
	  };

	  // Iteratively sink the scalarized operands of the predicated instruction
	  // into the block we created for it. When an instruction is sunk, its
	  // operands are then added to the worklist. The algorithm ends after one pass
	  // through the worklist doesn't sink a single instruction.
	  bool Changed;
	  do {
	    // Add the instructions that need to be reanalyzed to the worklist, and
	    // reset the changed indicator.
	    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
	    InstsToReanalyze.clear();
	    Changed = false;

	    while (!Worklist.empty()) {
	      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

	      // We can't sink an instruction if it is a phi node, is already in the
	      // predicated block, is not in the loop, or may have side effects.
	      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
	          !VectorLoop->contains(I) || I->mayHaveSideEffects())
	        continue;

	      // It's legal to sink the instruction if all its uses occur in the
	      // predicated block. Otherwise, there's nothing to do yet, and we may
	      // need to reanalyze the instruction.
	      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
	        InstsToReanalyze.push_back(I);
	        continue;
	      }

	      // Move the instruction to the beginning of the predicated block, and add
	      // its operands to the worklist.
	      I->moveBefore(&*PredBB->getFirstInsertionPt());
	      Worklist.insert(I->op_begin(), I->op_end());

	      // The sinking may have enabled other instructions to be sunk, so we will
	      // need to iterate.
	      Changed = true;
	    }
	  } while (Changed);
	}

	/// Widen the header phi \p PN for the given unroll factor \p UF and
	/// vectorization factor \p VF. Reduction and first-order-recurrence phis get
	/// an empty placeholder phi per unroll part; pointer inductions are expanded
	/// into per-lane scalar values.
	void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
	                                              unsigned VF) {
	  assert(PN->getParent() == OrigLoop->getHeader() &&
	         "Non-header phis should have been handled elsewhere");

	  PHINode *P = cast<PHINode>(PN);
	  // In order to support recurrences we need to be able to vectorize Phi nodes.
	  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
	  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
	  // this value when we vectorize all of the instructions that use the PHI.
	  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
	    for (unsigned Part = 0; Part < UF; ++Part) {
	      // This is phase one of vectorizing PHIs.
	      Type *VecTy =
	          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
	      Value *EntryPart = PHINode::Create(
	          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
	      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
	    }
	    return;
	  }

	  setDebugLocFromInst(Builder, P);

	  // This PHINode must be an induction variable.
	  // Make sure that we know about it.
	  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

	  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
	  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

	  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
	  // which can be found from the original scalar operations.
	  switch (II.getKind()) {
	  case InductionDescriptor::IK_NoInduction:
	    llvm_unreachable("Unknown induction");
	  case InductionDescriptor::IK_IntInduction:
	  case InductionDescriptor::IK_FpInduction:
	    llvm_unreachable("Integer/fp induction is handled elsewhere.");
	  case InductionDescriptor::IK_PtrInduction: {
	    // Handle the pointer induction variable case.
	    assert(P->getType()->isPointerTy() && "Unexpected type.");
	    // This is the normalized GEP that starts counting at zero.
	    Value *PtrInd = Induction;
	    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
	    // Determine the number of scalars we need to generate for each unroll
	    // iteration. If the instruction is uniform, we only need to generate the
	    // first lane. Otherwise, we generate all VF values.
	    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
	    // These are the scalar results. Notice that we don't generate vector GEPs
	    // because scalar GEPs result in better code.
	    for (unsigned Part = 0; Part < UF; ++Part) {
	      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
	        // Offset of this (Part, Lane) pair from the start of the current
	        // vector iteration.
	        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
	        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
	        Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
	        SclrGep->setName("next.gep");
	        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
	      }
	    }
	    return;
	  }
	  }
	}

	/// A helper function for checking whether an integer division-related
	/// instruction may divide by zero (in which case it must be predicated if
	/// executed conditionally in the scalar code).
	///
	/// Returns true unless the divisor (operand 1) is a ConstantInt that is not
	/// zero.
	///
	/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
	/// Non-zero divisors that are non compile-time constants will not be
	/// converted into multiplication, so we will still end up scalarizing
	/// the division, but can do so w/o predication.
	static bool mayDivideByZero(Instruction &I) {
	  assert((I.getOpcode() == Instruction::UDiv ||
	          I.getOpcode() == Instruction::SDiv ||
	          I.getOpcode() == Instruction::URem ||
	          I.getOpcode() == Instruction::SRem) &&
	         "Unexpected instruction");
	  Value *Divisor = I.getOperand(1);
	  auto *CInt = dyn_cast<ConstantInt>(Divisor);
	  return !CInt || CInt->isZero();
	}

	/// Widen instruction \p I: for each of the UF unroll parts, create the
	/// widened counterpart and record it in VectorLoopValueMap. Handles GEPs,
	/// binary operators, selects, compares, casts and calls; any other opcode is
	/// unreachable here.
	void InnerLoopVectorizer::widenInstruction(Instruction &I) {
	  switch (I.getOpcode()) {
	  case Instruction::Br:
	  case Instruction::PHI:
	    llvm_unreachable("This instruction is handled by a different recipe.");
	  case Instruction::GetElementPtr: {
	    // Construct a vector GEP by widening the operands of the scalar GEP as
	    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
	    // results in a vector of pointers when at least one operand of the GEP
	    // is vector-typed. Thus, to keep the representation compact, we only use
	    // vector-typed operands for loop-varying values.
	    auto *GEP = cast<GetElementPtrInst>(&I);

	    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
	      // If we are vectorizing, but the GEP has only loop-invariant operands,
	      // the GEP we build (by only using vector-typed operands for
	      // loop-varying values) would be a scalar pointer. Thus, to ensure we
	      // produce a vector of pointers, we need to either arbitrarily pick an
	      // operand to broadcast, or broadcast a clone of the original GEP.
	      // Here, we broadcast a clone of the original.
	      //
	      // TODO: If at some point we decide to scalarize instructions having
	      //       loop-invariant operands, this special case will no longer be
	      //       required. We would add the scalarization decision to
	      //       collectLoopScalars() and teach getVectorValue() to broadcast
	      //       the lane-zero scalar value.
	      auto *Clone = Builder.Insert(GEP->clone());
	      for (unsigned Part = 0; Part < UF; ++Part) {
	        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
	        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
	        addMetadata(EntryPart, GEP);
	      }
	    } else {
	      // If the GEP has at least one loop-varying operand, we are sure to
	      // produce a vector of pointers. But if we are only unrolling, we want
	      // to produce a scalar GEP for each unroll part. Thus, the GEP we
	      // produce with the code below will be scalar (if VF == 1) or vector
	      // (otherwise). Note that for the unroll-only case, we still maintain
	      // values in the vector mapping with initVector, as we do for other
	      // instructions.
	      for (unsigned Part = 0; Part < UF; ++Part) {
	        // The pointer operand of the new GEP. If it's loop-invariant, we
	        // won't broadcast it.
	        auto *Ptr =
	            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
	                ? GEP->getPointerOperand()
	                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);

	        // Collect all the indices for the new GEP. If any index is
	        // loop-invariant, we won't broadcast it.
	        SmallVector<Value *, 4> Indices;
	        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
	          if (OrigLoop->isLoopInvariant(U.get()))
	            Indices.push_back(U.get());
	          else
	            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
	        }

	        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
	        // but it should be a vector, otherwise.
	        auto *NewGEP = GEP->isInBounds()
	                           ? Builder.CreateInBoundsGEP(Ptr, Indices)
	                           : Builder.CreateGEP(Ptr, Indices);
	        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
	               "NewGEP is not a pointer vector");
	        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
	        addMetadata(NewGEP, GEP);
	      }
	    }

	    break;
	  }
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::SRem:
	  case Instruction::URem:
	  case Instruction::Add:
	  case Instruction::FAdd:
	  case Instruction::Sub:
	  case Instruction::FSub:
	  case Instruction::Mul:
	  case Instruction::FMul:
	  case Instruction::FDiv:
	  case Instruction::FRem:
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	  case Instruction::And:
	  case Instruction::Or:
	  case Instruction::Xor: {
	    // Just widen binops.
	    auto *BinOp = cast<BinaryOperator>(&I);
	    setDebugLocFromInst(Builder, BinOp);

	    for (unsigned Part = 0; Part < UF; ++Part) {
	      Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
	      Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
	      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);

	      // The builder may hand back something other than a BinaryOperator
	      // (hence the dyn_cast); only copy IR flags onto a real instruction.
	      if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
	        VecOp->copyIRFlags(BinOp);

	      // Use this vector value for all users of the original instruction.
	      VectorLoopValueMap.setVectorValue(&I, Part, V);
	      addMetadata(V, BinOp);
	    }

	    break;
	  }
	  case Instruction::Select: {
	    // Widen selects.
	    // If the selector is loop invariant we can create a select
	    // instruction with a scalar condition. Otherwise, use vector-select.
	    auto *SE = PSE.getSE();
	    bool InvariantCond =
	        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
	    setDebugLocFromInst(Builder, &I);

	    // The condition can be loop invariant but still defined inside the
	    // loop. This means that we can't just use the original 'cond' value.
	    // We have to take the 'vectorized' value and pick the first lane.
	    // Instcombine will make this a no-op.

	    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

	    for (unsigned Part = 0; Part < UF; ++Part) {
	      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
	      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
	      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
	      Value *Sel =
	          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
	      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
	      addMetadata(Sel, &I);
	    }

	    break;
	  }

	  case Instruction::ICmp:
	  case Instruction::FCmp: {
	    // Widen compares. Generate vector compares.
	    bool FCmp = (I.getOpcode() == Instruction::FCmp);
	    // The opcode guarantees a CmpInst, and the result is dereferenced
	    // unconditionally below, so use the asserting cast.
	    auto *Cmp = cast<CmpInst>(&I);
	    setDebugLocFromInst(Builder, Cmp);
	    for (unsigned Part = 0; Part < UF; ++Part) {
	      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
	      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
	      Value *C = nullptr;
	      if (FCmp) {
	        // Propagate fast math flags.
	        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
	        Builder.setFastMathFlags(Cmp->getFastMathFlags());
	        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
	      } else {
	        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
	      }
	      VectorLoopValueMap.setVectorValue(&I, Part, C);
	      addMetadata(C, &I);
	    }

	    break;
	  }

	  case Instruction::ZExt:
	  case Instruction::SExt:
	  case Instruction::FPToUI:
	  case Instruction::FPToSI:
	  case Instruction::FPExt:
	  case Instruction::PtrToInt:
	  case Instruction::IntToPtr:
	  case Instruction::SIToFP:
	  case Instruction::UIToFP:
	  case Instruction::Trunc:
	  case Instruction::FPTrunc:
	  case Instruction::BitCast: {
	    // The opcode guarantees a CastInst, and the result is dereferenced
	    // unconditionally below, so use the asserting cast.
	    auto *CI = cast<CastInst>(&I);
	    setDebugLocFromInst(Builder, CI);

	    /// Vectorize casts.
	    Type *DestTy =
	        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

	    for (unsigned Part = 0; Part < UF; ++Part) {
	      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
	      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
	      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
	      addMetadata(Cast, &I);
	    }
	    break;
	  }

	  case Instruction::Call: {
	    // Ignore dbg intrinsics.
	    if (isa<DbgInfoIntrinsic>(I))
	      break;
	    setDebugLocFromInst(Builder, &I);

	    Module *M = I.getParent()->getParent()->getParent();
	    auto *CI = cast<CallInst>(&I);

	    StringRef FnName = CI->getCalledFunction()->getName();
	    Function *F = CI->getCalledFunction();
	    Type *RetTy = ToVectorTy(CI->getType(), VF);
	    SmallVector<Type *, 4> Tys;
	    for (Value *ArgOperand : CI->arg_operands())
	      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

	    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

	    // The flag shows whether we use Intrinsic or a usual Call for vectorized
	    // version of the instruction.
	    // Is it beneficial to perform intrinsic call compared to lib call?
	    bool NeedToScalarize;
	    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
	    bool UseVectorIntrinsic =
	        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
	    assert((UseVectorIntrinsic || !NeedToScalarize) &&
	           "Instruction should be scalarized elsewhere.");

	    for (unsigned Part = 0; Part < UF; ++Part) {
	      SmallVector<Value *, 4> Args;
	      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
	        Value *Arg = CI->getArgOperand(i);
	        // Some intrinsics have a scalar argument - don't replace it with a
	        // vector.
	        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
	          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
	        Args.push_back(Arg);
	      }

	      Function *VectorF;
	      if (UseVectorIntrinsic) {
	        // Use vector version of the intrinsic.
	        Type *TysForDecl[] = {CI->getType()};
	        if (VF > 1)
	          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
	        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
	      } else {
	        // Use vector version of the library call.
	        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
	        assert(!VFnName.empty() && "Vector function name is empty.");
	        VectorF = M->getFunction(VFnName);
	        if (!VectorF) {
	          // Generate a declaration
	          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
	          VectorF =
	              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
	          VectorF->copyAttributesFrom(F);
	        }
	      }
	      assert(VectorF && "Can't create vector function.");

	      SmallVector<OperandBundleDef, 1> OpBundles;
	      CI->getOperandBundlesAsDefs(OpBundles);
	      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

	      if (isa<FPMathOperator>(V))
	        V->copyFastMathFlags(CI);

	      VectorLoopValueMap.setVectorValue(&I, Part, V);
	      addMetadata(V, &I);
	    }

	    break;
	  }

	  default:
	    // This instruction is not vectorized by simple widening.
	    DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
	    llvm_unreachable("Unhandled instruction!");
	  } // end of switch.
	}

	void InnerLoopVectorizer::updateAnalysis() {
	// Forget the original basic block.
	PSE.getSE()->forgetLoop(OrigLoop);

	// Update the dominator tree information.
	assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
	"Entry does not dominate exit.");

	DT->addNewBlock(LoopMiddleBlock,
	LI->getLoopFor(LoopVectorBody)->getLoopLatch());
	DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
	DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
	DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
	DEBUG(DT->verifyDomTree());
	}

	/// \brief Check whether it is safe to if-convert the phi nodes of \p BB.
	///
	/// Returns false as soon as any incoming value of any phi in \p BB is a
	/// constant for which canTrap() holds; returns true otherwise.
	static bool canIfConvertPHINodes(BasicBlock *BB) {
	  for (PHINode &Node : BB->phis())
	    for (Value *Incoming : Node.incoming_values()) {
	      auto *ConstIncoming = dyn_cast<Constant>(Incoming);
	      if (ConstIncoming && ConstIncoming->canTrap())
	        return false;
	    }
	  return true;
	}

	bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
	if (!EnableIfConversion) {
	ORE->emit(createMissedAnalysis("IfConversionDisabled")
	<< "if-conversion is disabled");
	return false;
	}

	assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

	// A list of pointers that we can safely read and write to.
	SmallPtrSet<Value *, 8> SafePointes;

	// Collect safe addresses.
	for (BasicBlock *BB : TheLoop->blocks()) {
	if (blockNeedsPredication(BB))
	continue;

	for (Instruction &I : *BB)
	if (auto *Ptr = getPointerOperand(&I))
	SafePointes.insert(Ptr);
	}

	// Collect the blocks that need predication.
	BasicBlock *Header = TheLoop->getHeader();
	for (BasicBlock *BB : TheLoop->blocks()) {
	// We don't support switch statements inside loops.
	if (!isa<BranchInst>(BB->getTerminator())) {
	ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
	<< "loop contains a switch statement");
	return false;
	}

	// We must be able to predicate all blocks that need to be predicated.
	if (blockNeedsPredication(BB)) {
	if (!blockCanBePredicated(BB, SafePointes)) {
	ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
	<< "control flow cannot be substituted for a select");
	return false;
	}
	} else if (BB != Header && !canIfConvertPHINodes(BB)) {
	ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
	<< "control flow cannot be substituted for a select");
	return false;
	}
	}

	// We can if-convert this loop.
	return true;
	}

	/// Decide whether TheLoop is legal to vectorize by running the CFG-shape,
	/// if-conversion, instruction, memory and SCEV-predicate checks in order.
	///
	/// When ORE allows extra analysis for DEBUG_TYPE, a failed check does not end
	/// the function: the remaining checks still run so that every reason for not
	/// vectorizing can be reported, and the accumulated verdict is returned at the
	/// end. Otherwise the first failed check returns false immediately.
	bool LoopVectorizationLegality::canVectorize() {
	  // Accumulated verdict; only ever flips from true to false.
	  bool Result = true;
	  const bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

	  // Records a failed check. Yields true when the remaining checks should still
	  // be run (extra analysis requested) and false when the caller must bail out.
	  auto RecordFailure = [&]() -> bool {
	    Result = false;
	    return DoExtraAnalysis;
	  };

	  // We must have a loop in canonical form. Loops with indirectbr in them cannot
	  // be canonicalized.
	  if (!TheLoop->getLoopPreheader()) {
	    DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
	    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
	              << "loop control flow is not understood by vectorizer");
	    if (!RecordFailure())
	      return false;
	  }

	  // FIXME: The code is currently dead, since the loop gets sent to
	  // LoopVectorizationLegality is already an innermost loop.
	  //
	  // We can only vectorize innermost loops.
	  if (!TheLoop->empty()) {
	    ORE->emit(createMissedAnalysis("NotInnermostLoop")
	              << "loop is not the innermost loop");
	    if (!RecordFailure())
	      return false;
	  }

	  // We must have a single backedge.
	  if (TheLoop->getNumBackEdges() != 1) {
	    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
	              << "loop control flow is not understood by vectorizer");
	    if (!RecordFailure())
	      return false;
	  }

	  // We must have a single exiting block.
	  if (!TheLoop->getExitingBlock()) {
	    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
	              << "loop control flow is not understood by vectorizer");
	    if (!RecordFailure())
	      return false;
	  }

	  // We only handle bottom-tested loops, i.e. loop in which the condition is
	  // checked at the end of each iteration. With that we can assume that all
	  // instructions in the loop are executed the same number of times.
	  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
	    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
	              << "loop control flow is not understood by vectorizer");
	    if (!RecordFailure())
	      return false;
	  }

	  // We need to have a loop header.
	  DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
	               << '\n');

	  // Loops with more than one block must be if-convertible.
	  if (TheLoop->getNumBlocks() != 1 && !canVectorizeWithIfConvert()) {
	    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
	    if (!RecordFailure())
	      return false;
	  }

	  // Check if we can vectorize the instructions and CFG in this loop.
	  if (!canVectorizeInstrs()) {
	    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
	    if (!RecordFailure())
	      return false;
	  }

	  // Go over each instruction and look at memory deps.
	  if (!canVectorizeMemory()) {
	    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
	    if (!RecordFailure())
	      return false;
	  }

	  DEBUG(dbgs() << "LV: We can vectorize this loop"
	               << (LAI->getRuntimePointerChecking()->Need
	                       ? " (with a runtime bound check)"
	                       : "")
	               << "!\n");

	  // Start from the target's preference; an explicitly passed command-line
	  // option overrides it.
	  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
	  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
	    UseInterleaved = EnableInterleavedMemAccesses;

	  // Analyze interleaved memory accesses.
	  if (UseInterleaved)
	    InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());

	  // A loop whose vectorization is forced through hints gets the pragma
	  // threshold instead of the default one.
	  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
	  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
	    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

	  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
	    ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
	              << "Too many SCEV assumptions need to be made and checked "
	              << "at runtime");
	    DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
	    if (!RecordFailure())
	      return false;
	  }

	  // Okay! We've done all the tests. If any have failed, return false. Otherwise
	  // we can vectorize, and at this point we don't have any other mem analysis
	  // which may limit our maximum vectorization factor, so just return true with
	  // no restrictions.
	  return Result;
	}

	/// Map \p Ty to the integer type used when comparing induction-variable
	/// widths. A pointer type becomes the integer type \p DL reports for it via
	/// getIntPtrType, a type whose scalar size is below 32 bits becomes i32, and
	/// any other type is returned unchanged.
	static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
	  if (Ty->isPointerTy())
	    return DL.getIntPtrType(Ty);

	  // It is possible that char's or short's overflow when we ask for the loop's
	  // trip count, work around this by changing the type size.
	  if (Ty->getScalarSizeInBits() < 32)
	    return Type::getInt32Ty(Ty->getContext());

	  return Ty;
	}

	/// Return whichever of \p Ty0 and \p Ty1 has the larger scalar size in bits
	/// after both have been normalized by convertPointerToIntegerType. On a tie
	/// the (normalized) \p Ty1 is returned.
	static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
	  Ty0 = convertPointerToIntegerType(DL, Ty0);
	  Ty1 = convertPointerToIntegerType(DL, Ty1);
	  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
	    return Ty0;
	  return Ty1;
	}

	/// \brief Check that the instruction has outside loop users and is not an
	/// identified reduction variable.
	static bool hasOutsideLoopUser(const Loop TheLoop, Instruction Inst,
	SmallPtrSetImpl<Value *> &AllowedExit) {
	// Reduction and Induction instructions are allowed to have exit users. All
	// other instructions must not have external users.
	if (!AllowedExit.count(Inst))
	// Check that all of the users of the loop are inside the BB.
	for (User *U : Inst->users()) {
	Instruction *UI = cast<Instruction>(U);
	// This user may be a reduction exit value.
	if (!TheLoop->contains(UI)) {
	DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
	return true;
	}
	}
	return false;
	}

	/// Record \p Phi as an induction variable described by \p ID.
	///
	/// Besides storing the descriptor in Inductions, this remembers the first
	/// cast of the induction (if any) in InductionCastsToIgnore, updates
	/// WidestIndTy for non-floating-point inductions, possibly selects \p Phi as
	/// PrimaryInduction, and, when the SCEV predicate is always true, adds the
	/// PHI and its latch-incoming value to \p AllowedExit.
	void LoopVectorizationLegality::addInductionPhi(
	    PHINode *Phi, const InductionDescriptor &ID,
	    SmallPtrSetImpl<Value *> &AllowedExit) {
	  Inductions[Phi] = ID;

	  // In case this induction also comes with casts that we know we can ignore
	  // in the vectorized loop body, record them here. All casts could be recorded
	  // here for ignoring, but suffices to record only the first (as it is the
	  // only one that may be used outside the cast sequence).
	  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
	  if (!Casts.empty())
	    InductionCastsToIgnore.insert(*Casts.begin());

	  Type *PhiTy = Phi->getType();
	  const DataLayout &DL = Phi->getModule()->getDataLayout();

	  // Get the widest type.
	  if (!PhiTy->isFloatingPointTy()) {
	    if (!WidestIndTy)
	      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
	    else
	      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
	  }

	  // Int inductions are special because we only allow one IV.
	  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
	      ID.getConstIntStepValue() &&
	      ID.getConstIntStepValue()->isOne() &&
	      isa<Constant>(ID.getStartValue()) &&
	      cast<Constant>(ID.getStartValue())->isNullValue()) {

	    // Use the phi node with the widest type as induction. Use the last
	    // one if there are multiple (no good reason for doing this other
	    // than it is expedient). We've checked that it begins at zero and
	    // steps by one, so this is a canonical induction variable.
	    if (!PrimaryInduction || PhiTy == WidestIndTy)
	      PrimaryInduction = Phi;
	  }

	  // Both the PHI node itself, and the "post-increment" value feeding
	  // back into the PHI node may have external users.
	  // We can allow those uses, except if the SCEVs we have for them rely
	  // on predicates that only hold within the loop, since allowing the exit
	  // currently means re-using this SCEV outside the loop.
	  if (PSE.getUnionPredicate().isAlwaysTrue()) {
	    AllowedExit.insert(Phi);
	    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
	  }

	  DEBUG(dbgs() << "LV: Found an induction variable.\n");
	}

	/// Scan every instruction of every block in TheLoop and decide whether the
	/// instructions permit vectorization.
	///
	/// PHIs in the header are classified as reductions, inductions or first-order
	/// recurrences (recorded in Reductions, Inductions via addInductionPhi, and
	/// FirstOrderRecurrences); PHIs elsewhere only have to be free of outside
	/// users. Non-PHI instructions are rejected when they are calls without a
	/// vector mapping, have a result or stored type that is not a valid vector
	/// element type, are extractelement instructions, or have disallowed users
	/// outside the loop. A remark is emitted through ORE for each rejection.
	///
	/// Also sets HasFunNoNaNAttr from the function's "no-nans-fp-math" attribute
	/// and may clear PrimaryInduction when its type differs from WidestIndTy.
	///
	/// \returns false at the first offending instruction, or when no induction
	///          variable at all was found; true otherwise.
	bool LoopVectorizationLegality::canVectorizeInstrs() {
	  BasicBlock *Header = TheLoop->getHeader();

	  // Look for the attribute signaling the absence of NaNs.
	  Function &F = *Header->getParent();
	  HasFunNoNaNAttr =
	      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";

	  // For each block in the loop.
	  for (BasicBlock *BB : TheLoop->blocks()) {
	    // Scan the instructions in the block and look for hazards.
	    for (Instruction &I : *BB) {
	      if (auto *Phi = dyn_cast<PHINode>(&I)) {
	        Type *PhiTy = Phi->getType();
	        // Check that this PHI type is allowed.
	        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
	            !PhiTy->isPointerTy()) {
	          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
	                    << "loop control flow is not understood by vectorizer");
	          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
	          return false;
	        }

	        // If this PHINode is not in the header block, then we know that we
	        // can convert it to select during if-conversion. No need to check if
	        // the PHIs in this block are induction or reduction variables.
	        if (BB != Header) {
	          // Check that this instruction has no outside users or is an
	          // identified reduction value with an outside user.
	          if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
	            continue;
	          ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
	                    << "value could not be identified as "
	                       "an induction or reduction variable");
	          return false;
	        }

	        // We only allow if-converted PHIs with exactly two incoming values.
	        if (Phi->getNumIncomingValues() != 2) {
	          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
	                    << "control flow not understood by vectorizer");
	          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
	          return false;
	        }

	        RecurrenceDescriptor RedDes;
	        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
	          if (RedDes.hasUnsafeAlgebra())
	            Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
	          AllowedExit.insert(RedDes.getLoopExitInstr());
	          Reductions[Phi] = RedDes;
	          continue;
	        }

	        InductionDescriptor ID;
	        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
	          addInductionPhi(Phi, ID, AllowedExit);
	          if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
	            Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
	          continue;
	        }

	        if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
	                                                         SinkAfter, DT)) {
	          FirstOrderRecurrences.insert(Phi);
	          continue;
	        }

	        // As a last resort, coerce the PHI to an AddRec expression
	        // and re-try classifying it as an induction PHI.
	        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
	          addInductionPhi(Phi, ID, AllowedExit);
	          continue;
	        }

	        ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
	                  << "value that could not be identified as "
	                     "reduction is used outside the loop");
	        DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
	        return false;
	      } // end of PHI handling

	      // We handle calls that:
	      //   * Are debug info intrinsics.
	      //   * Have a mapping to an IR intrinsic.
	      //   * Have a vector version available.
	      auto *CI = dyn_cast<CallInst>(&I);
	      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
	          !isa<DbgInfoIntrinsic>(CI) &&
	          !(CI->getCalledFunction() && TLI &&
	            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
	        ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
	                  << "call instruction cannot be vectorized");
	        DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
	        return false;
	      }

	      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
	      // second argument is the same (i.e. loop invariant)
	      if (CI && hasVectorInstrinsicScalarOpd(
	                    getVectorIntrinsicIDForCall(CI, TLI), 1)) {
	        auto *SE = PSE.getSE();
	        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
	          ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
	                    << "intrinsic instruction cannot be vectorized");
	          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
	          return false;
	        }
	      }

	      // Check that the instruction return type is vectorizable.
	      // Also, we can't vectorize extractelement instructions.
	      if ((!VectorType::isValidElementType(I.getType()) &&
	           !I.getType()->isVoidTy()) ||
	          isa<ExtractElementInst>(I)) {
	        ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
	                  << "instruction return type cannot be vectorized");
	        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
	        return false;
	      }

	      // Check that the stored type is vectorizable.
	      if (auto *ST = dyn_cast<StoreInst>(&I)) {
	        Type *T = ST->getValueOperand()->getType();
	        if (!VectorType::isValidElementType(T)) {
	          ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
	                    << "store instruction cannot be vectorized");
	          return false;
	        }

	        // FP instructions can allow unsafe algebra, thus vectorizable by
	        // non-IEEE-754 compliant SIMD units.
	        // This applies to floating-point math operations and calls, not memory
	        // operations, shuffles, or casts, as they don't change precision or
	        // semantics.
	      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
	                 !I.isFast()) {
	        DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
	        Hints->setPotentiallyUnsafe();
	      }

	      // Reduction instructions are allowed to have exit users.
	      // All other instructions must not have external users.
	      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
	        ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
	                  << "value cannot be used outside the loop");
	        return false;
	      }
	    } // next instr.
	  }

	  if (!PrimaryInduction) {
	    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
	    if (Inductions.empty()) {
	      ORE->emit(createMissedAnalysis("NoInductionVariable")
	                << "loop induction variable could not be identified");
	      return false;
	    }
	  }

	  // Now we know the widest induction type, check if our found induction
	  // is the same size. If it's not, unset it here and InnerLoopVectorizer
	  // will create another.
	  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
	    PrimaryInduction = nullptr;

	  return true;
	}

	void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
	// We should not collect Scalars more than once per VF. Right now, this
	// function is called from collectUniformsAndScalars(), which already does
	// this check. Collecting Scalars for VF=1 does not make any sense.
	assert(VF >= 2 && !Scalars.count(VF) &&
	"This function should not be visited twice for the same VF");

	SmallSetVector<Instruction *, 8> Worklist;

	// These sets are used to seed the analysis with pointers used by memory
	// accesses that will remain scalar.
	SmallSetVector<Instruction *, 8> ScalarPtrs;
	SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

	// A helper that returns true if the use of Ptr by MemAccess will be scalar.
	// The pointer operands of loads and stores will be scalar as long as the
	// memory access is not a gather or scatter operation. The value operand of a
	// store will remain scalar if the store is scalarized.
	auto isScalarUse = [&](Instruction MemAccess, Value Ptr) {
	InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
	assert(WideningDecision != CM_Unknown &&
	"Widening decision should be ready at this moment");
	if (auto *Store = dyn_cast<StoreInst>(MemAccess))
	if (Ptr == Store->getValueOperand())
	return WideningDecision == CM_Scalarize;
	assert(Ptr == getPointerOperand(MemAccess) &&
	"Ptr is neither a value or pointer operand");
	return WideningDecision != CM_GatherScatter;
	};

	// A helper that returns true if the given value is a bitcast or
	// getelementptr instruction contained in the loop.
	auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
	return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) \|\|
	isa<GetElementPtrInst>(V)) &&
	!TheLoop->isLoopInvariant(V);
	};

	// A helper that evaluates a memory access's use of a pointer. If the use
	// will be a scalar use, and the pointer is only used by memory accesses, we
	// place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
	// PossibleNonScalarPtrs.
	auto evaluatePtrUse = [&](Instruction MemAccess, Value Ptr) {
	// We only care about bitcast and getelementptr instructions contained in
	// the loop.
	if (!isLoopVaryingBitCastOrGEP(Ptr))
	return;

	// If the pointer has already been identified as scalar (e.g., if it was
	// also identified as uniform), there's nothing to do.
	auto *I = cast<Instruction>(Ptr);
	if (Worklist.count(I))
	return;

	// If the use of the pointer will be a scalar use, and all users of the
	// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
	// place the pointer in PossibleNonScalarPtrs.
	if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
	return isa<LoadInst>(U) \|\| isa<StoreInst>(U);
	}))
	ScalarPtrs.insert(I);
	else
	PossibleNonScalarPtrs.insert(I);
	};

	// We seed the scalars analysis with three classes of instructions: (1)
	// instructions marked uniform-after-vectorization, (2) bitcast and
	// getelementptr instructions used by memory accesses requiring a scalar use,
	// and (3) pointer induction variables and their update instructions (we
	// currently only scalarize these).
	//
	// (1) Add to the worklist all instructions that have been identified as
	// uniform-after-vectorization.
	Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

	// (2) Add to the worklist all bitcast and getelementptr instructions used by
	// memory accesses requiring a scalar use. The pointer operands of loads and
	// stores will be scalar as long as the memory accesses is not a gather or
	// scatter operation. The value operand of a store will remain scalar if the
	// store is scalarized.
	for (auto *BB : TheLoop->blocks())
	for (auto &I : *BB) {
	if (auto *Load = dyn_cast<LoadInst>(&I)) {
	evaluatePtrUse(Load, Load->getPointerOperand());
	} else if (auto *Store = dyn_cast<StoreInst>(&I)) {
	evaluatePtrUse(Store, Store->getPointerOperand());
	evaluatePtrUse(Store, Store->getValueOperand());
	}
	}
	for (auto *I : ScalarPtrs)
	if (!PossibleNonScalarPtrs.count(I)) {
	DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
	Worklist.insert(I);
	}

	// (3) Add to the worklist all pointer induction variables and their update
	// instructions.
	//
	// TODO: Once we are able to vectorize pointer induction variables we should
	// no longer insert them into the worklist here.
	auto *Latch = TheLoop->getLoopLatch();
	for (auto &Induction : *Legal->getInductionVars()) {
	auto *Ind = Induction.first;
	auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
	if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
	continue;
	Worklist.insert(Ind);
	Worklist.insert(IndUpdate);
	DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
	DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
	}

	// Insert the forced scalars.
	// FIXME: Currently widenPHIInstruction() often creates a dead vector
	// induction variable when the PHI user is scalarized.
	if (ForcedScalars.count(VF))
	for (auto *I : ForcedScalars.find(VF)->second)
	Worklist.insert(I);

	// Expand the worklist by looking through any bitcasts and getelementptr
	// instructions we've already identified as scalar. This is similar to the
	// expansion step in collectLoopUniforms(); however, here we're only
	// expanding to include additional bitcasts and getelementptr instructions.
	unsigned Idx = 0;
	while (Idx != Worklist.size()) {
	Instruction *Dst = Worklist[Idx++];
	if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
	continue;
	auto *Src = cast<Instruction>(Dst->getOperand(0));
	if (llvm::all_of(Src->users(), [&](User *U) -> bool {
	auto *J = cast<Instruction>(U);
	return !TheLoop->contains(J) \|\| Worklist.count(J) \|\|
	((isa<LoadInst>(J) \|\| isa<StoreInst>(J)) &&
	isScalarUse(J, Src));
	})) {
	Worklist.insert(Src);
	DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
	}
	}

	// An induction variable will remain scalar if all users of the induction
	// variable and induction variable update remain scalar.
	for (auto &Induction : *Legal->getInductionVars()) {
	auto *Ind = Induction.first;
	auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

	// We already considered pointer induction variables, so there's no reason
	// to look at their users again.
	//
	// TODO: Once we are able to vectorize pointer induction variables we
	// should no longer skip over them here.
	if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
	continue;

	// Determine if all users of the induction variable are scalar after
	// vectorization.
	auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
	auto *I = cast<Instruction>(U);
	return I == IndUpdate \|\| !TheLoop->contains(I) \|\| Worklist.count(I);
	});
	if (!ScalarInd)
	continue;

	// Determine if all users of the induction variable update instruction are
	// scalar after vectorization.
	auto ScalarIndUpdate =
	llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
	auto *I = cast<Instruction>(U);
	return I == Ind \|\| !TheLoop->contains(I) \|\| Worklist.count(I);
	});
	if (!ScalarIndUpdate)
	continue;

	// The induction variable and its update instruction will remain scalar.
	Worklist.insert(Ind);
	Worklist.insert(IndUpdate);
	DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
	DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
	}

	Scalars[VF].insert(Worklist.begin(), Worklist.end());
	}

	/// Report whether \p I has to be scalarized and predicated.
	///
	/// Only instructions whose parent block needs predication qualify. Among
	/// those, a store qualifies unless isMaskRequired(I) holds, and an integer
	/// division or remainder qualifies when mayDivideByZero(*I) holds. Every
	/// other opcode yields false.
	bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
	  // Nothing in a block that does not need predication qualifies.
	  if (!blockNeedsPredication(I->getParent()))
	    return false;

	  switch (I->getOpcode()) {
	  case Instruction::Store:
	    return !isMaskRequired(I);
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::SRem:
	  case Instruction::URem:
	    return mayDivideByZero(*I);
	  default:
	    return false;
	  }
	}

	/// Report whether the load or store \p I can be widened for factor \p VF.
	///
	/// Widening is refused when the pointer operand is not consecutive, when the
	/// instruction is scalar-with-predication, or when hasIrregularType reports
	/// the accessed scalar type as irregular for \p VF.
	///
	/// \pre \p I is a LoadInst or a StoreInst (asserted).
	bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
	                                                              unsigned VF) {
	  // Get and ensure we have a valid memory instruction.
	  LoadInst *LI = dyn_cast<LoadInst>(I);
	  StoreInst *SI = dyn_cast<StoreInst>(I);
	  assert((LI || SI) && "Invalid memory instruction");

	  auto *Ptr = getPointerOperand(I);

	  // In order to be widened, the pointer should be consecutive, first of all.
	  if (!isConsecutivePtr(Ptr))
	    return false;

	  // If the instruction is a store located in a predicated block, it will be
	  // scalarized.
	  if (isScalarWithPredication(I))
	    return false;

	  // If the instruction's allocated size doesn't equal its type size, it
	  // requires padding and will be scalarized.
	  auto &DL = I->getModule()->getDataLayout();
	  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
	  if (hasIrregularType(ScalarTy, DL, VF))
	    return false;

	  return true;
	}

	void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
	// We should not collect Uniforms more than once per VF. Right now,
	// this function is called from collectUniformsAndScalars(), which
	// already does this check. Collecting Uniforms for VF=1 does not make any
	// sense.

	assert(VF >= 2 && !Uniforms.count(VF) &&
	"This function should not be visited twice for the same VF");

	// Visit the list of Uniforms. If we'll not find any uniform value, we'll
	// not analyze again. Uniforms.count(VF) will return 1.
	Uniforms[VF].clear();

	// We now know that the loop is vectorizable!
	// Collect instructions inside the loop that will remain uniform after
	// vectorization.

	// Global values, params and instructions outside of current loop are out of
	// scope.
	auto isOutOfScope = [&](Value *V) -> bool {
	Instruction *I = dyn_cast<Instruction>(V);
	return (!I \|\| !TheLoop->contains(I));
	};

	SetVector<Instruction *> Worklist;
	BasicBlock *Latch = TheLoop->getLoopLatch();

	// Start with the conditional branch. If the branch condition is an
	// instruction contained in the loop that is only used by the branch, it is
	// uniform.
	auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
	if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
	Worklist.insert(Cmp);
	DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
	}

	// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
	// are pointers that are treated like consecutive pointers during
	// vectorization. The pointer operands of interleaved accesses are an
	// example.
	SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

	// Holds pointer operands of instructions that are possibly non-uniform.
	SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

	auto isUniformDecision = [&](Instruction *I, unsigned VF) {
	InstWidening WideningDecision = getWideningDecision(I, VF);
	assert(WideningDecision != CM_Unknown &&
	"Widening decision should be ready at this moment");

	return (WideningDecision == CM_Widen \|\|
	WideningDecision == CM_Widen_Reverse \|\|
	WideningDecision == CM_Interleave);
	};
	// Iterate over the instructions in the loop, and collect all
	// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
	// that a consecutive-like pointer operand will be scalarized, we collect it
	// in PossibleNonUniformPtrs instead. We use two sets here because a single
	// getelementptr instruction can be used by both vectorized and scalarized
	// memory instructions. For example, if a loop loads and stores from the same
	// location, but the store is conditional, the store will be scalarized, and
	// the getelementptr won't remain uniform.
	for (auto *BB : TheLoop->blocks())
	for (auto &I : *BB) {
	// If there's no pointer operand, there's nothing to do.
	auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
	if (!Ptr)
	continue;

	// True if all users of Ptr are memory accesses that have Ptr as their
	// pointer operand.
	auto UsersAreMemAccesses =
	llvm::all_of(Ptr->users(), [&](User *U) -> bool {
	return getPointerOperand(U) == Ptr;
	});

	// Ensure the memory instruction will not be scalarized or used by
	// gather/scatter, making its pointer operand non-uniform. If the pointer
	// operand is used by any instruction other than a memory access, we
	// conservatively assume the pointer operand may be non-uniform.
	if (!UsersAreMemAccesses \|\| !isUniformDecision(&I, VF))
	PossibleNonUniformPtrs.insert(Ptr);

	// If the memory instruction will be vectorized and its pointer operand
	// is consecutive-like, or interleaving - the pointer operand should
	// remain uniform.
	else
	ConsecutiveLikePtrs.insert(Ptr);
	}

	// Add to the Worklist all consecutive and consecutive-like pointers that
	// aren't also identified as possibly non-uniform.
	for (auto *V : ConsecutiveLikePtrs)
	if (!PossibleNonUniformPtrs.count(V)) {
	DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
	Worklist.insert(V);
	}

	// Expand Worklist in topological order: whenever a new instruction
	// is added , its users should be either already inside Worklist, or
	// out of scope. It ensures a uniform instruction will only be used
	// by uniform instructions or out of scope instructions.
	unsigned idx = 0;
	while (idx != Worklist.size()) {
	Instruction *I = Worklist[idx++];

	for (auto OV : I->operand_values()) {
	if (isOutOfScope(OV))
	continue;
	auto *OI = cast<Instruction>(OV);
	if (llvm::all_of(OI->users(), [&](User *U) -> bool {
	auto *J = cast<Instruction>(U);
	return !TheLoop->contains(J) \|\| Worklist.count(J) \|\|
	(OI == getPointerOperand(J) && isUniformDecision(J, VF));
	})) {
	Worklist.insert(OI);
	DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
	}
	}
	}

	// Returns true if Ptr is the pointer operand of a memory access instruction
	// I, and I is known to not require scalarization.
	auto isVectorizedMemAccessUse = [&](Instruction I, Value Ptr) -> bool {
	return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
	};

	// For an instruction to be added into Worklist above, all its users inside
	// the loop should also be in Worklist. However, this condition cannot be
	// true for phi nodes that form a cyclic dependence. We must process phi
	// nodes separately. An induction variable will remain uniform if all users
	// of the induction variable and induction variable update remain uniform.
	// The code below handles both pointer and non-pointer induction variables.
	for (auto &Induction : *Legal->getInductionVars()) {
	auto *Ind = Induction.first;
	auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

	// Determine if all users of the induction variable are uniform after
	// vectorization.
	auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
	auto *I = cast<Instruction>(U);
	return I == IndUpdate \|\| !TheLoop->contains(I) \|\| Worklist.count(I) \|\|
	isVectorizedMemAccessUse(I, Ind);
	});
	if (!UniformInd)
	continue;

	// Determine if all users of the induction variable update instruction are
	// uniform after vectorization.
	auto UniformIndUpdate =
	llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
	auto *I = cast<Instruction>(U);
	return I == Ind \|\| !TheLoop->contains(I) \|\| Worklist.count(I) \|\|
	isVectorizedMemAccessUse(I, IndUpdate);
	});
	if (!UniformIndUpdate)
	continue;

	// The induction variable and its update instruction will remain uniform.
	Worklist.insert(Ind);
	Worklist.insert(IndUpdate);
	DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
	DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
	}

	Uniforms[VF].insert(Worklist.begin(), Worklist.end());
	}

	/// Obtain the loop-access analysis for TheLoop and decide whether the loop's
	/// memory accesses permit vectorization.
	///
	/// Stores the analysis result in LAI and hands it to InterleaveInfo. Any
	/// report attached to the analysis is forwarded through ORE. On success the
	/// number of runtime pointer checks is added to Requirements and the
	/// analysis' union predicate is added to PSE.
	///
	/// \returns false if the analysis says memory cannot be vectorized or if the
	///          loop stores to a loop-invariant address; true otherwise.
	bool LoopVectorizationLegality::canVectorizeMemory() {
	  // NOTE(review): GetLAA's declaration is outside this view; it is presumed to
	  // be a pointer to a callable taking the loop by reference - confirm.
	  LAI = &(*GetLAA)(*TheLoop);
	  InterleaveInfo.setLAI(LAI);
	  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
	  if (LAR) {
	    ORE->emit([&]() {
	      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
	                                        "loop not vectorized: ", *LAR);
	    });
	  }
	  if (!LAI->canVectorizeMemory())
	    return false;

	  if (LAI->hasStoreToLoopInvariantAddress()) {
	    ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
	              << "write to a loop invariant address could not be vectorized");
	    DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
	    return false;
	  }

	  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
	  PSE.addPredicate(LAI->getPSE().getUnionPredicate());

	  return true;
	}

	bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
	Value In0 = const_cast<Value >(V);
	PHINode *PN = dyn_cast_or_null<PHINode>(In0);
	if (!PN)
	return false;

	return Inductions.count(PN);
	}

	/// Report whether \p V is an instruction recorded in InductionCastsToIgnore.
	/// Values that are not instructions yield false.
	bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
	  if (auto *Candidate = dyn_cast<Instruction>(V))
	    return InductionCastsToIgnore.count(Candidate) != 0;
	  return false;
	}

	/// Report whether \p V is either a recorded induction PHI or a recorded
	/// induction cast (see isInductionPhi and isCastedInductionVariable).
	bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
	  return isInductionPhi(V) || isCastedInductionVariable(V);
	}

	/// Report whether \p Phi was recorded in FirstOrderRecurrences.
	bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
	  return FirstOrderRecurrences.count(Phi) != 0;
	}

	// Reports whether BB needs predication. This is a thin wrapper that forwards
	// to the static LoopAccessInfo::blockNeedsPredication, supplying this
	// object's TheLoop and DT members alongside BB.
	bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
	return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
	}

	bool LoopVectorizationLegality::blockCanBePredicated(
	BasicBlock BB, SmallPtrSetImpl<Value > &SafePtrs) {
	const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

	for (Instruction &I : *BB) {
	// Check that we don't have a constant expression that can trap as operand.
	for (Value *Operand : I.operands()) {
	if (auto *C = dyn_cast<Constant>(Operand))
	if (C->canTrap())
	return false;
	}
	// We might be able to hoist the load.
	if (I.mayReadFromMemory()) {
	auto *LI = dyn_cast<LoadInst>(&I);
	if (!LI)
	return false;
	if (!SafePtrs.count(LI->getPointerOperand())) {
	if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) \|\|
	isLegalMaskedGather(LI->getType())) {
	MaskedOp.insert(LI);
	continue;
	}
	// !llvm.mem.parallel_loop_access implies if-conversion safety.
	if (IsAnnotatedParallel)
	continue;
	return false;
	}
	}

	if (I.mayWriteToMemory()) {
	auto *SI = dyn_cast<StoreInst>(&I);
	// We only support predication of stores in basic blocks with one
	// predecessor.
	if (!SI)
	return false;

	// Build a masked store if it is legal for the target.
	if (isLegalMaskedStore(SI->getValueOperand()->getType(),
	SI->getPointerOperand()) \|\|
	isLegalMaskedScatter(SI->getValueOperand()->getType())) {
	MaskedOp.insert(SI);
	continue;
	}

	bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
	bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();

	if (++NumPredStores > NumberOfStoresToPredicate \|\| !isSafePtr \|\|
	!isSinglePredecessor)
	return false;
	}
	if (I.mayThrow())
	return false;
	}

	return true;
	}

	/// Populate \p AccessStrideInfo with a StrideDescriptor (stride, SCEV, size
	/// and alignment) for every load and store in TheLoop.
	///
	/// \param AccessStrideInfo output map, filled in program order (blocks are
	///                         visited in reverse postorder).
	/// \param Strides          symbolic strides used when computing each
	///                         pointer's stride and SCEV.
	void InterleavedAccessInfo::collectConstStrideAccesses(
	    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
	    const ValueToValueMap &Strides) {
	  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();

	  // Since it's desired that the load/store instructions be maintained in
	  // "program order" for the interleaved access analysis, we have to visit the
	  // blocks in the loop in reverse postorder (i.e., in a topological order).
	  // Such an ordering will ensure that any load/store that may be executed
	  // before a second load/store will precede the second load/store in
	  // AccessStrideInfo.
	  LoopBlocksDFS DFS(TheLoop);
	  DFS.perform(LI);
	  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
	    for (auto &I : *BB) {
	      // Note: these locals shadow the LI member used for the DFS above.
	      auto *LI = dyn_cast<LoadInst>(&I);
	      auto *SI = dyn_cast<StoreInst>(&I);
	      if (!LI && !SI)
	        continue;

	      Value *Ptr = getPointerOperand(&I);
	      // We don't check wrapping here because we don't know yet if Ptr will be
	      // part of a full group or a group with gaps. Checking wrapping for all
	      // pointers (even those that end up in groups with no gaps) will be overly
	      // conservative. For full groups, wrapping should be ok since if we would
	      // wrap around the address space we would do a memory access at nullptr
	      // even without the transformation. The wrapping checks are therefore
	      // deferred until after we've formed the interleaved groups.
	      int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
	                                    /*Assume=*/true, /*ShouldCheckWrap=*/false);

	      const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
	      PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
	      uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());

	      // An alignment of 0 means target ABI alignment.
	      unsigned Align = getMemInstAlignment(&I);
	      if (!Align)
	        Align = DL.getABITypeAlignment(PtrTy->getElementType());

	      AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
	    }
	}

	// Analyze interleaved accesses and collect them into interleaved load and
	// store groups.
	//
	// When generating code for an interleaved load group, we effectively hoist all
	// loads in the group to the location of the first load in program order. When
	// generating code for an interleaved store group, we sink all stores to the
	// location of the last store. This code motion can change the order of load
	// and store instructions and may break dependences.
	//
	// The code generation strategy mentioned above ensures that we won't violate
	// any write-after-read (WAR) dependences.
	//
	// E.g., for the WAR dependence:  a = A[i];                      // (1)
	//                                A[i] = b;                      // (2)
	//
	// The store group of (2) is always inserted at or below (2), and the load
	// group of (1) is always inserted at or above (1). Thus, the instructions will
	// never be reordered. All other dependences are checked to ensure the
	// correctness of the instruction reordering.
	//
	// The algorithm visits all memory accesses in the loop in bottom-up program
	// order. Program order is established by traversing the blocks in the loop in
	// reverse postorder when collecting the accesses.
	//
	// We visit the memory accesses in bottom-up order because it can simplify the
	// construction of store groups in the presence of write-after-write (WAW)
	// dependences.
	//
	// E.g., for the WAW dependence:  A[i] = a;      // (1)
	//                                A[i] = b;      // (2)
	//                                A[i + 1] = c;  // (3)
	//
	// We will first create a store group with (3) and (2). (1) can't be added to
	// this group because it and (2) are dependent. However, (1) can be grouped
	// with other accesses that may precede it in program order. Note that a
	// bottom-up order does not imply that WAW dependences should not be checked.
	void InterleavedAccessInfo::analyzeInterleaving(
	const ValueToValueMap &Strides) {
	DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");

	// Holds all accesses with a constant stride.
	MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
	collectConstStrideAccesses(AccessStrideInfo, Strides);

	if (AccessStrideInfo.empty())
	return;

	// Collect the dependences in the loop.
	collectDependences();

	// Holds all interleaved store groups temporarily.
	SmallSetVector<InterleaveGroup *, 4> StoreGroups;
	// Holds all interleaved load groups temporarily.
	SmallSetVector<InterleaveGroup *, 4> LoadGroups;

	// Search in bottom-up program order for pairs of accesses (A and B) that can
	// form interleaved load or store groups. In the algorithm below, access A
	// precedes access B in program order. We initialize a group for B in the
	// outer loop of the algorithm, and then in the inner loop, we attempt to
	// insert each A into B's group if:
	//
	// 1. A and B have the same stride,
	// 2. A and B have the same memory object size, and
	// 3. A belongs in B's group according to its distance from B.
	//
	// Special care is taken to ensure group formation will not break any
	// dependences.
	for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
	BI != E; ++BI) {
	Instruction *B = BI->first;
	StrideDescriptor DesB = BI->second;

	// Initialize a group for B if it has an allowable stride. Even if we don't
	// create a group for B, we continue with the bottom-up algorithm to ensure
	// we don't break any of B's dependences.
	InterleaveGroup *Group = nullptr;
	if (isStrided(DesB.Stride)) {
	Group = getInterleaveGroup(B);
	if (!Group) {
	DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
	Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
	}
	if (B->mayWriteToMemory())
	StoreGroups.insert(Group);
	else
	LoadGroups.insert(Group);
	}

	for (auto AI = std::next(BI); AI != E; ++AI) {
	Instruction *A = AI->first;
	StrideDescriptor DesA = AI->second;

	// Our code motion strategy implies that we can't have dependences
	// between accesses in an interleaved group and other accesses located
	// between the first and last member of the group. Note that this also
	// means that a group can't have more than one member at a given offset.
	// The accesses in a group can have dependences with other accesses, but
	// we must ensure we don't extend the boundaries of the group such that
	// we encompass those dependent accesses.
	//
	// For example, assume we have the sequence of accesses shown below in a
	// stride-2 loop:
	//
	// (1, 2) is a group \| A[i] = a; // (1)
	// \| A[i-1] = b; // (2) \|
	// A[i-3] = c; // (3)
	// A[i] = d; // (4) \| (2, 4) is not a group
	//
	// Because accesses (2) and (3) are dependent, we can group (2) with (1)
	// but not with (4). If we did, the dependent access (3) would be within
	// the boundaries of the (2, 4) group.
	if (!canReorderMemAccessesForInterleavedGroups(&AI, &BI)) {
	// If a dependence exists and A is already in a group, we know that A
	// must be a store since A precedes B and WAR dependences are allowed.
	// Thus, A would be sunk below B. We release A's group to prevent this
	// illegal code motion. A will then be free to form another group with
	// instructions that precede it.
	if (isInterleaved(A)) {
	InterleaveGroup *StoreGroup = getInterleaveGroup(A);
	StoreGroups.remove(StoreGroup);
	releaseGroup(StoreGroup);
	}

	// If a dependence exists and A is not already in a group (or it was
	// and we just released it), B might be hoisted above A (if B is a
	// load) or another store might be sunk below A (if B is a store). In
	// either case, we can't add additional instructions to B's group. B
	// will only form a group with instructions that it precedes.
	break;
	}

	// At this point, we've checked for illegal code motion. If either A or B
	// isn't strided, there's nothing left to do.
	if (!isStrided(DesA.Stride) \|\| !isStrided(DesB.Stride))
	continue;

	// Ignore A if it's already in a group or isn't the same kind of memory
	// operation as B.
	if (isInterleaved(A) \|\| A->mayReadFromMemory() != B->mayReadFromMemory())
	continue;

	// Check rules 1 and 2. Ignore A if its stride or size is different from
	// that of B.
	if (DesA.Stride != DesB.Stride \|\| DesA.Size != DesB.Size)
	continue;

	// Ignore A if the memory object of A and B don't belong to the same
	// address space
	if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
	continue;

	// Calculate the distance from A to B.
	const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
	PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
	if (!DistToB)
	continue;
	int64_t DistanceToB = DistToB->getAPInt().getSExtValue();

	// Check rule 3. Ignore A if its distance to B is not a multiple of the
	// size.
	if (DistanceToB % static_cast<int64_t>(DesB.Size))
	continue;

	// Ignore A if either A or B is in a predicated block. Although we
	// currently prevent group formation for predicated accesses, we may be
	// able to relax this limitation in the future once we handle more
	// complicated blocks.
	if (isPredicated(A->getParent()) \|\| isPredicated(B->getParent()))
	continue;

	// The index of A is the index of B plus A's distance to B in multiples
	// of the size.
	int IndexA =
	Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

	// Try to insert A into B's group.
	if (Group->insertMember(A, IndexA, DesA.Align)) {
	DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
	<< " into the interleave group with" << *B << '\n');
	InterleaveGroupMap[A] = Group;

	// Set the first load in program order as the insert position.
	if (A->mayReadFromMemory())
	Group->setInsertPos(A);
	}
	} // Iteration over A accesses.
	} // Iteration over B accesses.

	// Remove interleaved store groups with gaps.
	for (InterleaveGroup *Group : StoreGroups)
	if (Group->getNumMembers() != Group->getFactor()) {
	DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
	"to gaps.\n");
	releaseGroup(Group);
	}
	// Remove interleaved groups with gaps (currently only loads) whose memory
	// accesses may wrap around. We have to revisit the getPtrStride analysis,
	// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
	// not check wrapping (see documentation there).
	// FORNOW we use Assume=false;
	// TODO: Change to Assume=true but making sure we don't exceed the threshold
	// of runtime SCEV assumptions checks (thereby potentially failing to
	// vectorize altogether).
	// Additional optional optimizations:
	// TODO: If we are peeling the loop and we know that the first pointer doesn't
	// wrap then we can deduce that all pointers in the group don't wrap.
	// This means that we can forcefully peel the loop in order to only have to
	// check the first pointer for no-wrap. When we'll change to use Assume=true
	// we'll only need at most one runtime check per interleaved group.
	for (InterleaveGroup *Group : LoadGroups) {
	// Case 1: A full group. Can Skip the checks; For full groups, if the wide
	// load would wrap around the address space we would do a memory access at
	// nullptr even without the transformation.
	if (Group->getNumMembers() == Group->getFactor())
	continue;

	// Case 2: If first and last members of the group don't wrap this implies
	// that all the pointers in the group don't wrap.
	// So we check only group member 0 (which is always guaranteed to exist),
	// and group member Factor - 1; If the latter doesn't exist we rely on
	// peeling (if it is a non-reveresed accsess -- see Case 3).
	Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
	if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /Assume=/false,
	/ShouldCheckWrap=/true)) {
	DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
	"first group member potentially pointer-wrapping.\n");
	releaseGroup(Group);
	continue;
	}
	Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
	if (LastMember) {
	Value *LastMemberPtr = getPointerOperand(LastMember);
	if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /Assume=/false,
	/ShouldCheckWrap=/true)) {
	DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
	"last group member potentially pointer-wrapping.\n");
	releaseGroup(Group);
	}
	} else {
	// Case 3: A non-reversed interleaved load group with gaps: We need
	// to execute at least one scalar epilogue iteration. This will ensure
	// we don't speculatively access memory out-of-bounds. We only need
	// to look for a member at index factor - 1, since every group must have
	// a member at index zero.
	if (Group->isReverse()) {
	DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
	"a reverse access with gaps.\n");
	releaseGroup(Group);
	continue;
	}
	DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
	RequiresScalarEpilogue = true;
	}
	}
	}

	/// Determine the maximum vectorization factor for the loop.
	///
	/// \param OptForSize  When true, the loop is only vectorized if no runtime
	///                    pointer checks and no tail loop are needed.
	/// \returns The maximum feasible VF, or None if vectorization is abandoned.
	Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
	  // Conditionally executed stores are only vectorized when explicitly enabled.
	  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
	    ORE->emit(createMissedAnalysis("ConditionalStore")
	              << "store that is conditionally executed prevents vectorization");
	    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
	    return None;
	  }

	  const bool NeedsRuntimePtrChecks = Legal->getRuntimePointerChecking()->Need;

	  if (NeedsRuntimePtrChecks && TTI.hasBranchDivergence()) {
	    // TODO: It may be useful to do since it's still likely to be dynamically
	    // uniform if the target can skip.
	    DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");

	    ORE->emit(
	        createMissedAnalysis("CantVersionLoopWithDivergentTarget")
	        << "runtime pointer checks needed. Not enabled for divergent target");

	    return None;
	  }

	  const unsigned TripCount = PSE.getSE()->getSmallConstantTripCount(TheLoop);

	  // Everything past this point only concerns the optimize-for-size case,
	  // where a scalar remainder loop is not acceptable.
	  if (!OptForSize)
	    return computeFeasibleMaxVF(OptForSize, TripCount);

	  if (NeedsRuntimePtrChecks) {
	    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
	              << "runtime pointer checks needed. Enable vectorization of this "
	                 "loop with '#pragma clang loop vectorize(enable)' when "
	                 "compiling with -Os/-Oz");
	    DEBUG(dbgs()
	          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
	    return None;
	  }

	  // When optimizing for size we must not create a tail loop.
	  DEBUG(dbgs() << "LV: Found trip count: " << TripCount << '\n');

	  // Without a precise trip count (values below 2 are rejected here) we do
	  // not try to vectorize.
	  if (TripCount < 2) {
	    ORE->emit(
	        createMissedAnalysis("UnknownLoopCountComplexCFG")
	        << "unable to calculate the loop count due to complex control flow");
	    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
	    return None;
	  }

	  const unsigned FeasibleVF = computeFeasibleMaxVF(OptForSize, TripCount);

	  // The trip count divides evenly: no tail is needed.
	  if (TripCount % FeasibleVF == 0)
	    return FeasibleVF;

	  // A non-zero remainder means a tail loop would be required.
	  // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
	  // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
	  //        smaller MaxVF that does not require a scalar epilog.
	  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
	            << "cannot optimize for size and vectorize at the "
	               "same time. Enable vectorization of this loop "
	               "with '#pragma clang loop vectorize(enable)' "
	               "when compiling with -Os/-Oz");
	  DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
	  return None;
	}

	unsigned
	LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
	unsigned ConstTripCount) {
	MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
	unsigned SmallestType, WidestType;
	std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
	unsigned WidestRegister = TTI.getRegisterBitWidth(true);

	// Get the maximum safe dependence distance in bits computed by LAA.
	// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
	// the memory accesses that is most restrictive (involved in the smallest
	// dependence distance).
	unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

	WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

	unsigned MaxVectorSize = WidestRegister / WidestType;

	DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
	<< WidestType << " bits.\n");
	DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
	<< " bits.\n");

	assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
	" into one vector!");
	if (MaxVectorSize == 0) {
	DEBUG(dbgs() << "LV: The target has no vector registers.\n");
	MaxVectorSize = 1;
	return MaxVectorSize;
	} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
	isPowerOf2_32(ConstTripCount)) {
	// We need to clamp the VF to be the ConstTripCount. There is no point in
	// choosing a higher viable VF as done in the loop below.
	DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
	<< ConstTripCount << "\n");
	MaxVectorSize = ConstTripCount;
	return MaxVectorSize;
	}

	unsigned MaxVF = MaxVectorSize;
	if (MaximizeBandwidth && !OptForSize) {
	// Collect all viable vectorization factors larger than the default MaxVF
	// (i.e. MaxVectorSize).
	SmallVector<unsigned, 8> VFs;
	unsigned NewMaxVectorSize = WidestRegister / SmallestType;
	for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
	VFs.push_back(VS);

	// For each VF calculate its register usage.
	auto RUs = calculateRegisterUsage(VFs);

	// Select the largest VF which doesn't require more registers than existing
	// ones.
	unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
	for (int i = RUs.size() - 1; i >= 0; --i) {
	if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
	MaxVF = VFs[i];
	break;
	}
	}
	}
	return MaxVF;
	}

	/// Pick the power-of-two vectorization factor in [1, MaxVF] with the lowest
	/// expected per-lane cost.
	///
	/// \param MaxVF  Largest factor to consider.
	/// \returns The selected width together with its estimated total cost
	///          (width * per-lane cost, truncated to unsigned).
	LoopVectorizationCostModel::VectorizationFactor
	LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
	  // Start from the scalar loop as the candidate to beat.
	  float BestCost = expectedCost(1).first;
	#ifndef NDEBUG
	  const float ScalarCost = BestCost;
	#endif /* NDEBUG */
	  unsigned BestWidth = 1;
	  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

	  const bool ForceVectorization =
	      Hints->getForce() == LoopVectorizeHints::FK_Enabled;

	  // Ignore scalar width, because the user explicitly wants vectorization:
	  // seed the search with width 2 instead.
	  if (ForceVectorization && MaxVF > 1) {
	    BestWidth = 2;
	    BestCost = expectedCost(BestWidth).first / (float)BestWidth;
	  }

	  for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
	    // The vector loop runs fewer iterations, so normalize its cost by the
	    // number of lanes to make it comparable with the scalar cost.
	    VectorizationCostTy VFCost = expectedCost(VF);
	    float PerLaneCost = VFCost.first / (float)VF;
	    DEBUG(dbgs() << "LV: Vector loop of width " << VF
	                 << " costs: " << (int)PerLaneCost << ".\n");

	    if (!(VFCost.second || ForceVectorization)) {
	      DEBUG(
	          dbgs() << "LV: Not considering vector loop of width " << VF
	                 << " because it will not generate any vector instructions.\n");
	      continue;
	    }

	    if (PerLaneCost < BestCost) {
	      BestCost = PerLaneCost;
	      BestWidth = VF;
	    }
	  }

	  DEBUG(if (ForceVectorization && BestWidth > 1 && BestCost >= ScalarCost) dbgs()
	        << "LV: Vectorization seems to be not beneficial, "
	        << "but was forced by a user.\n");
	  DEBUG(dbgs() << "LV: Selecting VF: " << BestWidth << ".\n");
	  VectorizationFactor Selected = {BestWidth, (unsigned)(BestWidth * BestCost)};
	  return Selected;
	}

	/// Scan the loop's loads, stores and reduction PHIs for the narrowest and
	/// widest scalar type sizes.
	///
	/// \returns {smallest, widest} widths in bits. With no qualifying
	///          instruction the result is {-1U, 8}, the initial values.
	std::pair<unsigned, unsigned>
	LoopVectorizationCostModel::getSmallestAndWidestTypes() {
	  unsigned Narrowest = -1U;
	  unsigned Widest = 8;
	  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

	  for (BasicBlock *BB : TheLoop->blocks()) {
	    for (Instruction &I : *BB) {
	      // Skip ignored values.
	      if (ValuesToIgnore.count(&I))
	        continue;

	      // Only loads, stores and reduction PHIs contribute a type.
	      Type *Ty = I.getType();
	      if (auto *Phi = dyn_cast<PHINode>(&I)) {
	        // Non-reduction PHIs are skipped; reductions use the recurrence type.
	        if (!Legal->isReductionVariable(Phi))
	          continue;
	        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
	        Ty = RdxDesc.getRecurrenceType();
	      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
	        // For a store, the type of interest is that of the stored value.
	        Ty = Store->getValueOperand()->getType();
	      } else if (!isa<LoadInst>(I)) {
	        continue;
	      }

	      // Ignore loaded pointer types and stored pointer types that are not
	      // vectorizable.
	      //
	      // FIXME: The check here attempts to predict whether a load or store will
	      //        be vectorized. We only know this for certain after a VF has
	      //        been selected. Here, we assume that if an access can be
	      //        vectorized, it will be. We should also look at extending this
	      //        optimization to non-pointer types.
	      //
	      if (Ty->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
	          !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
	        continue;

	      const unsigned Bits = (unsigned)DL.getTypeSizeInBits(Ty->getScalarType());
	      Narrowest = std::min(Narrowest, Bits);
	      Widest = std::max(Widest, Bits);
	    }
	  }

	  return {Narrowest, Widest};
	}

	/// Choose how many copies of the (possibly vectorized) loop body to
	/// interleave.
	///
	/// \param OptForSize  When true, interleaving is disabled (returns 1).
	/// \param VF          The selected vectorization factor (1 means scalar).
	/// \param LoopCost    Estimated cost of one loop iteration at \p VF, or 0 if
	///                    not yet computed, in which case it is computed here.
	/// \returns The interleave count, always at least 1 on the paths that apply
	///          the clamp below; 1 means "do not interleave".
	unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
	                                                           unsigned VF,
	                                                           unsigned LoopCost) {
	  // -- The interleave heuristics --
	  // We interleave the loop in order to expose ILP and reduce the loop overhead.
	  // There are many micro-architectural considerations that we can't predict
	  // at this level. For example, frontend pressure (on decode or fetch) due to
	  // code size, or the number and capabilities of the execution ports.
	  //
	  // We use the following heuristics to select the interleave count:
	  // 1. If the code has reductions, then we interleave to break the cross
	  // iteration dependency.
	  // 2. If the loop is really small, then we interleave to reduce the loop
	  // overhead.
	  // 3. We don't interleave if we think that we will spill registers to memory
	  // due to the increased register pressure.

	  // When we optimize for size, we don't interleave.
	  if (OptForSize)
	    return 1;

	  // We used the distance for the interleave count.
	  if (Legal->getMaxSafeDepDistBytes() != -1U)
	    return 1;

	  // Do not interleave loops with a relatively small trip count.
	  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
	  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
	    return 1;

	  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
	  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
	               << " registers\n");

	  // Command-line overrides for the register count take precedence.
	  if (VF == 1) {
	    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
	      TargetNumRegisters = ForceTargetNumScalarRegs;
	  } else {
	    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
	      TargetNumRegisters = ForceTargetNumVectorRegs;
	  }

	  RegisterUsage R = calculateRegisterUsage({VF})[0];
	  // We divide by these constants so assume that we have at least one
	  // instruction that uses at least one register.
	  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
	  R.NumInstructions = std::max(R.NumInstructions, 1U);

	  // We calculate the interleave count using the following formula.
	  // Subtract the number of loop invariants from the number of available
	  // registers. These registers are used by all of the interleaved instances.
	  // Next, divide the remaining registers by the number of registers that is
	  // required by the loop, in order to estimate how many parallel instances
	  // fit without causing spills. All of this is rounded down if necessary to be
	  // a power of two. We want power of two interleave count to simplify any
	  // addressing operations or alignment considerations.
	  // NOTE(review): the subtraction is unsigned; if LoopInvariantRegs exceeds
	  // TargetNumRegisters it wraps and IC ends up clamped to the maximum below.
	  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
	                              R.MaxLocalUsers);

	  // Don't count the induction variable as interleaved.
	  if (EnableIndVarRegisterHeur)
	    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
	                       std::max(1U, (R.MaxLocalUsers - 1)));

	  // Clamp the interleave ranges to reasonable counts.
	  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

	  // Check if the user has overridden the max.
	  if (VF == 1) {
	    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
	      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
	  } else {
	    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
	      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
	  }

	  // If we did not calculate the cost for VF (because the user selected the VF)
	  // then we calculate the cost of VF here.
	  if (LoopCost == 0)
	    LoopCost = expectedCost(VF).first;

	  // Clamp the calculated IC to be between the 1 and the max interleave count
	  // that the target allows.
	  if (IC > MaxInterleaveCount)
	    IC = MaxInterleaveCount;
	  else if (IC < 1)
	    IC = 1;

	  // Interleave if we vectorized this loop and there is a reduction that could
	  // benefit from interleaving.
	  if (VF > 1 && !Legal->getReductionVars()->empty()) {
	    DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
	    return IC;
	  }

	  // Note that if we've already vectorized the loop we will have done the
	  // runtime check and so interleaving won't require further checks.
	  bool InterleavingRequiresRuntimePointerCheck =
	      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

	  // We want to interleave small loops in order to reduce the loop overhead and
	  // potentially expose ILP opportunities.
	  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
	  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
	    // The cost model may report a free loop body. There is then no overhead
	    // to amortize, and the division below would be a division by zero, so do
	    // not interleave.
	    if (LoopCost == 0)
	      return 1;

	    // We assume that the cost overhead is 1 and we use the cost model
	    // to estimate the cost of the loop and interleave until the cost of the
	    // loop overhead is about 5% of the cost of the loop.
	    unsigned SmallIC =
	        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

	    // Interleave until store/load ports (estimated by max interleave count) are
	    // saturated.
	    unsigned NumStores = Legal->getNumStores();
	    unsigned NumLoads = Legal->getNumLoads();
	    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
	    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

	    // If we have a scalar reduction (vector reductions are already dealt with
	    // by this point), we can increase the critical path length if the loop
	    // we're interleaving is inside another loop. Limit, by default to 2, so the
	    // critical path only gets increased by one reduction operation.
	    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
	      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
	      SmallIC = std::min(SmallIC, F);
	      StoresIC = std::min(StoresIC, F);
	      LoadsIC = std::min(LoadsIC, F);
	    }

	    if (EnableLoadStoreRuntimeInterleave &&
	        std::max(StoresIC, LoadsIC) > SmallIC) {
	      DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
	      return std::max(StoresIC, LoadsIC);
	    }

	    DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
	    return SmallIC;
	  }

	  // Interleave if this is a large loop (small loops are already dealt with by
	  // this point) that could benefit from interleaving.
	  bool HasReductions = !Legal->getReductionVars()->empty();
	  if (TTI.enableAggressiveInterleaving(HasReductions)) {
	    DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
	    return IC;
	  }

	  DEBUG(dbgs() << "LV: Not Interleaving.\n");
	  return 1;
	}

	/// Estimate register pressure of the loop for each requested VF.
	///
	/// \param VFs  Vectorization factors to evaluate.
	/// \returns One RegisterUsage per entry of \p VFs, in the same order, holding
	///          the instruction count, the loop-invariant register estimate and
	///          the maximum number of simultaneously live in-loop values.
	SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
	LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
	  // This function calculates the register usage by measuring the highest number
	  // of values that are alive at a single location. Obviously, this is a very
	  // rough estimation. We scan the loop in a topological order and
	  // assign a number to each instruction. We use RPO to ensure that defs are
	  // met before their users. We assume that each instruction that has in-loop
	  // users starts an interval. We record every time that an in-loop value is
	  // used, so we have a list of the first and last occurrences of each
	  // instruction. Next, we transpose this data structure into a multi map that
	  // holds the list of intervals that end at a specific location. This multi
	  // map allows us to perform a linear search. We scan the instructions linearly
	  // and record each time that a new interval starts, by placing it in a set.
	  // If we find this value in the multi-map then we remove it from the set.
	  // The max register usage is the maximum size of the set.
	  // We also search for instructions that are defined outside the loop, but are
	  // used inside the loop. We need this number separately from the max-interval
	  // usage number because when we unroll, loop-invariant values do not take
	  // more register.
	  LoopBlocksDFS DFS(TheLoop);
	  DFS.perform(LI);

	  RegisterUsage RU;
	  RU.NumInstructions = 0;

	  // Each 'key' in the map opens a new interval. The values
	  // of the map are the index of the 'last seen' usage of the
	  // instruction that is the key.
	  using IntervalMap = DenseMap<Instruction *, unsigned>;

	  // Maps an index (position in the RPO walk) to its instruction.
	  DenseMap<unsigned, Instruction *> IdxToInstr;
	  // Marks the end of each interval.
	  IntervalMap EndPoint;
	  // Saves the set of in-loop instructions that are used inside the loop.
	  SmallSet<Instruction *, 8> Ends;
	  // Saves the set of instructions that are used in the loop but are
	  // defined outside the loop. Non-instruction operands (arguments,
	  // constants) are skipped below and never recorded here.
	  SmallPtrSet<Value *, 8> LoopInvariants;

	  unsigned Index = 0;
	  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
	    RU.NumInstructions += BB->size();
	    for (Instruction &I : *BB) {
	      IdxToInstr[Index++] = &I;

	      // Save the end location of each USE.
	      for (Value *U : I.operands()) {
	        auto *Instr = dyn_cast<Instruction>(U);

	        // Ignore non-instruction values such as arguments, constants, etc.
	        if (!Instr)
	          continue;

	        // If this instruction is outside the loop then record it and continue.
	        if (!TheLoop->contains(Instr)) {
	          LoopInvariants.insert(Instr);
	          continue;
	        }

	        // Overwrite previous end points. Index was already advanced past the
	        // user, so the interval closes at the position following it.
	        EndPoint[Instr] = Index;
	        Ends.insert(Instr);
	      }
	    }
	  }

	  // Saves the list of intervals that end with the index in 'key'.
	  using InstrList = SmallVector<Instruction *, 2>;
	  DenseMap<unsigned, InstrList> TransposeEnds;

	  // Transpose the EndPoints to a list of values that end at each index.
	  for (auto &Interval : EndPoint)
	    TransposeEnds[Interval.second].push_back(Interval.first);

	  SmallSet<Instruction *, 8> OpenIntervals;

	  // Get the size of the widest register, limited by the maximum safe
	  // dependence distance (converted from bytes to bits) when one is known.
	  unsigned MaxSafeDepDist = -1U;
	  if (Legal->getMaxSafeDepDistBytes() != -1U)
	    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
	  unsigned WidestRegister =
	      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
	  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

	  SmallVector<RegisterUsage, 8> RUs(VFs.size());
	  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

	  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

	  // A lambda that gets the register usage for the given type and VF.
	  // Token types use no register; anything else uses at least one.
	  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
	    if (Ty->isTokenTy())
	      return 0U;
	    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
	    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
	  };

	  for (unsigned int i = 0; i < Index; ++i) {
	    Instruction *I = IdxToInstr[i];

	    // Remove all of the instructions that end at this location.
	    InstrList &List = TransposeEnds[i];
	    for (Instruction *ToRemove : List)
	      OpenIntervals.erase(ToRemove);

	    // Ignore instructions that are never used within the loop.
	    if (!Ends.count(I))
	      continue;

	    // Skip ignored values.
	    if (ValuesToIgnore.count(I))
	      continue;

	    // For each VF find the maximum usage of registers.
	    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
	      // In the scalar case every live value counts as one register.
	      if (VFs[j] == 1) {
	        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
	        continue;
	      }
	      collectUniformsAndScalars(VFs[j]);
	      // Count the number of live intervals.
	      unsigned RegUsage = 0;
	      for (auto Inst : OpenIntervals) {
	        // Skip ignored values for VF > 1.
	        if (VecValuesToIgnore.count(Inst) ||
	            isScalarAfterVectorization(Inst, VFs[j]))
	          continue;
	        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
	      }
	      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
	    }

	    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
	                 << OpenIntervals.size() << '\n');

	    // Add the current instruction to the list of open intervals.
	    OpenIntervals.insert(I);
	  }

	  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
	    // Loop-invariant values: one register each when scalar, otherwise sized
	    // by type and VF.
	    unsigned Invariant = 0;
	    if (VFs[i] == 1)
	      Invariant = LoopInvariants.size();
	    else {
	      for (auto Inst : LoopInvariants)
	        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
	    }

	    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
	    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
	    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
	    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');

	    RU.LoopInvariantRegs = Invariant;
	    RU.MaxLocalUsers = MaxUsages[i];
	    RUs[i] = RU;
	  }

	  return RUs;
	}

	/// Collect, for the given VF, the instructions in predicated blocks that
	/// are cheaper to scalarize than to if-convert, and remember which
	/// predicated blocks remain after vectorization.
	///
	/// \param VF  Vectorization factor being analyzed; values below 2 and
	///            factors that were already analyzed are ignored.
	void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
	  // If we aren't vectorizing the loop, or if we've already collected the
	  // instructions to scalarize, there's nothing to do. Collection may already
	  // have occurred if we have a user-selected VF and are now computing the
	  // expected cost for interleaving.
	  if (VF < 2 || InstsToScalarize.count(VF))
	    return;

	  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
	  // not profitable to scalarize any instructions, the presence of VF in the
	  // map will indicate that we've analyzed it already.
	  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

	  // Find all the instructions that are scalar with predication in the loop and
	  // determine if it would be better to not if-convert the blocks they are in.
	  // If so, we also record the instructions to scalarize.
	  for (BasicBlock *BB : TheLoop->blocks()) {
	    if (!Legal->blockNeedsPredication(BB))
	      continue;
	    for (Instruction &I : *BB)
	      if (Legal->isScalarWithPredication(&I)) {
	        // A non-negative discount means the scalar form is no more expensive
	        // than the vector form, so record the whole chain for scalarization.
	        ScalarCostsTy ScalarCosts;
	        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
	          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());

	        // Remember that BB will remain after vectorization.
	        PredicatedBBsAfterVectorization.insert(BB);
	      }
	  }
	}

	int LoopVectorizationCostModel::computePredInstDiscount(
	Instruction PredInst, DenseMap<Instruction , unsigned> &ScalarCosts,
	unsigned VF) {
	assert(!isUniformAfterVectorization(PredInst, VF) &&
	"Instruction marked uniform-after-vectorization will be predicated");

	// Initialize the discount to zero, meaning that the scalar version and the
	// vector version cost the same.
	int Discount = 0;

	// Holds instructions to analyze. The instructions we visit are mapped in
	// ScalarCosts. Those instructions are the ones that would be scalarized if
	// we find that the scalar version costs less.
	SmallVector<Instruction *, 8> Worklist;

	// Returns true if the given instruction can be scalarized.
	auto canBeScalarized = [&](Instruction *I) -> bool {
	// We only attempt to scalarize instructions forming a single-use chain
	// from the original predicated block that would otherwise be vectorized.
	// Although not strictly necessary, we give up on instructions we know will
	// already be scalar to avoid traversing chains that are unlikely to be
	// beneficial.
	if (!I->hasOneUse() \|\| PredInst->getParent() != I->getParent() \|\|
	isScalarAfterVectorization(I, VF))
	return false;

	// If the instruction is scalar with predication, it will be analyzed
	// separately. We ignore it within the context of PredInst.
	if (Legal->isScalarWithPredication(I))
	return false;

	// If any of the instruction's operands are uniform after vectorization,
	// the instruction cannot be scalarized. This prevents, for example, a
	// masked load from being scalarized.
	//
	// We assume we will only emit a value for lane zero of an instruction
	// marked uniform after vectorization, rather than VF identical values.
	// Thus, if we scalarize an instruction that uses a uniform, we would
	// create uses of values corresponding to the lanes we aren't emitting code
	// for. This behavior can be changed by allowing getScalarValue to clone
	// the lane zero values for uniforms rather than asserting.
	for (Use &U : I->operands())
	if (auto *J = dyn_cast<Instruction>(U.get()))
	if (isUniformAfterVectorization(J, VF))
	return false;

	// Otherwise, we can scalarize the instruction.
	return true;
	};

	// Returns true if an operand that cannot be scalarized must be extracted
	// from a vector. We will account for this scalarization overhead below. Note
	// that the non-void predicated instructions are placed in their own blocks,
	// and their return values are inserted into vectors. Thus, an extract would
	// still be required.
	auto needsExtract = [&](Instruction *I) -> bool {
	return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
	};

	// Compute the expected cost discount from scalarizing the entire expression
	// feeding the predicated instruction. We currently only consider expressions
	// that are single-use instruction chains.
	Worklist.push_back(PredInst);
	while (!Worklist.empty()) {
	Instruction *I = Worklist.pop_back_val();

	// If we've already analyzed the instruction, there's nothing to do.
	if (ScalarCosts.count(I))
	continue;

	// Compute the cost of the vector instruction. Note that this cost already
	// includes the scalarization overhead of the predicated instruction.
	unsigned VectorCost = getInstructionCost(I, VF).first;

	// Compute the cost of the scalarized instruction. This cost is the cost of
	// the instruction as if it wasn't if-converted and instead remained in the
	// predicated block. We will scale this cost by block probability after
	// computing the scalarization overhead.
	unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

	// Compute the scalarization overhead of needed insertelement instructions
	// and phi nodes.
	if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
	ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
	true, false);
	ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
	}

	// Compute the scalarization overhead of needed extractelement
	// instructions. For each of the instruction's operands, if the operand can
	// be scalarized, add it to the worklist; otherwise, account for the
	// overhead.
	for (Use &U : I->operands())
	if (auto *J = dyn_cast<Instruction>(U.get())) {
	assert(VectorType::isValidElementType(J->getType()) &&
	"Instruction has non-scalar type");
	if (canBeScalarized(J))
	Worklist.push_back(J);
	else if (needsExtract(J))
	ScalarCost += TTI.getScalarizationOverhead(
	ToVectorTy(J->getType(),VF), false, true);
	}

	// Scale the total scalar cost by block probability.
	ScalarCost /= getReciprocalPredBlockProb();

	// Compute the discount. A non-negative discount means the vector version
	// of the instruction costs more, and scalarizing would be beneficial.
	Discount += VectorCost - ScalarCost;
	ScalarCosts[I] = ScalarCost;
	}

	return Discount;
	}

	/// Estimate the cost of one iteration of the loop when vectorized with factor
	/// \p VF (VF == 1 yields the scalar cost).
	///
	/// \returns a pair whose first member is the summed instruction cost and whose
	///          second member is the OR of the per-instruction flags returned by
	///          getInstructionCost.
	LoopVectorizationCostModel::VectorizationCostTy
	LoopVectorizationCostModel::expectedCost(unsigned VF) {
	  VectorizationCostTy Cost;

	  // For each block.
	  for (BasicBlock *BB : TheLoop->blocks()) {
	    VectorizationCostTy BlockCost;

	    // For each instruction in the old loop.
	    for (Instruction &I : *BB) {
	      // Skip dbg intrinsics.
	      if (isa<DbgInfoIntrinsic>(I))
	        continue;

	      // Skip ignored values.
	      if (ValuesToIgnore.count(&I) ||
	          (VF > 1 && VecValuesToIgnore.count(&I)))
	        continue;

	      VectorizationCostTy C = getInstructionCost(&I, VF);

	      // Check if we should override the cost.
	      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
	        C.first = ForceTargetInstructionCost;

	      BlockCost.first += C.first;
	      BlockCost.second |= C.second;
	      DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
	                   << VF << " For instruction: " << I << '\n');
	    }

	    // If we are vectorizing a predicated block, it will have been
	    // if-converted. This means that the block's instructions (aside from
	    // stores and instructions that may divide by zero) will now be
	    // unconditionally executed. For the scalar case, we may not always execute
	    // the predicated block. Thus, scale the block's cost by the probability of
	    // executing it.
	    if (VF == 1 && Legal->blockNeedsPredication(BB))
	      BlockCost.first /= getReciprocalPredBlockProb();

	    Cost.first += BlockCost.first;
	    Cost.second |= BlockCost.second;
	  }

	  return Cost;
	}

	/// \brief Returns the SCEV of the address \p Ptr when it is a GEP whose every
	/// index is either invariant in \p TheLoop or an induction variable; returns
	/// nullptr otherwise (including when \p Ptr is not a GEP at all).
	///
	/// The resulting SCEV can be handed to the target so that it can estimate the
	/// cost of the address computation.
	static const SCEV *getAddressAccessSCEV(
	    Value *Ptr,
	    LoopVectorizationLegality *Legal,
	    PredicatedScalarEvolution &PSE,
	    const Loop *TheLoop) {

	  auto *AddrGep = dyn_cast<GetElementPtrInst>(Ptr);
	  if (AddrGep == nullptr)
	    return nullptr;

	  // Walk the index operands (operand 0 is the base pointer) and bail out on
	  // the first one that is neither loop invariant nor an induction variable.
	  auto SE = PSE.getSE();
	  for (unsigned Idx = 1, End = AddrGep->getNumOperands(); Idx < End; ++Idx) {
	    Value *Index = AddrGep->getOperand(Idx);
	    if (SE->isLoopInvariant(SE->getSCEV(Index), TheLoop))
	      continue;
	    if (Legal->isInductionVariable(Index))
	      continue;
	    return nullptr;
	  }

	  // Every index qualified, e.g. GEP ptr, %inv, %ind, %inv: hand back the SCEV
	  // of the pointer itself.
	  return PSE.getSCEV(Ptr);
	}

	static bool isStrideMul(Instruction I, LoopVectorizationLegality Legal) {
	return Legal->hasStride(I->getOperand(0)) \|\|
	Legal->hasStride(I->getOperand(1));
	}

	/// Cost of executing the memory instruction \p I as \p VF scalar accesses:
	/// VF address computations, VF scalar memory operations and the
	/// insert/extract overhead of scalarization. The sum is divided by the
	/// reciprocal predicated-block probability when \p I is scalar with
	/// predication.
	unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
	                                                                 unsigned VF) {
	  Value *Address = getPointerOperand(I);
	  Type *WideAddrTy = ToVectorTy(Address->getType(), VF);

	  // The address SCEV (possibly null) lets the target recognise strided
	  // accesses whose stride is known at compile time.
	  const SCEV *AddrSCEV = getAddressAccessSCEV(Address, Legal, PSE, TheLoop);

	  // One address computation per lane.
	  unsigned AddrCost =
	      VF * TTI.getAddressComputationCost(WideAddrTy, PSE.getSE(), AddrSCEV);

	  // One scalar load/store per lane.
	  Type *ElemTy = getMemInstValueType(I)->getScalarType();
	  unsigned AccessCost =
	      VF * TTI.getMemoryOpCost(I->getOpcode(), ElemTy, getMemInstAlignment(I),
	                               getMemInstAddressSpace(I), I);

	  // Plus the extractelement/insertelement instructions scalarization may
	  // introduce.
	  unsigned Total = AddrCost + AccessCost + getScalarizationOverhead(I, VF, TTI);

	  // A predicated access may not run for every lane; weight the cost by the
	  // probability of executing the predicated block.
	  if (!Legal->isScalarWithPredication(I))
	    return Total;
	  return Total / getReciprocalPredBlockProb();
	}

	/// Cost of widening the consecutive memory instruction \p I to a single vector
	/// access of \p VF elements. A masked access is costed when
	/// Legal->isMaskRequired(I) holds, and a reverse shuffle is added when the
	/// stride is -1.
	unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
	                                                             unsigned VF) {
	  Type *ValTy = getMemInstValueType(I);
	  Type *VectorTy = ToVectorTy(ValTy, VF);
	  unsigned Alignment = getMemInstAlignment(I);
	  Value *Ptr = getPointerOperand(I);
	  unsigned AS = getMemInstAddressSpace(I);
	  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

	  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
	         "Stride should be 1 or -1 for consecutive memory access");
	  unsigned Cost = 0;
	  if (Legal->isMaskRequired(I))
	    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
	  else
	    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

	  // A negative stride requires reversing the vector after/before the access.
	  bool Reverse = ConsecutiveStride < 0;
	  if (Reverse)
	    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
	  return Cost;
	}

	/// Cost of a load \p I from a uniform address at factor \p VF: one scalar
	/// address computation, one scalar load, and a broadcast of the loaded value
	/// into a vector. \p I must be a LoadInst.
	unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
	                                                         unsigned VF) {
	  auto *Load = cast<LoadInst>(I);
	  Type *ScalarTy = Load->getType();

	  unsigned Total = TTI.getAddressComputationCost(ScalarTy);
	  Total += TTI.getMemoryOpCost(Instruction::Load, ScalarTy,
	                               Load->getAlignment(),
	                               Load->getPointerAddressSpace());
	  Total += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
	                              ToVectorTy(ScalarTy, VF));
	  return Total;
	}

	/// Cost of implementing the memory instruction \p I as a gather (load) or
	/// scatter (store) of \p VF elements: the vector address computation plus the
	/// target's gather/scatter operation cost.
	unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
	                                                          unsigned VF) {
	  Type *WideTy = ToVectorTy(getMemInstValueType(I), VF);
	  unsigned Align = getMemInstAlignment(I);
	  Value *Address = getPointerOperand(I);

	  unsigned Total = TTI.getAddressComputationCost(WideTy);
	  Total += TTI.getGatherScatterOpCost(I->getOpcode(), WideTy, Address,
	                                      Legal->isMaskRequired(I), Align);
	  return Total;
	}

	/// Cost of vectorizing the whole interleaved access group that \p I belongs to
	/// at factor \p VF. The group is costed as one wide access of
	/// VF * InterleaveFactor elements; reversed groups additionally pay one
	/// reverse shuffle per member.
	unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
	                                                            unsigned VF) {
	  Type *ValTy = getMemInstValueType(I);
	  Type *VectorTy = ToVectorTy(ValTy, VF);
	  unsigned AS = getMemInstAddressSpace(I);

	  auto Group = Legal->getInterleavedAccessGroup(I);
	  assert(Group && "Fail to get an interleaved access group.");

	  unsigned InterleaveFactor = Group->getFactor();
	  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

	  // Holds the indices of existing members in an interleaved load group.
	  // An interleaved store group doesn't need this as it doesn't allow gaps.
	  SmallVector<unsigned, 4> Indices;
	  if (isa<LoadInst>(I)) {
	    for (unsigned i = 0; i < InterleaveFactor; i++)
	      if (Group->getMember(i))
	        Indices.push_back(i);
	  }

	  // Calculate the cost of the whole interleaved group.
	  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
	                                                 Group->getFactor(), Indices,
	                                                 Group->getAlignment(), AS);

	  if (Group->isReverse())
	    Cost += Group->getNumMembers() *
	            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
	  return Cost;
	}

	/// Cost of the memory instruction \p I at factor \p VF. For VF > 1 the value
	/// recorded by the widening decision is returned via getWideningCost; only
	/// the scalar (VF == 1) cost is computed here.
	unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
	                                                              unsigned VF) {
	  // Vector costs are expected to have been computed already.
	  if (VF != 1)
	    return getWideningCost(I, VF);

	  // Scalar case: address computation plus the plain memory operation.
	  Type *AccessTy = getMemInstValueType(I);
	  unsigned Align = getMemInstAlignment(I);
	  unsigned AddrSpace = getMemInstAddressSpace(I);

	  return TTI.getAddressComputationCost(AccessTy) +
	         TTI.getMemoryOpCost(I->getOpcode(), AccessTy, Align, AddrSpace, I);
	}

	/// Cost of \p I at vectorization factor \p VF, paired with a flag that is true
	/// when the instruction's type is a vector that the target splits into fewer
	/// than VF parts.
	LoopVectorizationCostModel::VectorizationCostTy
	LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
	  // An instruction that stays uniform is costed as its scalar version.
	  if (isUniformAfterVectorization(I, VF))
	    VF = 1;

	  if (VF > 1) {
	    // Use the cost recorded when the instruction was chosen for scalarization.
	    if (isProfitableToScalarize(I, VF))
	      return VectorizationCostTy(InstsToScalarize[VF][I], false);

	    // Forced scalars do not have any scalarization overhead: VF copies of the
	    // scalar instruction.
	    auto Forced = ForcedScalars.find(VF);
	    if (Forced != ForcedScalars.end() && Forced->second.count(I))
	      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
	  }

	  Type *VectorTy;
	  unsigned C = getInstructionCost(I, VF, VectorTy);

	  const bool StaysVector = VF > 1 && VectorTy->isVectorTy() &&
	                           TTI.getNumberOfParts(VectorTy) < VF;
	  return VectorizationCostTy(C, StaysVector);
	}

	/// For every memory instruction in the loop, choose how it is handled at
	/// vectorization factor \p VF (scalarize, widen, widen-reverse, interleave or
	/// gather/scatter) and record the decision with its cost through
	/// setWideningDecision. Nothing is done for VF == 1.
	///
	/// Unless the target prefers vectorized addressing, instructions that compute
	/// addresses are afterwards forced to remain scalar: address loads are
	/// re-decided as CM_Scalarize and other address instructions are added to
	/// ForcedScalars[VF].
	void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
	  if (VF == 1)
	    return;
	  for (BasicBlock *BB : TheLoop->blocks()) {
	    // For each instruction in the old loop.
	    for (Instruction &I : *BB) {
	      Value *Ptr = getPointerOperand(&I);
	      if (!Ptr)
	        continue;

	      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
	        // Scalar load + broadcast
	        unsigned Cost = getUniformMemOpCost(&I, VF);
	        setWideningDecision(&I, VF, CM_Scalarize, Cost);
	        continue;
	      }

	      // We assume that widening is the best solution when possible.
	      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
	        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
	        int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
	        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
	               "Expected consecutive stride.");
	        InstWidening Decision =
	            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
	        setWideningDecision(&I, VF, Decision, Cost);
	        continue;
	      }

	      // Choose between Interleaving, Gather/Scatter or Scalarization.
	      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
	      unsigned NumAccesses = 1;
	      if (Legal->isAccessInterleaved(&I)) {
	        auto Group = Legal->getInterleavedAccessGroup(&I);
	        assert(Group && "Fail to get an interleaved access group.");

	        // Make one decision for the whole group.
	        if (getWideningDecision(&I, VF) != CM_Unknown)
	          continue;

	        NumAccesses = Group->getNumMembers();
	        InterleaveCost = getInterleaveGroupCost(&I, VF);
	      }

	      unsigned GatherScatterCost =
	          Legal->isLegalGatherOrScatter(&I)
	              ? getGatherScatterCost(&I, VF) * NumAccesses
	              : std::numeric_limits<unsigned>::max();

	      unsigned ScalarizationCost =
	          getMemInstScalarizationCost(&I, VF) * NumAccesses;

	      // Choose better solution for the current VF,
	      // write down this decision and use it during vectorization.
	      unsigned Cost;
	      InstWidening Decision;
	      if (InterleaveCost <= GatherScatterCost &&
	          InterleaveCost < ScalarizationCost) {
	        Decision = CM_Interleave;
	        Cost = InterleaveCost;
	      } else if (GatherScatterCost < ScalarizationCost) {
	        Decision = CM_GatherScatter;
	        Cost = GatherScatterCost;
	      } else {
	        Decision = CM_Scalarize;
	        Cost = ScalarizationCost;
	      }
	      // If the instructions belongs to an interleave group, the whole group
	      // receives the same decision. The whole group receives the cost, but
	      // the cost will actually be assigned to one instruction.
	      if (auto Group = Legal->getInterleavedAccessGroup(&I))
	        setWideningDecision(Group, VF, Decision, Cost);
	      else
	        setWideningDecision(&I, VF, Decision, Cost);
	    }
	  }

	  // Make sure that any load of address and any other address computation
	  // remains scalar unless there is gather/scatter support. This avoids
	  // inevitable extracts into address registers, and also has the benefit of
	  // activating LSR more, since that pass can't optimize vectorized
	  // addresses.
	  if (TTI.prefersVectorizedAddressing())
	    return;

	  // Start with all scalar pointer uses.
	  SmallPtrSet<Instruction *, 8> AddrDefs;
	  for (BasicBlock *BB : TheLoop->blocks())
	    for (Instruction &I : *BB) {
	      Instruction *PtrDef =
	          dyn_cast_or_null<Instruction>(getPointerOperand(&I));
	      if (PtrDef && TheLoop->contains(PtrDef) &&
	          getWideningDecision(&I, VF) != CM_GatherScatter)
	        AddrDefs.insert(PtrDef);
	    }

	  // Add all instructions used to generate the addresses. Only operands defined
	  // in the same block are followed, and PHI nodes are not traversed.
	  SmallVector<Instruction *, 4> Worklist;
	  for (auto *I : AddrDefs)
	    Worklist.push_back(I);
	  while (!Worklist.empty()) {
	    Instruction *I = Worklist.pop_back_val();
	    for (auto &Op : I->operands())
	      if (auto *InstOp = dyn_cast<Instruction>(Op))
	        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
	            AddrDefs.insert(InstOp).second)
	          Worklist.push_back(InstOp);
	  }

	  for (auto *I : AddrDefs) {
	    if (isa<LoadInst>(I)) {
	      // Setting the desired widening decision should ideally be handled in
	      // by cost functions, but since this involves the task of finding out
	      // if the loaded register is involved in an address computation, it is
	      // instead changed here when we know this is the case.
	      InstWidening Decision = getWideningDecision(I, VF);
	      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
	        // Scalarize a widened load of address.
	        setWideningDecision(I, VF, CM_Scalarize,
	                            (VF * getMemoryInstructionCost(I, 1)));
	      else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
	        // Scalarize an interleave group of address loads. The member index is
	        // deliberately not named I so that it does not shadow the instruction
	        // being processed.
	        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
	          if (Instruction *Member = Group->getMember(Idx))
	            setWideningDecision(Member, VF, CM_Scalarize,
	                                (VF * getMemoryInstructionCost(Member, 1)));
	        }
	      }
	    } else
	      // Make sure I gets scalarized and a cost estimate without
	      // scalarization overhead.
	      ForcedScalars[VF].insert(I);
	  }
	}

	/// Estimate the cost of \p I at vectorization factor \p VF, dispatching on the
	/// instruction's opcode.
	///
	/// \param I        the instruction to cost.
	/// \param VF       the vectorization factor.
	/// \param VectorTy output: the type the cost was computed for. It starts as
	///                 the (possibly bit-width-minimized) result type of \p I,
	///                 widened to VF elements unless \p I is scalar after
	///                 vectorization; compare, memory and cast cases overwrite it.
	/// \returns the estimated cost.
	unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
	                                                        unsigned VF,
	                                                        Type *&VectorTy) {
	  Type *RetTy = I->getType();
	  if (canTruncateToMinimalBitwidth(I, VF))
	    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
	  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
	  auto SE = PSE.getSE();

	  // TODO: We need to estimate the cost of intrinsic calls.
	  switch (I->getOpcode()) {
	  case Instruction::GetElementPtr:
	    // We mark this instruction as zero-cost because the cost of GEPs in
	    // vectorized code depends on whether the corresponding memory instruction
	    // is scalarized or not. Therefore, we handle GEPs with the memory
	    // instruction cost.
	    return 0;
	  case Instruction::Br: {
	    // In cases of scalarized and predicated instructions, there will be VF
	    // predicated blocks in the vectorized loop. Each branch around these
	    // blocks requires also an extract of its vector compare i1 element.
	    bool ScalarPredicatedBB = false;
	    BranchInst *BI = cast<BranchInst>(I);
	    if (VF > 1 && BI->isConditional() &&
	        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
	         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
	      ScalarPredicatedBB = true;

	    if (ScalarPredicatedBB) {
	      // Return cost for branches around scalarized and predicated blocks.
	      Type *Vec_i1Ty =
	          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
	      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
	              (TTI.getCFInstrCost(Instruction::Br) * VF));
	    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
	      // The back-edge branch will remain, as will all scalar branches.
	      return TTI.getCFInstrCost(Instruction::Br);
	    else
	      // This branch will be eliminated by if-conversion.
	      return 0;
	    // Note: We currently assume zero cost for an unconditional branch inside
	    // a predicated block since it will become a fall-through, although we
	    // may decide in the future to call TTI for all branches.
	  }
	  case Instruction::PHI: {
	    auto *Phi = cast<PHINode>(I);

	    // First-order recurrences are replaced by vector shuffles inside the loop.
	    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
	      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
	                                VectorTy, VF - 1, VectorTy);

	    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
	    // converted into select instructions. We require N - 1 selects per phi
	    // node, where N is the number of incoming values.
	    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
	      return (Phi->getNumIncomingValues() - 1) *
	             TTI.getCmpSelInstrCost(
	                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
	                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

	    return TTI.getCFInstrCost(Instruction::PHI);
	  }
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::URem:
	  case Instruction::SRem:
	    // If we have a predicated instruction, it may not be executed for each
	    // vector lane. Get the scalarization cost and scale this amount by the
	    // probability of executing the predicated block. If the instruction is not
	    // predicated, we fall through to the next case.
	    if (VF > 1 && Legal->isScalarWithPredication(I)) {
	      unsigned Cost = 0;

	      // These instructions have a non-void type, so account for the phi nodes
	      // that we will create. This cost is likely to be zero. The phi node
	      // cost, if any, should be scaled by the block probability because it
	      // models a copy at the end of each predicated block.
	      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

	      // The cost of the non-predicated instruction.
	      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

	      // The cost of insertelement and extractelement instructions needed for
	      // scalarization.
	      Cost += getScalarizationOverhead(I, VF, TTI);

	      // Scale the cost by the probability of executing the predicated blocks.
	      // This assumes the predicated block for each vector lane is equally
	      // likely.
	      return Cost / getReciprocalPredBlockProb();
	    }
	    LLVM_FALLTHROUGH;
	  case Instruction::Add:
	  case Instruction::FAdd:
	  case Instruction::Sub:
	  case Instruction::FSub:
	  case Instruction::Mul:
	  case Instruction::FMul:
	  case Instruction::FDiv:
	  case Instruction::FRem:
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	  case Instruction::And:
	  case Instruction::Or:
	  case Instruction::Xor: {
	    // Since we will replace the stride by 1 the multiplication should go away.
	    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
	      return 0;
	    // Certain instructions can be cheaper to vectorize if they have a constant
	    // second vector operand. One example of this are shifts on x86.
	    TargetTransformInfo::OperandValueKind Op1VK =
	        TargetTransformInfo::OK_AnyValue;
	    TargetTransformInfo::OperandValueKind Op2VK =
	        TargetTransformInfo::OK_AnyValue;
	    TargetTransformInfo::OperandValueProperties Op1VP =
	        TargetTransformInfo::OP_None;
	    TargetTransformInfo::OperandValueProperties Op2VP =
	        TargetTransformInfo::OP_None;
	    Value *Op2 = I->getOperand(1);

	    // Check for a splat or for a non uniform vector of constants.
	    if (auto *CInt = dyn_cast<ConstantInt>(Op2)) {
	      if (CInt->getValue().isPowerOf2())
	        Op2VP = TargetTransformInfo::OP_PowerOf2;
	      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
	    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
	      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
	      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
	      if (SplatValue) {
	        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
	        if (CInt && CInt->getValue().isPowerOf2())
	          Op2VP = TargetTransformInfo::OP_PowerOf2;
	        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
	      }
	    } else if (Legal->isUniform(Op2)) {
	      Op2VK = TargetTransformInfo::OK_UniformValue;
	    }
	    SmallVector<const Value *, 4> Operands(I->operand_values());
	    // A scalarized instruction is costed as VF copies of the scalar operation
	    // (VectorTy is the scalar type in that case).
	    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
	    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
	                                          Op2VK, Op1VP, Op2VP, Operands);
	  }
	  case Instruction::Select: {
	    SelectInst *SI = cast<SelectInst>(I);
	    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
	    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
	    Type *CondTy = SI->getCondition()->getType();
	    // A loop-invariant condition stays scalar; otherwise it is a vector.
	    if (!ScalarCond)
	      CondTy = VectorType::get(CondTy, VF);

	    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
	  }
	  case Instruction::ICmp:
	  case Instruction::FCmp: {
	    // Compares are costed on the (possibly minimized) type of operand 0.
	    Type *ValTy = I->getOperand(0)->getType();
	    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
	    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
	      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
	    VectorTy = ToVectorTy(ValTy, VF);
	    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
	  }
	  case Instruction::Store:
	  case Instruction::Load: {
	    unsigned Width = VF;
	    if (Width > 1) {
	      InstWidening Decision = getWideningDecision(I, Width);
	      assert(Decision != CM_Unknown &&
	             "CM decision should be taken at this point");
	      if (Decision == CM_Scalarize)
	        Width = 1;
	    }
	    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
	    return getMemoryInstructionCost(I, VF);
	  }
	  case Instruction::ZExt:
	  case Instruction::SExt:
	  case Instruction::FPToUI:
	  case Instruction::FPToSI:
	  case Instruction::FPExt:
	  case Instruction::PtrToInt:
	  case Instruction::IntToPtr:
	  case Instruction::SIToFP:
	  case Instruction::UIToFP:
	  case Instruction::Trunc:
	  case Instruction::FPTrunc:
	  case Instruction::BitCast: {
	    // We optimize the truncation of induction variables having constant
	    // integer steps. The cost of these truncations is the same as the scalar
	    // operation.
	    if (isOptimizableIVTruncate(I, VF)) {
	      auto *Trunc = cast<TruncInst>(I);
	      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
	                                  Trunc->getSrcTy(), Trunc);
	    }

	    Type *SrcScalarTy = I->getOperand(0)->getType();
	    Type *SrcVecTy =
	        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
	    if (canTruncateToMinimalBitwidth(I, VF)) {
	      // This cast is going to be shrunk. This may remove the cast or it might
	      // turn it into slightly different cast. For example, if MinBW == 16,
	      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
	      //
	      // Calculate the modified src and dest types.
	      Type *MinVecTy = VectorTy;
	      if (I->getOpcode() == Instruction::Trunc) {
	        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
	        VectorTy =
	            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
	      } else if (I->getOpcode() == Instruction::ZExt ||
	                 I->getOpcode() == Instruction::SExt) {
	        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
	        VectorTy =
	            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
	      }
	    }

	    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
	    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
	  }
	  case Instruction::Call: {
	    bool NeedToScalarize;
	    CallInst *CI = cast<CallInst>(I);
	    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
	    // When the call maps to a vector intrinsic, take the cheaper of the two.
	    if (getVectorIntrinsicIDForCall(CI, TLI))
	      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
	    return CallCost;
	  }
	  default:
	    // The cost of executing VF copies of the scalar instruction. This opcode
	    // is unknown. Assume that it is the same as 'mul'.
	    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
	           getScalarizationOverhead(I, VF, TTI);
	  } // end of switch.
	}

	// Definition of the static ID member of the LoopVectorize pass class.
	char LoopVectorize::ID = 0;

	// Descriptive pass name handed to the INITIALIZE_PASS_* macros below.
	static const char lv_name[] = "Loop Vectorization";

	// Register the LoopVectorize pass under LV_NAME (defined outside this
	// excerpt) and list the passes it declares as dependencies.
	INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
	INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
	INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
	INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

	namespace llvm {

	/// Factory for the LoopVectorize pass. Both flags are forwarded unchanged to
	/// the LoopVectorize constructor; the caller receives a newly allocated pass.
	Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
	  Pass *Vectorizer = new LoopVectorize(NoUnrolling, AlwaysVectorize);
	  return Vectorizer;
	}

	} // end namespace llvm

	/// Returns true when \p Inst has a pointer operand (as reported by
	/// getPointerOperand) and Legal->isConsecutivePtr yields a non-zero value
	/// for it.
	bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
	  auto *Address = getPointerOperand(Inst);
	  // No pointer operand: nothing to be consecutive.
	  if (!Address)
	    return false;
	  return Legal->isConsecutivePtr(Address) != 0;
	}

	/// Populate the ignore sets used by the cost model: ephemeral values go into
	/// ValuesToIgnore, and the cast instructions recorded by reduction and
	/// induction detection go into VecValuesToIgnore.
	void LoopVectorizationCostModel::collectValuesToIgnore() {
	  // Ephemeral values are ignored.
	  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

	  // Type-promoting casts found while detecting reductions are ignored when
	  // vectorizing.
	  for (auto &RedEntry : *Legal->getReductionVars()) {
	    auto &PromotingCasts = RedEntry.second.getCastInsts();
	    VecValuesToIgnore.insert(PromotingCasts.begin(), PromotingCasts.end());
	  }

	  // Likewise for the type-casting instructions found while detecting
	  // inductions.
	  for (auto &IndEntry : *Legal->getInductionVars()) {
	    const auto &IndCasts = IndEntry.second.getCastInsts();
	    VecValuesToIgnore.insert(IndCasts.begin(), IndCasts.end());
	  }
	}

	/// Plan the vectorization of the loop: compute the maximum VF, build the
	/// VPlans and pick a vectorization factor.
	///
	/// \param OptForSize forwarded to the cost model when computing the maximum VF.
	/// \param UserVF     a user-requested VF; when non-zero it is used as is and
	///                   returned with cost 0.
	/// \returns {1, 0} when vectorization is rejected or the maximum VF is 1,
	///          {UserVF, 0} for a user VF, otherwise the factor chosen by the
	///          cost model.
	LoopVectorizationCostModel::VectorizationFactor
	LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
	  // Width 1 means no vectorize, cost 0 means uncomputed cost.
	  const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
	                                                                           0U};
	  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
	  // No value: the loop was considered too costly to vectorize.
	  if (!MaybeMaxVF.hasValue())
	    return NoVectorization;

	  if (UserVF != 0) {
	    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
	    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
	    // Let the cost model gather the instructions (with their costs) that are
	    // more profitable to scalarize, then build the plan for exactly this VF.
	    CM.selectUserVectorizationFactor(UserVF);
	    buildVPlans(UserVF, UserVF);
	    DEBUG(printPlans(dbgs()));
	    return {UserVF, 0};
	  }

	  unsigned MaxVF = MaybeMaxVF.getValue();
	  assert(MaxVF != 0 && "MaxVF is zero.");

	  // Visit every power-of-two VF up to MaxVF.
	  unsigned VF = 1;
	  while (VF <= MaxVF) {
	    // Uniform and scalar instructions after vectorization with this VF.
	    CM.collectUniformsAndScalars(VF);

	    // Instructions (and their costs) that are more profitable to scalarize;
	    // only meaningful for real vector factors.
	    if (VF > 1)
	      CM.collectInstsToScalarize(VF);

	    VF *= 2;
	  }

	  buildVPlans(1, MaxVF);
	  DEBUG(printPlans(dbgs()));
	  if (MaxVF == 1)
	    return NoVectorization;

	  // Let the cost model choose the best factor.
	  return CM.selectVectorizationFactor(MaxVF);
	}

	void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
	DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
	BestVF = VF;
	BestUF = UF;

	erase_if(VPlans, [VF](const VPlanPtr &Plan) {
	return !Plan->hasVF(VF);
	});
	assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
	}

	/// Execute the single remaining VPlan with \p ILV: build the transform state
	/// from BestVF/BestUF, create the vectorized loop skeleton, run the plan, and
	/// finally let \p ILV fix up the generated loop. \p DT is stored in the
	/// transform state.
	void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
	DominatorTree *DT) {
	// Perform the actual loop transformation.

	// 1. Create a new empty loop. Unlink the old loop and connect the new one.
	VPCallbackILV CallbackILV(ILV);

	VPTransformState State{BestVF, BestUF, LI,
	DT, ILV.Builder, ILV.VectorLoopValueMap,
	&ILV, CallbackILV};
	// The block returned by the skeleton creation becomes the state's PrevBB.
	State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();

	//===------------------------------------------------===//
	//
	// Notice: any optimization or new instruction that go
	// into the code below should also be implemented in
	// the cost-model.
	//
	//===------------------------------------------------===//

	// 2. Copy and widen instructions from the old loop into the new loop.
	assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
	VPlans.front()->execute(&State);

	// 3. Fix the vectorized code: take care of header phi's, live-outs,
	// predication, updating analyses.
	ILV.fixVectorizedLoop();
	}

	/// Collect into \p DeadInstructions the instructions of the original loop that
	/// need no counterpart in the vectorized loop: the latch condition when only
	/// the branch uses it, induction updates whose other users are all dead, and
	/// the cast instructions recorded for each induction.
	void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
	    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
	  BasicBlock *Latch = OrigLoop->getLoopLatch();

	  // We create new control-flow for the vectorized loop, so the original
	  // condition will be dead after vectorization if it's only used by the
	  // branch.
	  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
	  if (Cmp && Cmp->hasOneUse())
	    DeadInstructions.insert(Cmp);

	  // We create new "steps" for induction variable updates to which the original
	  // induction variables map. An original update instruction will be dead if
	  // all its users except the induction variable are dead.
	  for (auto &Induction : *Legal->getInductionVars()) {
	    PHINode *Ind = Induction.first;
	    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
	    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
	          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
	        }))
	      DeadInstructions.insert(IndUpdate);

	    // We record as "Dead" also the type-casting instructions we had identified
	    // during induction analysis. We don't need any handling for them in the
	    // vectorized loop because we have proven that, under a proper runtime
	    // test guarding the vectorized loop, the value of the phi, and the casted
	    // value of the phi, are the same. The last instruction in this casting chain
	    // will get its scalar/vector/widened def from the scalar/vector/widened def
	    // of the respective phi node. Any other casts in the induction def-use chain
	    // have no other uses outside the phi update chain, and will be ignored.
	    InductionDescriptor &IndDes = Induction.second;
	    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
	    DeadInstructions.insert(Casts.begin(), Casts.end());
	  }
	}

	/// Identity for the unroller: \p Vec is returned unchanged.
	Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

	/// Identity for the unroller: \p V is returned unchanged, no broadcast is
	/// emitted.
	Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

	/// Compute the scalar value of \p Val advanced by \p StartIdx steps of
	/// \p Step.
	///
	/// For floating-point \p Val the result is `Val <BinOp> (StartIdx * Step)`
	/// with fast-math flags applied to both operations; for any other type it is
	/// `Val + StartIdx * Step` and \p BinOp is not consulted. \p Val must be a
	/// scalar.
	Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
	                                        Instruction::BinaryOps BinOp) {
	  // When unrolling and the VF is 1, we only need to add a simple scalar.
	  Type *Ty = Val->getType();
	  assert(!Ty->isVectorTy() && "Val must be a scalar");

	  if (Ty->isFloatingPointTy()) {
	    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

	    // Floating point operations had to be 'fast' to enable the unrolling.
	    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
	    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
	  }
	  Constant *C = ConstantInt::get(Ty, StartIdx);
	  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
	}

	/// Append an "llvm.loop.unroll.runtime.disable" node to the loop ID of \p L,
	/// preserving all operands of the existing loop ID. Nothing is changed when
	/// any existing operand is a metadata node whose first operand is a string
	/// starting with "llvm.loop.unroll.disable".
	static void AddRuntimeUnrollDisableMetaData(Loop *L) {
	  SmallVector<Metadata *, 4> MDs;
	  // Reserve first location for self reference to the LoopID metadata node.
	  MDs.push_back(nullptr);
	  bool IsUnrollMetadata = false;
	  if (MDNode *LoopID = L->getLoopID()) {
	    // First find existing loop unrolling disable metadata.
	    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
	      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
	      // Skip nodes without operands: there is no name string to inspect.
	      if (MD && MD->getNumOperands() > 0) {
	        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
	        // Latch the flag once a match is seen. Plain assignment here would
	        // let a later, unrelated operand reset it, so only the last node
	        // would decide the outcome.
	        if (S && S->getString().startswith("llvm.loop.unroll.disable"))
	          IsUnrollMetadata = true;
	      }
	      MDs.push_back(LoopID->getOperand(i));
	    }
	  }

	  if (!IsUnrollMetadata) {
	    // Add runtime unroll disable metadata.
	    LLVMContext &Context = L->getHeader()->getContext();
	    SmallVector<Metadata *, 1> DisableOperands;
	    DisableOperands.push_back(
	        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
	    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
	    MDs.push_back(DisableNode);
	    MDNode *NewLoopID = MDNode::get(Context, MDs);
	    // Set operand 0 to refer to the loop id itself.
	    NewLoopID->replaceOperandWith(0, NewLoopID);
	    L->setLoopID(NewLoopID);
	  }
	}

	/// Evaluate \p Predicate at Range.Start and return that decision. Range.End
	/// is lowered to the first VF (doubling from 2 * Range.Start, below the
	/// current Range.End) at which \p Predicate disagrees with it, so that the
	/// decision holds for every VF left in \p Range.
	bool LoopVectorizationPlanner::getDecisionAndClampRange(
	    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
	  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
	  const bool DecisionAtStart = Predicate(Range.Start);

	  unsigned CandidateVF = Range.Start * 2;
	  while (CandidateVF < Range.End) {
	    if (Predicate(CandidateVF) != DecisionAtStart) {
	      // First disagreement: the range ends just before this VF.
	      Range.End = CandidateVF;
	      break;
	    }
	    CandidateVF *= 2;
	  }

	  return DecisionAtStart;
	}

	/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
	/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
	/// of VF's starting at a given VF and extending it as much as possible. Each
	/// vectorization decision can potentially shorten this sub-range during
	/// buildVPlan().
	void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {

	// Collect conditions feeding internal conditional branches; they need to be
	// represented in VPlan for it to model masking.
	SmallPtrSet<Value *, 1> NeedDef;

	auto *Latch = OrigLoop->getLoopLatch();
	for (BasicBlock *BB : OrigLoop->blocks()) {
	if (BB == Latch)
	continue;
	BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
	if (Branch && Branch->isConditional())
	NeedDef.insert(Branch->getCondition());
	}

	for (unsigned VF = MinVF; VF < MaxVF + 1;) {
	VFRange SubRange = {VF, MaxVF + 1};
	VPlans.push_back(buildVPlan(SubRange, NeedDef));
	VF = SubRange.End;
	}
	}

	/// Return the mask under which the edge \p Src -> \p Dst is taken, creating
	/// and caching it in EdgeMaskCache on first request. A null result stands
	/// for an all-one mask. \p Src must be a predecessor of \p Dst and must end
	/// in a branch instruction.
	VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
	                                                  BasicBlock *Dst,
	                                                  VPlanPtr &Plan) {
	  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

	  // Look for cached value.
	  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
	  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
	  if (ECEntryIt != EdgeMaskCache.end())
	    return ECEntryIt->second;

	  VPValue *SrcMask = createBlockInMask(Src, Plan);

	  // The terminator has to be a branch inst!
	  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
	  assert(BI && "Unexpected terminator found");

	  // An unconditional branch passes the block's own mask straight through.
	  if (!BI->isConditional())
	    return EdgeMaskCache[Edge] = SrcMask;

	  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
	  assert(EdgeMask && "No Edge Mask found for condition");

	  // Dst is the false successor: the edge is taken when the condition fails.
	  if (BI->getSuccessor(0) != Dst)
	    EdgeMask = Builder.createNot(EdgeMask);

	  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
	    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

	  return EdgeMaskCache[Edge] = EdgeMask;
	}

	/// Return the mask under which block \p BB executes, creating and caching it
	/// in BlockMaskCache on first request. The mask is the OR of the masks of
	/// all incoming edges; a null result stands for an all-one mask. \p BB must
	/// belong to the original loop.
	VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
	                                                     VPlanPtr &Plan) {
	  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

	  // Look for cached value.
	  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
	  if (BCEntryIt != BlockMaskCache.end())
	    return BCEntryIt->second;

	  // All-one mask is modelled as no-mask following the convention for masked
	  // load/store/gather/scatter. Initialize BlockMask to no-mask.
	  VPValue *BlockMask = nullptr;

	  // Loop incoming mask is all-one.
	  if (OrigLoop->getHeader() == BB)
	    return BlockMaskCache[BB] = BlockMask;

	  // This is the block mask. We OR all incoming edges.
	  for (auto *Predecessor : predecessors(BB)) {
	    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
	    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
	      return BlockMaskCache[BB] = EdgeMask;

	    if (!BlockMask) { // BlockMask has its initialized nullptr value.
	      BlockMask = EdgeMask;
	      continue;
	    }

	    BlockMask = Builder.createOr(BlockMask, EdgeMask);
	  }

	  return BlockMaskCache[BB] = BlockMask;
	}

	/// Build a VPInterleaveRecipe for \p I when it belongs to an interleave
	/// group and the cost model chose CM_Interleave for it at Range.Start.
	/// \p Range is clamped to the VFs sharing that decision. Returns nullptr
	/// when \p I is not in a group or is not interleaved at Range.Start.
	VPInterleaveRecipe *
	LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
	                                                VFRange &Range) {
	  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(I);
	  if (!Group)
	    return nullptr;

	  // Decide whether the group is relevant for the VFs in the given range.
	  auto IsInterleavedAt = [=](unsigned VF) -> bool {
	    if (VF < 2) // Query is illegal for VF == 1
	      return false;
	    return CM.getWideningDecision(I, VF) ==
	           LoopVectorizationCostModel::CM_Interleave;
	  };
	  if (!getDecisionAndClampRange(IsInterleavedAt, Range))
	    return nullptr;

	  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
	  // range. Only the primary member (the insert position) is expected here;
	  // adjunct members must not get a recipe of their own.
	  assert(I == Group->getInsertPos() &&
	         "Generating a recipe for an adjunct member of an interleave group");

	  return new VPInterleaveRecipe(Group);
	}

	/// Build a VPWidenMemoryInstructionRecipe for the load or store \p I when it
	/// is to be widened at Range.Start, clamping \p Range to the VFs that share
	/// this decision. The recipe carries the block-in mask of \p I's parent when
	/// legality requires a mask. Returns nullptr for non-memory instructions and
	/// for memory instructions that will not be widened.
	VPWidenMemoryInstructionRecipe *
	LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
	                                           VPlanPtr &Plan) {
	  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
	    return nullptr;

	  auto willWiden = [&](unsigned VF) -> bool {
	    if (VF == 1)
	      return false;
	    if (CM.isScalarAfterVectorization(I, VF) ||
	        CM.isProfitableToScalarize(I, VF))
	      return false;
	    LoopVectorizationCostModel::InstWidening Decision =
	        CM.getWideningDecision(I, VF);
	    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
	           "CM decision should be taken at this point.");
	    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
	           "Interleave memory opportunity should be caught earlier.");
	    return Decision != LoopVectorizationCostModel::CM_Scalarize;
	  };

	  if (!getDecisionAndClampRange(willWiden, Range))
	    return nullptr;

	  // A null mask means the access is unmasked.
	  VPValue *Mask = nullptr;
	  if (Legal->isMaskRequired(I))
	    Mask = createBlockInMask(I->getParent(), Plan);

	  return new VPWidenMemoryInstructionRecipe(*I, Mask);
	}

	/// Build a VPWidenIntOrFpInductionRecipe for \p I when it is either a phi
	/// recorded as an integer or floating-point induction, or a truncate that
	/// the cost model reports as an optimizable induction truncate at
	/// Range.Start (in which case \p Range is clamped accordingly). Returns
	/// nullptr otherwise.
	VPWidenIntOrFpInductionRecipe *
	LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
	                                                 VFRange &Range) {
	  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
	    // Check if this is an integer or fp induction. If so, build the recipe
	    // that produces its scalar and vector values.
	    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
	    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
	        II.getKind() == InductionDescriptor::IK_FpInduction)
	      return new VPWidenIntOrFpInductionRecipe(Phi);

	    return nullptr;
	  }

	  // Optimize the special case where the source is a constant integer
	  // induction variable. Notice that we can only optimize the 'trunc' case
	  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
	  // (c) other casts depend on pointer size.

	  // Determine whether \p K is a truncation based on an induction variable
	  // that can be optimized.
	  auto isOptimizableIVTruncate =
	      [&](Instruction *K) -> std::function<bool(unsigned)> {
	    return
	        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
	  };

	  if (isa<TruncInst>(I) &&
	      getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
	    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
	                                             cast<TruncInst>(I));
	  return nullptr;
	}

	/// Build a VPBlendRecipe for \p I when it is a phi outside the loop header,
	/// collecting the non-null masks of its incoming edges as the recipe's
	/// masks. Returns nullptr when \p I is not a phi or is a header phi.
	VPBlendRecipe *
	LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
	  PHINode *Phi = dyn_cast<PHINode>(I);
	  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
	    return nullptr;

	  // We know that all PHIs in non-header blocks are converted into selects, so
	  // we don't have to worry about the insertion order and we can just use the
	  // builder. At this point we generate the predication tree. There may be
	  // duplications since this is a simple recursive scan, but future
	  // optimizations will clean it up.

	  SmallVector<VPValue *, 2> Masks;
	  unsigned NumIncoming = Phi->getNumIncomingValues();
	  for (unsigned In = 0; In < NumIncoming; In++) {
	    VPValue *EdgeMask =
	        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
	    // A null (all-one) edge mask is only acceptable for a single-edge phi.
	    assert((EdgeMask || NumIncoming == 1) &&
	           "Multiple predecessors with one having a full mask");
	    if (EdgeMask)
	      Masks.push_back(EdgeMask);
	  }
	  return new VPBlendRecipe(Phi, Masks);
	}

	bool LoopVectorizationPlanner::tryToWiden(Instruction I, VPBasicBlock VPBB,
	VFRange &Range) {
	if (Legal->isScalarWithPredication(I))
	return false;

	auto IsVectorizableOpcode = [](unsigned Opcode) {
	switch (Opcode) {
	case Instruction::Add:
	case Instruction::And:
	case Instruction::AShr:
	case Instruction::BitCast:
	case Instruction::Br:
	case Instruction::Call:
	case Instruction::FAdd:
	case Instruction::FCmp:
	case Instruction::FDiv:
	case Instruction::FMul:
	case Instruction::FPExt:
	case Instruction::FPToSI:
	case Instruction::FPToUI:
	case Instruction::FPTrunc:
	case Instruction::FRem:
	case Instruction::FSub:
	case Instruction::GetElementPtr:
	case Instruction::ICmp:
	case Instruction::IntToPtr:
	case Instruction::Load:
	case Instruction::LShr:
	case Instruction::Mul:
	case Instruction::Or:
	case Instruction::PHI:
	case Instruction::PtrToInt:
	case Instruction::SDiv:
	case Instruction::Select:
	case Instruction::SExt:
	case Instruction::Shl:
	case Instruction::SIToFP:
	case Instruction::SRem:
	case Instruction::Store:
	case Instruction::Sub:
	case Instruction::Trunc:
	case Instruction::UDiv:
	case Instruction::UIToFP:
	case Instruction::URem:
	case Instruction::Xor:
	case Instruction::ZExt:
	return true;
	}
	return false;
	};

	if (!IsVectorizableOpcode(I->getOpcode()))
	return false;

	if (CallInst *CI = dyn_cast<CallInst>(I)) {
	Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	if (ID && (ID == Intrinsic::assume \|\| ID == Intrinsic::lifetime_end \|\|
	ID == Intrinsic::lifetime_start \|\| ID == Intrinsic::sideeffect))
	return false;
	}

	auto willWiden = [&](unsigned VF) -> bool {
	if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) \|\|
	CM.isProfitableToScalarize(I, VF)))
	return false;
	if (CallInst *CI = dyn_cast<CallInst>(I)) {
	Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	// The following case may be scalarized depending on the VF.
	// The flag shows whether we use Intrinsic or a usual Call for vectorized
	// version of the instruction.
	// Is it beneficial to perform intrinsic call compared to lib call?
	bool NeedToScalarize;
	unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
	bool UseVectorIntrinsic =
	ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
	return UseVectorIntrinsic \|\| !NeedToScalarize;
	}
	if (isa<LoadInst>(I) \|\| isa<StoreInst>(I)) {
	assert(CM.getWideningDecision(I, VF) ==
	LoopVectorizationCostModel::CM_Scalarize &&
	"Memory widening decisions should have been taken care by now");
	return false;
	}
	return true;
	};

	if (!getDecisionAndClampRange(willWiden, Range))
	return false;

	// Success: widen this instruction. We optimize the common case where
	// consecutive instructions can be represented by a single recipe.
	if (!VPBB->empty()) {
	VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
	if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
	return true;
	}

	VPBB->appendRecipe(new VPWidenRecipe(I));
	return true;
	}

	/// Create a VPReplicateRecipe for \p I and place it in the plan.
	///
	/// \p Range is clamped to the VFs that agree with Range.Start on whether
	/// \p I is uniform after vectorization. A non-predicated recipe is appended
	/// to \p VPBB, which is returned. A predicated recipe is recorded in
	/// \p PredInst2Recipe and wrapped in a replicate region that becomes the
	/// successor of \p VPBB; the new, empty VPBasicBlock following that region
	/// is returned. Any operand of \p I that has an entry in \p PredInst2Recipe
	/// has its recipe's AlsoPack flag cleared.
	VPBasicBlock *LoopVectorizationPlanner::handleReplication(
	    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
	    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
	    VPlanPtr &Plan) {
	  bool IsUniform = getDecisionAndClampRange(
	      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
	      Range);

	  bool IsPredicated = Legal->isScalarWithPredication(I);
	  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

	  // Find if I uses a predicated instruction. If so, it will use its scalar
	  // value. Avoid hoisting the insert-element which packs the scalar value into
	  // a vector value, as that happens iff all users use the vector value.
	  for (auto &Op : I->operands())
	    if (auto *PredInst = dyn_cast<Instruction>(Op)) {
	      // Single lookup instead of find() followed by operator[].
	      auto PredIt = PredInst2Recipe.find(PredInst);
	      if (PredIt != PredInst2Recipe.end())
	        PredIt->second->setAlsoPack(false);
	    }

	  // Finalize the recipe for Instr, first if it is not predicated.
	  if (!IsPredicated) {
	    DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
	    VPBB->appendRecipe(Recipe);
	    return VPBB;
	  }
	  DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
	  assert(VPBB->getSuccessors().empty() &&
	         "VPBB has successors when handling predicated replication.");
	  // Record predicated instructions for above packing optimizations.
	  PredInst2Recipe[I] = Recipe;
	  VPBlockBase *Region =
	      VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
	  return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
	}

	/// Build the triangular if-then region that guards the predicated
	/// replication of \p Instr.
	///
	/// The region has three blocks named after the opcode of \p Instr:
	/// "pred.<op>.entry" holding a branch-on-mask recipe for the block-in mask
	/// of \p Instr's parent, "pred.<op>.if" holding \p PredRecipe, and
	/// "pred.<op>.continue" holding a VPPredInstPHIRecipe unless \p Instr has
	/// void type. \p Instr must be inside a basic block.
	VPRegionBlock *
	LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
	                                                VPRecipeBase *PredRecipe,
	                                                VPlanPtr &Plan) {
	  // Instructions marked for predication are replicated and placed under an
	  // if-then construct to prevent side-effects.

	  // Check the parent before it is first used below.
	  assert(Instr->getParent() && "Predicated instruction not in any basic block");

	  // Generate recipes to compute the block mask for this region.
	  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

	  // Build the triangular if-then region.
	  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
	  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
	  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
	  auto *PHIRecipe =
	      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
	  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
	  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
	  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

	  // Note: first set Entry as region entry and then connect successors starting
	  // from it in order, to propagate the "parent" of each VPBasicBlock.
	  Entry->setTwoSuccessors(Pred, Exit);
	  Pred->setOneSuccessor(Exit);

	  return Region;
	}

	/// Build one VPlan that is valid for the VFs in \p Range.
	///
	/// The loop body is visited in reverse post-order. For every relevant
	/// instruction a recipe is chosen by trying, in order: interleaved memory,
	/// widened memory, induction, blend, widened phi, general widening and
	/// finally replication. The helpers may lower Range.End so that all
	/// decisions hold for every VF left in the range. On return the plan is
	/// tagged with the VFs Range.Start, 2 * Range.Start, ... below Range.End and
	/// named accordingly. \p NeedDef lists the values that get a VPValue
	/// definition inside the plan.
	LoopVectorizationPlanner::VPlanPtr
	LoopVectorizationPlanner::buildVPlan(VFRange &Range,
	                                     const SmallPtrSetImpl<Value *> &NeedDef) {
	  // Masks are cached per plan; start from a clean slate.
	  EdgeMaskCache.clear();
	  BlockMaskCache.clear();
	  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
	  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

	  // Collect instructions from the original loop that will become trivially dead
	  // in the vectorized loop. We don't need to vectorize these instructions. For
	  // example, original induction update instructions can become dead because we
	  // separately emit induction "steps" when generating code for the new loop.
	  // Similarly, we create a new latch condition when setting up the structure
	  // of the new loop, so the old one can become dead.
	  SmallPtrSet<Instruction *, 4> DeadInstructions;
	  collectTriviallyDeadInstructions(DeadInstructions);

	  // Hold a mapping from predicated instructions to their recipes, in order to
	  // fix their AlsoPack behavior if a user is determined to replicate and use a
	  // scalar instead of vector value.
	  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

	  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
	  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
	  auto Plan = llvm::make_unique<VPlan>(VPBB);

	  // Represent values that will have defs inside VPlan.
	  for (Value *V : NeedDef)
	    Plan->addVPValue(V);

	  // Scan the body of the loop in a topological order to visit each basic block
	  // after having visited its predecessor basic blocks.
	  LoopBlocksDFS DFS(OrigLoop);
	  DFS.perform(LI);

	  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
	    // Relevant instructions from basic block BB will be grouped into VPRecipe
	    // ingredients and fill a new VPBasicBlock.
	    unsigned VPBBsForBB = 0;
	    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
	    VPBB->setOneSuccessor(FirstVPBBForBB);
	    VPBB = FirstVPBBForBB;
	    Builder.setInsertPoint(VPBB);

	    std::vector<Instruction *> Ingredients;

	    // Organize the ingredients to vectorize from current basic block in the
	    // right order.
	    for (Instruction &I : *BB) {
	      Instruction *Instr = &I;

	      // First filter out irrelevant instructions, to ensure no recipes are
	      // built for them.
	      if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
	          DeadInstructions.count(Instr))
	        continue;

	      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
	      // member of the IG, do not construct any Recipe for it.
	      const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
	      if (IG && Instr != IG->getInsertPos() &&
	          Range.Start >= 2 && // Query is illegal for VF == 1
	          CM.getWideningDecision(Instr, Range.Start) ==
	              LoopVectorizationCostModel::CM_Interleave) {
	        // An instruction waiting to be sunk after this one is still emitted.
	        if (SinkAfterInverse.count(Instr))
	          Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
	        continue;
	      }

	      // Move instructions to handle first-order recurrences, step 1: avoid
	      // handling this instruction until after we've handled the instruction it
	      // should follow.
	      auto SAIt = SinkAfter.find(Instr);
	      if (SAIt != SinkAfter.end()) {
	        DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
	                     << " to vectorize a 1st order recurrence.\n");
	        SinkAfterInverse[SAIt->second] = Instr;
	        continue;
	      }

	      Ingredients.push_back(Instr);

	      // Move instructions to handle first-order recurrences, step 2: push the
	      // instruction to be sunk at its insertion point.
	      auto SAInvIt = SinkAfterInverse.find(Instr);
	      if (SAInvIt != SinkAfterInverse.end())
	        Ingredients.push_back(SAInvIt->second);
	    }

	    // Introduce each ingredient into VPlan.
	    for (Instruction *Instr : Ingredients) {
	      VPRecipeBase *Recipe = nullptr;

	      // Check if Instr should belong to an interleave memory recipe, or already
	      // does. In the latter case Instr is irrelevant.
	      if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
	        VPBB->appendRecipe(Recipe);
	        continue;
	      }

	      // Check if Instr is a memory operation that should be widened.
	      if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
	        VPBB->appendRecipe(Recipe);
	        continue;
	      }

	      // Check if Instr should form some PHI recipe.
	      if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
	        VPBB->appendRecipe(Recipe);
	        continue;
	      }
	      if ((Recipe = tryToBlend(Instr, Plan))) {
	        VPBB->appendRecipe(Recipe);
	        continue;
	      }
	      if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
	        VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
	        continue;
	      }

	      // Check if Instr is to be widened by a general VPWidenRecipe, after
	      // having first checked for specific widening recipes that deal with
	      // Interleave Groups, Inductions and Phi nodes.
	      if (tryToWiden(Instr, VPBB, Range))
	        continue;

	      // Otherwise, if all widening options failed, Instruction is to be
	      // replicated. This may create a successor for VPBB.
	      VPBasicBlock *NextVPBB =
	          handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
	      if (NextVPBB != VPBB) {
	        VPBB = NextVPBB;
	        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
	                                    : "");
	      }
	    }
	  }

	  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
	  // may also be empty, such as the last one VPBB, reflecting original
	  // basic-blocks with no recipes.
	  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
	  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
	  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
	  PreEntry->disconnectSuccessor(Entry);
	  delete PreEntry;

	  // Tag the plan with every power-of-two multiple of Range.Start that is
	  // still inside the (possibly clamped) range, and name it after them.
	  std::string PlanName;
	  raw_string_ostream RSO(PlanName);
	  unsigned VF = Range.Start;
	  Plan->addVF(VF);
	  RSO << "Initial VPlan for VF={" << VF;
	  // The step must double VF; a constant assignment would never terminate
	  // once Range.End exceeds it.
	  for (VF *= 2; VF < Range.End; VF *= 2) {
	    Plan->addVF(VF);
	    RSO << "," << VF;
	  }
	  RSO << "},UF>=1";
	  RSO.flush();
	  Plan->setName(PlanName);

	  return Plan;
	}

	/// Write a textual description of the interleave group to \p O: one header
	/// entry with the group's factor and insert position, followed by one entry
	/// per present member together with its index. Every entry is prefixed by
	/// \p Indent.
	void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
	  // Header entry.
	  O << " +\n";
	  O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor();
	  O << " at ";
	  IG->getInsertPos()->printAsOperand(O, false);
	  O << "\\l\"";

	  // One entry per member; index slots without a member are skipped.
	  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx) {
	    Instruction *Member = IG->getMember(Idx);
	    if (!Member)
	      continue;
	    O << " +\n";
	    O << Indent << "\" " << VPlanIngredient(Member) << " " << Idx << "\\l\"";
	  }
	}

	/// Widen every instruction in this recipe's [Begin, End) range, in order,
	/// through the vectorizer held by \p State.
	void VPWidenRecipe::execute(VPTransformState &State) {
	  auto Ingredients = make_range(Begin, End);
	  for (auto &Ingredient : Ingredients) {
	    State.ILV->widenInstruction(Ingredient);
	  }
	}

	/// Widen the induction IV (with its optional truncate Trunc) through the
	/// vectorizer held by \p State. Must not run for a single replicated
	/// instance.
	void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
	  assert(!State.Instance && "Int or FP induction being replicated.");
	  auto *Vectorizer = State.ILV;
	  Vectorizer->widenIntOrFpInduction(IV, Trunc);
	}

	/// Widen the phi held by this recipe through the vectorizer in \p State,
	/// using the state's unroll and vectorization factors.
	void VPWidenPHIRecipe::execute(VPTransformState &State) {
	  auto *Vectorizer = State.ILV;
	  Vectorizer->widenPHIInstruction(Phi, State.UF, State.VF);
	}

	/// Lower the phi held by this recipe to a chain of selects, one chain per
	/// unroll part, and record the results as the vector values of the phi.
	///
	/// The first incoming value seeds the chain; each later incoming value is
	/// selected over the running result under the mask held in the matching
	/// operand of User. Without a User the phi must have one incoming value.
	void VPBlendRecipe::execute(VPTransformState &State) {
	  State.ILV->setDebugLocFromInst(State.Builder, Phi);
	  // We know that all PHIs in non-header blocks are converted into
	  // selects, so we don't have to worry about the insertion order and we
	  // can just use the builder.
	  // At this point we generate the predication tree. There may be
	  // duplications since this is a simple recursive scan, but future
	  // optimizations will clean it up.

	  unsigned NumIncoming = Phi->getNumIncomingValues();

	  assert((User || NumIncoming == 1) &&
	         "Multiple predecessors with predecessors having a full mask");
	  // Generate a sequence of selects of the form:
	  // SELECT(Mask3, In3,
	  //        SELECT(Mask2, In2,
	  //               ( ...)))
	  InnerLoopVectorizer::VectorParts Entry(State.UF);
	  for (unsigned In = 0; In < NumIncoming; ++In) {
	    for (unsigned Part = 0; Part < State.UF; ++Part) {
	      // We might have single edge PHIs (blocks) - use an identity
	      // 'select' for the first PHI operand.
	      Value *In0 =
	          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
	      if (In == 0)
	        Entry[Part] = In0; // Initialize with the first incoming value.
	      else {
	        // Select between the current value and the previous incoming edge
	        // based on the incoming mask.
	        Value *Cond = State.get(User->getOperand(In), Part);
	        Entry[Part] =
	            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
	      }
	    }
	  }
	  for (unsigned Part = 0; Part < State.UF; ++Part)
	    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
	}

	/// Vectorize the interleave group, identified by its insert position,
	/// through the vectorizer held by \p State. Must not run for a single
	/// replicated instance.
	void VPInterleaveRecipe::execute(VPTransformState &State) {
	  assert(!State.Instance && "Interleave group being replicated.");
	  auto *InsertPos = IG->getInsertPos();
	  State.ILV->vectorizeInterleaveGroup(InsertPos);
	}

	/// Emit scalar copies of Ingredient.
	///
	/// Without a specific instance in \p State, a copy is generated for every
	/// lane of every unroll part, or only for lane 0 of each part when the
	/// instruction is uniform. With a specific instance, only that copy is
	/// generated and, when AlsoPack is set and VF > 1, it is also packed into
	/// the vector value of its part.
	void VPReplicateRecipe::execute(VPTransformState &State) {
	  if (!State.Instance) {
	    // Generate scalar instances for all VF lanes of all UF parts, unless the
	    // instruction is uniform, in which case generate only the first lane for
	    // each of the UF parts.
	    unsigned LaneCount = State.VF;
	    if (IsUniform)
	      LaneCount = 1;
	    for (unsigned Part = 0; Part < State.UF; ++Part) {
	      for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
	        State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
	      }
	    }
	    return;
	  }

	  // Generate a single instance.
	  State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);

	  // Nothing more to do unless the scalar must also be packed into a vector.
	  if (!AlsoPack || State.VF <= 1)
	    return;

	  // If we're constructing lane 0, initialize to start from undef.
	  if (State.Instance->Lane == 0) {
	    auto *PackedTy = VectorType::get(Ingredient->getType(), State.VF);
	    Value *Start = UndefValue::get(PackedTy);
	    State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Start);
	  }
	  State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
	}

	/// Replace the unreachable terminator of State.CFG.PrevBB with a conditional
	/// branch on the current instance's bit of the block-in mask. Without a User
	/// the condition is the constant true. Both branch destinations are left
	/// unset here.
	void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
	  assert(State.Instance && "Branch on Mask works only on single instance.");

	  unsigned Part = State.Instance->Part;
	  unsigned Lane = State.Instance->Lane;

	  Value *Condition = nullptr;
	  if (User) {
	    VPValue *MaskOperand = User->getOperand(0);
	    Condition = State.get(MaskOperand, Part);
	    // A vector mask holds one bit per lane; pick the bit of this lane.
	    if (Condition->getType()->isVectorTy()) {
	      Value *LaneIdx = State.Builder.getInt32(Lane);
	      Condition = State.Builder.CreateExtractElement(Condition, LaneIdx);
	    }
	  } else {
	    // Block in mask is all-one.
	    Condition = State.Builder.getTrue();
	  }

	  // Replace the temporary unreachable terminator with a new conditional branch,
	  // whose two destinations will be set later when they are created.
	  auto *OldTerminator = State.CFG.PrevBB->getTerminator();
	  assert(isa<UnreachableInst>(OldTerminator) &&
	         "Expected to replace unreachable terminator with conditional branch.");
	  auto *NewBranch = BranchInst::Create(State.CFG.PrevBB, nullptr, Condition);
	  NewBranch->setSuccessor(0, nullptr);
	  ReplaceInstWithInst(OldTerminator, NewBranch);
	}

	/// Create the phi that merges the result of the predicated instruction
	/// PredInst, for the current instance, with the value that flows around the
	/// predicated block.
	///
	/// When a vector value exists for the instance's part, the phi merges the
	/// insert-element result with the vector it inserted into, and replaces the
	/// recorded vector value. Otherwise the phi merges the scalar result with
	/// undef and replaces the recorded scalar value.
	void VPPredInstPHIRecipe::execute(VPTransformState &State) {
	  assert(State.Instance && "Predicated instruction PHI works per instance.");
	  Instruction *ScalarInst = cast<Instruction>(
	      State.ValueMap.getScalarValue(PredInst, *State.Instance));
	  BasicBlock *GuardedBB = ScalarInst->getParent();
	  BasicBlock *GuardingBB = GuardedBB->getSinglePredecessor();
	  assert(GuardingBB && "Predicated block has no single predecessor.");

	  // By current pack/unpack logic we need to generate only a single phi node: if
	  // a vector value for the predicated instruction exists at this point it means
	  // the instruction has vector users only, and a phi for the vector value is
	  // needed. In this case the recipe of the predicated instruction is marked to
	  // also do that packing, thereby "hoisting" the insert-element sequence.
	  // Otherwise, a phi node for the scalar value is needed.
	  unsigned Part = State.Instance->Part;
	  if (!State.ValueMap.hasVectorValue(PredInst, Part)) {
	    // Scalar case: undef arrives when the predicated block was skipped.
	    Type *ResultTy = PredInst->getType();
	    PHINode *ScalarPhi = State.Builder.CreatePHI(ResultTy, 2);
	    ScalarPhi->addIncoming(UndefValue::get(ScalarInst->getType()), GuardingBB);
	    ScalarPhi->addIncoming(ScalarInst, GuardedBB);
	    State.ValueMap.resetScalarValue(PredInst, *State.Instance, ScalarPhi);
	    return;
	  }

	  // Vector case.
	  Value *Packed = State.ValueMap.getVectorValue(PredInst, Part);
	  InsertElementInst *Insert = cast<InsertElementInst>(Packed);
	  PHINode *VectorPhi = State.Builder.CreatePHI(Insert->getType(), 2);
	  VectorPhi->addIncoming(Insert->getOperand(0), GuardingBB); // Unmodified vector.
	  VectorPhi->addIncoming(Insert, GuardedBB); // New vector with inserted element.
	  State.ValueMap.resetVectorValue(PredInst, Part, VectorPhi); // Update cache.
	}

	/// Vectorize the memory instruction held by this recipe. Without a User the
	/// access is emitted unmasked; otherwise the mask operand is materialized
	/// for every unroll part and handed to the vectorizer.
	void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
	  if (!User) {
	    State.ILV->vectorizeMemoryInstruction(&Instr);
	    return;
	  }

	  // Last (and currently only) operand is a mask.
	  InnerLoopVectorizer::VectorParts PerPartMasks(State.UF);
	  const unsigned MaskIdx = User->getNumOperands() - 1;
	  VPValue *MaskOperand = User->getOperand(MaskIdx);
	  for (unsigned Part = 0; Part != State.UF; ++Part) {
	    PerPartMasks[Part] = State.get(MaskOperand, Part);
	  }
	  State.ILV->vectorizeMemoryInstruction(&Instr, &PerPartMasks);
	}

	bool LoopVectorizePass::processLoop(Loop *L) {
	assert(L->empty() && "Only process inner loops.");

	#ifndef NDEBUG
	const std::string DebugLocStr = getDebugLocString(L);
	#endif /* NDEBUG */

	DEBUG(dbgs() << "\nLV: Checking a loop in \""
	<< L->getHeader()->getParent()->getName() << "\" from "
	<< DebugLocStr << "\n");

	LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);

	DEBUG(dbgs() << "LV: Loop hints:"
	<< " force="
	<< (Hints.getForce() == LoopVectorizeHints::FK_Disabled
	? "disabled"
	: (Hints.getForce() == LoopVectorizeHints::FK_Enabled
	? "enabled"
	: "?"))
	<< " width=" << Hints.getWidth()
	<< " unroll=" << Hints.getInterleave() << "\n");

	// Function containing loop
	Function *F = L->getHeader()->getParent();

	// Looking at the diagnostic output is the only way to determine if a loop
	// was vectorized (other than looking at the IR or machine code), so it
	// is important to generate an optimization remark for each loop. Most of
	// these messages are generated as OptimizationRemarkAnalysis. Remarks
	// generated as OptimizationRemark and OptimizationRemarkMissed are
	// less verbose reporting vectorized loops and unvectorized loops that may
	// benefit from vectorization, respectively.

	if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
	DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
	return false;
	}

	PredicatedScalarEvolution PSE(SE, L);

	// Check if it is legal to vectorize the loop.
	LoopVectorizationRequirements Requirements(*ORE);
	LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
	&Requirements, &Hints);
	if (!LVL.canVectorize()) {
	DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
	emitMissedWarning(F, L, Hints, ORE);
	return false;
	}

	// Check the function attributes to find out if this function should be
	// optimized for size.
	bool OptForSize =
	Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();

	// Check the loop for a trip count threshold: vectorize loops with a tiny trip
	// count by optimizing for size, to minimize overheads.
	unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
	bool HasExpectedTC = (ExpectedTC > 0);

	if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
	auto EstimatedTC = getLoopEstimatedTripCount(L);
	if (EstimatedTC) {
	ExpectedTC = *EstimatedTC;
	HasExpectedTC = true;
	}
	}

	if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
	DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
	<< "This loop is worth vectorizing only if no scalar "
	<< "iteration overheads are incurred.");
	if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
	DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
	else {
	DEBUG(dbgs() << "\n");
	// Loops with a very small trip count are considered for vectorization
	// under OptForSize, thereby making sure the cost of their loop body is
	// dominant, free of runtime guards and scalar iteration overheads.
	OptForSize = true;
	}
	}

	// Check the function attributes to see if implicit floats are allowed.
	// FIXME: This check doesn't seem possibly correct -- what if the loop is
	// an integer loop and the vector instructions selected are purely integer
	// vector instructions?
	if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
	DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
	"attribute is used.\n");
	ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
	"NoImplicitFloat", L)
	<< "loop not vectorized due to NoImplicitFloat attribute");
	emitMissedWarning(F, L, Hints, ORE);
	return false;
	}

	// Check if the target supports potentially unsafe FP vectorization.
	// FIXME: Add a check for the type of safety issue (denormal, signaling)
	// for the target we're vectorizing for, to make sure none of the
	// additional fp-math flags can help.
	if (Hints.isPotentiallyUnsafe() &&
	TTI->isFPVectorizationPotentiallyUnsafe()) {
	DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
	ORE->emit(
	createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
	<< "loop not vectorized due to unsafe FP support.");
	emitMissedWarning(F, L, Hints, ORE);
	return false;
	}

	// Use the cost model.
	LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
	&Hints);
	CM.collectValuesToIgnore();

	// Use the planner for vectorization.
	LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

	// Get user vectorization factor.
	unsigned UserVF = Hints.getWidth();

	// Plan how to best vectorize, return the best VF and its cost.
	LoopVectorizationCostModel::VectorizationFactor VF =
	LVP.plan(OptForSize, UserVF);

	// Select the interleave count.
	unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);

	// Get user interleave count.
	unsigned UserIC = Hints.getInterleave();

	// Identify the diagnostic messages that should be produced.
	std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
	bool VectorizeLoop = true, InterleaveLoop = true;
	if (Requirements.doesNotMeet(F, L, Hints)) {
	DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
	"requirements.\n");
	emitMissedWarning(F, L, Hints, ORE);
	return false;
	}

	if (VF.Width == 1) {
	DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
	VecDiagMsg = std::make_pair(
	"VectorizationNotBeneficial",
	"the cost-model indicates that vectorization is not beneficial");
	VectorizeLoop = false;
	}

	if (IC == 1 && UserIC <= 1) {
	// Tell the user interleaving is not beneficial.
	DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
	IntDiagMsg = std::make_pair(
	"InterleavingNotBeneficial",
	"the cost-model indicates that interleaving is not beneficial");
	InterleaveLoop = false;
	if (UserIC == 1) {
	IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
	IntDiagMsg.second +=
	" and is explicitly disabled or interleave count is set to 1";
	}
	} else if (IC > 1 && UserIC == 1) {
	// Tell the user interleaving is beneficial, but it explicitly disabled.
	DEBUG(dbgs()
	<< "LV: Interleaving is beneficial but is explicitly disabled.");
	IntDiagMsg = std::make_pair(
	"InterleavingBeneficialButDisabled",
	"the cost-model indicates that interleaving is beneficial "
	"but is explicitly disabled or interleave count is set to 1");
	InterleaveLoop = false;
	}

	// Override IC if user provided an interleave count.
	IC = UserIC > 0 ? UserIC : IC;

	// Emit diagnostic messages, if any.
	const char *VAPassName = Hints.vectorizeAnalysisPassName();
	if (!VectorizeLoop && !InterleaveLoop) {
	// Do not vectorize or interleaving the loop.
	ORE->emit([&]() {
	return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
	L->getStartLoc(), L->getHeader())
	<< VecDiagMsg.second;
	});
	ORE->emit([&]() {
	return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
	L->getStartLoc(), L->getHeader())
	<< IntDiagMsg.second;
	});
	return false;
	} else if (!VectorizeLoop && InterleaveLoop) {
	DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
	ORE->emit([&]() {
	return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
	L->getStartLoc(), L->getHeader())
	<< VecDiagMsg.second;
	});
	} else if (VectorizeLoop && !InterleaveLoop) {
	DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
	<< DebugLocStr << '\n');
	ORE->emit([&]() {
	return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
	L->getStartLoc(), L->getHeader())
	<< IntDiagMsg.second;
	});
	} else if (VectorizeLoop && InterleaveLoop) {
	DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
	<< DebugLocStr << '\n');
	DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
	}

	LVP.setBestPlan(VF.Width, IC);

	using namespace ore;

	if (!VectorizeLoop) {
	assert(IC > 1 && "interleave count should not be 1 or 0");
	// If we decided that it is not legal to vectorize the loop, then
	// interleave it.
	InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
	&CM);
	LVP.executePlan(Unroller, DT);

	ORE->emit([&]() {
	return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
	L->getHeader())
	<< "interleaved loop (interleaved count: "
	<< NV("InterleaveCount", IC) << ")";
	});
	} else {
	// If we decided that it is legal to vectorize the loop, then do it.
	InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
	&LVL, &CM);
	LVP.executePlan(LB, DT);
	++LoopsVectorized;

	// Add metadata to disable runtime unrolling a scalar loop when there are
	// no runtime checks about strides and memory. A scalar loop that is
	// rarely used is not worth unrolling.
	if (!LB.areSafetyChecksAdded())
	AddRuntimeUnrollDisableMetaData(L);

	// Report the vectorization decision.
	ORE->emit([&]() {
	return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
	L->getHeader())
	<< "vectorized loop (vectorization width: "
	<< NV("VectorizationFactor", VF.Width)
	<< ", interleaved count: " << NV("InterleaveCount", IC) << ")";
	});
	}

	// Mark the loop as already vectorized to avoid vectorizing again.
	Hints.setAlreadyVectorized();

	DEBUG(verifyFunction(*L->getHeader()->getParent()));
	return true;
	}

	/// Runs the loop vectorizer over the loops recorded in \p LI_.
	///
	/// Stores the supplied analyses in the pass members, simplifies every
	/// top-level loop, gathers candidate loops with addAcyclicInnerLoop and hands
	/// each one to processLoop.
	///
	/// \returns true if loop simplification, LCSSA formation or processLoop
	/// reported a change.
	bool LoopVectorizePass::runImpl(
	    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
	    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
	    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
	    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
	    OptimizationRemarkEmitter &ORE_) {
	  SE = &SE_;
	  LI = &LI_;
	  TTI = &TTI_;
	  DT = &DT_;
	  BFI = &BFI_;
	  TLI = TLI_;
	  AA = &AA_;
	  AC = &AC_;
	  GetLAA = &GetLAA_;
	  DB = &DB_;
	  ORE = &ORE_;

	  // Don't attempt if
	  // 1. the target claims to have no vector registers, and
	  // 2. interleaving won't help ILP.
	  //
	  // The second condition is necessary because, even if the target has no
	  // vector registers, loop vectorization may still enable scalar
	  // interleaving.
	  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
	    return false;

	  bool Changed = false;

	  // The vectorizer requires loops to be in simplified form.
	  // Since simplification may add new inner loops, it has to run before the
	  // legality and profitability checks. This means running the loop vectorizer
	  // will simplify all loops, regardless of whether anything ends up being
	  // vectorized.
	  for (auto &L : *LI)
	    Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);

	  // Build up a worklist of inner-loops to vectorize. This is necessary as
	  // the act of vectorizing or partially unrolling a loop creates new loops
	  // and can invalidate iterators across the loops.
	  SmallVector<Loop *, 8> Worklist;

	  for (Loop *L : *LI)
	    addAcyclicInnerLoop(*L, Worklist);

	  LoopsAnalyzed += Worklist.size();

	  // Now walk the identified inner loops.
	  while (!Worklist.empty()) {
	    Loop *L = Worklist.pop_back_val();

	    // For the inner loops we actually process, form LCSSA to simplify the
	    // transform.
	    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

	    Changed |= processLoop(L);
	  }

	  // Report whether any step above modified the function.
	  return Changed;
	}

	/// New-pass-manager entry point: fetches the analyses runImpl needs from
	/// \p AM, runs the vectorizer on \p F and reports which analyses survive.
	PreservedAnalyses LoopVectorizePass::run(Function &F,
	                                         FunctionAnalysisManager &AM) {
	  // Analyses are requested in a fixed order; each result is a reference owned
	  // by the analysis manager.
	  auto &ScalarEvo = AM.getResult<ScalarEvolutionAnalysis>(F);
	  auto &Loops = AM.getResult<LoopAnalysis>(F);
	  auto &TargetInfo = AM.getResult<TargetIRAnalysis>(F);
	  auto &DomTree = AM.getResult<DominatorTreeAnalysis>(F);
	  auto &BlockFreq = AM.getResult<BlockFrequencyAnalysis>(F);
	  auto &LibInfo = AM.getResult<TargetLibraryAnalysis>(F);
	  auto &Aliases = AM.getResult<AAManager>(F);
	  auto &Assumptions = AM.getResult<AssumptionAnalysis>(F);
	  auto &Demanded = AM.getResult<DemandedBitsAnalysis>(F);
	  auto &Remarks = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

	  auto &LoopAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();

	  // Callback that produces the LoopAccessInfo for one loop on demand.
	  std::function<const LoopAccessInfo &(Loop &)> LAAGetter =
	      [&](Loop &TheLoop) -> const LoopAccessInfo & {
	    LoopStandardAnalysisResults Results = {Aliases,   Assumptions, DomTree,
	                                           Loops,     ScalarEvo,   LibInfo,
	                                           TargetInfo, nullptr};
	    return LoopAM.getResult<LoopAccessAnalysis>(TheLoop, Results);
	  };

	  // Nothing changed: every analysis is still valid.
	  if (!runImpl(F, ScalarEvo, Loops, TargetInfo, DomTree, BlockFreq, &LibInfo,
	               Demanded, Aliases, Assumptions, LAAGetter, Remarks))
	    return PreservedAnalyses::all();

	  PreservedAnalyses Preserved;
	  Preserved.preserve<LoopAnalysis>();
	  Preserved.preserve<DominatorTreeAnalysis>();
	  Preserved.preserve<BasicAA>();
	  Preserved.preserve<GlobalsAA>();
	  return Preserved;
	}
	Index: vendor/llvm/dist-release_60/lib/Transforms/Vectorize/SLPVectorizer.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 328362)
	@@ -1,6014 +1,5973 @@
	//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
	// stores that can be put together into vector-stores. Next, it attempts to
	// construct vectorizable tree using the use-def chains. If a profitable tree
	// was found, the SLP vectorizer performs vectorization on the tree.
	//
	// The pass is inspired by the work described in the paper:
	// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/iterator.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/CodeMetrics.h"
	#include "llvm/Analysis/DemandedBits.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/LoopAccessAnalysis.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
	#include "llvm/Analysis/ScalarEvolution.h"
	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/NoFolder.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PassManager.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/IR/Verifier.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/DOTGraphTraits.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/GraphWriter.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Utils/LoopUtils.h"
	#include "llvm/Transforms/Vectorize.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <memory>
	#include <set>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::PatternMatch;
	using namespace slpvectorizer;

	#define SV_NAME "slp-vectorizer"
	#define DEBUG_TYPE "SLP"

	STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

	/// Hidden option "slp-threshold" (default 0): gain threshold used when
	/// deciding whether to vectorize.
	static cl::opt<int>
	SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
	cl::desc("Only vectorize if you gain more than this "
	"number "));

	/// Hidden option "slp-vectorize-hor" (default true): enables attempts to
	/// vectorize horizontal reductions.
	static cl::opt<bool>
	ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
	cl::desc("Attempt to vectorize horizontal reductions"));

	/// Hidden option "slp-vectorize-hor-store" (default false): enables attempts
	/// to vectorize horizontal reductions that feed into a store.
	static cl::opt<bool> ShouldStartVectorizeHorAtStore(
	"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
	cl::desc(
	"Attempt to vectorize horizontal reductions feeding into a store"));

	/// Hidden option "slp-max-reg-size" (default 128): register size in bits.
	/// When given on the command line it overrides the target-reported maximum
	/// vector register width in the BoUpSLP constructor.
	static cl::opt<int>
	MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
	cl::desc("Attempt to vectorize for this register size in bits"));

	/// Limits the size of scheduling regions in a block.
	/// It avoids long compile times for _very_ large blocks where vector
	/// instructions are spread over a wide range.
	/// This limit is way higher than needed by real-world functions.
	static cl::opt<int>
	ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
	cl::desc("Limit the size of the SLP scheduling region per block"));

	/// Hidden option "slp-min-reg-size" (default 128): register size in bits.
	/// When given on the command line it overrides the target-reported minimum
	/// vector register width in the BoUpSLP constructor.
	/// NOTE(review): the description string is identical to the one used for
	/// "slp-max-reg-size" — confirm whether that is intended.
	static cl::opt<int> MinVectorRegSizeOption(
	"slp-min-reg-size", cl::init(128), cl::Hidden,
	cl::desc("Attempt to vectorize for this register size in bits"));

	/// Hidden option "slp-recursion-max-depth" (default 12): recursion depth
	/// limit used while building a vectorizable tree.
	static cl::opt<unsigned> RecursionMaxDepth(
	"slp-recursion-max-depth", cl::init(12), cl::Hidden,
	cl::desc("Limit the recursion depth when building a vectorizable tree"));

	/// Hidden option "slp-min-tree-size" (default 3): trees smaller than this are
	/// only vectorized if they are fully vectorizable.
	static cl::opt<unsigned> MinTreeSize(
	"slp-min-tree-size", cl::init(3), cl::Hidden,
	cl::desc("Only vectorize small trees if they are fully vectorizable"));

	/// Hidden option "view-slp-tree": display the SLP trees with Graphviz.
	static cl::opt<bool>
	ViewSLPTree("view-slp-tree", cl::Hidden,
	cl::desc("Display the SLP trees with Graphviz"));

	// Limit the number of alias checks. The limit is chosen so that
	// it has no negative effect on the llvm benchmarks.
	static const unsigned AliasedCheckLimit = 10;

	// Another limit for the alias checks: The maximum distance between load/store
	// instructions where alias checks are done.
	// This limit is useful for very large basic blocks.
	static const unsigned MaxMemDepDistance = 160;

	/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
	/// regions to be handled.
	static const int MinScheduleRegionSize = 16;

	/// \brief Tells whether the SLP vectorizer accepts \p Ty as an element type.
	///
	/// A type is rejected when LLVM does not allow it inside a vector, or when it
	/// is x86_fp80 or ppc_fp128: those have no meaningful vectorization path, so
	/// filtering them here avoids consulting the cost model only to learn that
	/// they would be scalarized anyway.
	static bool isValidElementType(Type *Ty) {
	  if (!VectorType::isValidElementType(Ty))
	    return false;
	  return !(Ty->isX86_FP80Ty() || Ty->isPPC_FP128Ty());
	}

	/// \returns true when every value in \p VL is an instruction and all of them
	/// live in the same basic block as the first one; false otherwise.
	static bool allSameBlock(ArrayRef<Value *> VL) {
	  auto *First = dyn_cast<Instruction>(VL[0]);
	  if (!First)
	    return false;
	  BasicBlock *Parent = First->getParent();
	  // Every remaining value must be an instruction located in Parent.
	  for (Value *V : VL.drop_front()) {
	    auto *Inst = dyn_cast<Instruction>(V);
	    if (!Inst || Inst->getParent() != Parent)
	      return false;
	  }
	  return true;
	}

	/// \returns True when no value in \p VL fails the isa<Constant> test (an
	/// empty list therefore yields true).
	static bool allConstant(ArrayRef<Value *> VL) {
	  return llvm::all_of(VL, [](Value *V) { return isa<Constant>(V); });
	}

	/// \returns True when every value in \p VL is the same Value pointer as the
	/// first element (an empty list yields true).
	static bool isSplat(ArrayRef<Value *> VL) {
	  return llvm::all_of(VL, [&VL](Value *V) { return V == VL.front(); });
	}

	/// Checks if the vector of instructions can be represented as a shuffle, like:
	/// %x0 = extractelement <4 x i8> %x, i32 0
	/// %x3 = extractelement <4 x i8> %x, i32 3
	/// %y1 = extractelement <4 x i8> %y, i32 1
	/// %y2 = extractelement <4 x i8> %y, i32 2
	/// %x0x0 = mul i8 %x0, %x0
	/// %x3x3 = mul i8 %x3, %x3
	/// %y1y1 = mul i8 %y1, %y1
	/// %y2y2 = mul i8 %y2, %y2
	/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
	/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
	/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
	/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
	/// ret <4 x i8> %ins4
	/// can be transformed into:
	/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
	/// i32 6>
	/// %2 = mul <4 x i8> %1, %1
	/// ret <4 x i8> %2
	/// We convert this initially to something like:
	/// %x0 = extractelement <4 x i8> %x, i32 0
	/// %x3 = extractelement <4 x i8> %x, i32 3
	/// %y1 = extractelement <4 x i8> %y, i32 1
	/// %y2 = extractelement <4 x i8> %y, i32 2
	/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
	/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
	/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
	/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
	/// %5 = mul <4 x i8> %4, %4
	/// %6 = extractelement <4 x i8> %5, i32 0
	/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
	/// %7 = extractelement <4 x i8> %5, i32 1
	/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
	/// %8 = extractelement <4 x i8> %5, i32 2
	/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
	/// %9 = extractelement <4 x i8> %5, i32 3
	/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
	/// ret <4 x i8> %ins4
	/// InstCombiner transforms this into a shuffle and vector mul
	static Optional<TargetTransformInfo::ShuffleKind>
	isShuffle(ArrayRef<Value *> VL) {
	  // Every element of VL must be an extractelement; cast<> asserts otherwise.
	  auto *EI0 = cast<ExtractElementInst>(VL[0]);
	  unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
	  // The (at most two) distinct source vectors seen so far.
	  Value *Vec1 = nullptr;
	  Value *Vec2 = nullptr;
	  enum ShuffleMode { Unknown, FirstAlternate, SecondAlternate, Permute };
	  ShuffleMode CommonShuffleMode = Unknown;
	  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
	    auto *EI = cast<ExtractElementInst>(VL[I]);
	    auto *Vec = EI->getVectorOperand();
	    // All vector operands must have the same number of vector elements.
	    if (Vec->getType()->getVectorNumElements() != Size)
	      return None;
	    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
	    if (!Idx)
	      return None;
	    // Undefined behavior if Idx is negative or >= Size.
	    if (Idx->getValue().uge(Size))
	      continue;
	    unsigned IntIdx = Idx->getValue().getZExtValue();
	    // We can extractelement from undef vector.
	    if (isa<UndefValue>(Vec))
	      continue;
	    // For correct shuffling we have to have at most 2 different vector
	    // operands in all extractelement instructions. Record the source before
	    // any early 'continue' below so that sources first seen while already in
	    // Permute mode are still counted.
	    if (!Vec1 || Vec1 == Vec)
	      Vec1 = Vec;
	    else if (!Vec2 || Vec2 == Vec)
	      Vec2 = Vec;
	    else
	      return None;
	    if (CommonShuffleMode == Permute)
	      continue;
	    // If the extract index is not the same as the operation number, it is a
	    // permutation.
	    if (IntIdx != I) {
	      CommonShuffleMode = Permute;
	      continue;
	    }
	    // Check the shuffle mode for the current operation.
	    // Example: shufflevector A, B, <0,5,2,7>
	    // I is even and the lane comes from A - FirstAlternate shuffle.
	    // I is odd and the lane comes from B - FirstAlternate shuffle.
	    // Example: shufflevector A, B, <4,1,6,3>
	    // I is odd and the lane comes from A - SecondAlternate shuffle.
	    // I is even and the lane comes from B - SecondAlternate shuffle.
	    const bool IIsOdd = I & 1;
	    const bool CurrVecIsA = Vec == Vec1;
	    ShuffleMode CurrentShuffleMode =
	        ((!IIsOdd && CurrVecIsA) || (IIsOdd && !CurrVecIsA)) ? FirstAlternate
	                                                             : SecondAlternate;
	    // Common mode is not set or the same as the shuffle mode of the current
	    // operation - alternate.
	    if (CommonShuffleMode == Unknown)
	      CommonShuffleMode = CurrentShuffleMode;
	    // Common shuffle mode is not the same as the shuffle mode of the current
	    // operation - permutation.
	    if (CommonShuffleMode != CurrentShuffleMode)
	      CommonShuffleMode = Permute;
	  }
	  // If we're not crossing lanes in different vectors, consider it as blending.
	  if ((CommonShuffleMode == FirstAlternate ||
	       CommonShuffleMode == SecondAlternate) &&
	      Vec2)
	    return TargetTransformInfo::SK_Alternate;
	  // If Vec2 was never used, we have a permutation of a single vector, otherwise
	  // we have permutation of 2 vectors.
	  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
	              : TargetTransformInfo::SK_PermuteSingleSrc;
	}

	/// \returns the opcode that pairs with \p Op in an alternating sequence that
	/// can later be merged into a ShuffleVector instruction: FAdd and FSub map
	/// to each other, Add and Sub map to each other, and any other opcode
	/// yields 0.
	static unsigned getAltOpcode(unsigned Op) {
	  if (Op == Instruction::FAdd)
	    return Instruction::FSub;
	  if (Op == Instruction::FSub)
	    return Instruction::FAdd;
	  if (Op == Instruction::Add)
	    return Instruction::Sub;
	  if (Op == Instruction::Sub)
	    return Instruction::Add;
	  // No alternate opcode is defined for anything else.
	  return 0;
	}

	/// \returns true when \p Value is not divisible by two.
	static bool isOdd(unsigned Value) {
	  const unsigned Remainder = Value % 2u;
	  return Remainder != 0u;
	}

	/// \returns true when \p CheckedOpcode equals either \p Opcode or
	/// \p AltOpcode.
	static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
	                            unsigned CheckedOpcode) {
	  return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
	}

	/// Chooses the correct key for scheduling data. If \p Op is an instruction
	/// with the same (or alternate) opcode as \p OpValue, the key is \p Op.
	/// Otherwise, including when \p Op is not an instruction, the key is
	/// \p OpValue.
	///
	/// \p OpValue must be an Instruction whenever \p Op is one (cast<> asserts).
	static Value *isOneOf(Value *OpValue, Value *Op) {
	  auto *I = dyn_cast<Instruction>(Op);
	  if (!I)
	    return OpValue;
	  auto *OpInst = cast<Instruction>(OpValue);
	  unsigned OpInstOpcode = OpInst->getOpcode();
	  unsigned IOpcode = I->getOpcode();
	  if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
	    return Op;
	  return OpValue;
	}

	namespace {

	/// Summary of a list of instructions that is a candidate for vectorization.
	struct RawInstructionsData {
	  /// Opcode of the first instruction in the list; 0 until it is filled in.
	  unsigned Opcode{0};

	  /// True when at least one instruction in the list has an opcode different
	  /// from the main one.
	  bool HasAltOpcodes{false};
	};

	} // end anonymous namespace

	/// Inspects the candidate list \p VL and summarizes it.
	///
	/// \returns a default-constructed RawInstructionsData if any value in \p VL
	/// is not an instruction. Otherwise Opcode is the opcode of the first
	/// instruction and HasAltOpcodes tells whether any element has a different
	/// opcode.
	static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
	  auto *Front = dyn_cast<Instruction>(VL[0]);
	  if (!Front)
	    return {};
	  const unsigned MainOpcode = Front->getOpcode();
	  bool SawOtherOpcode = false;
	  // A single non-instruction anywhere in the list invalidates the result.
	  for (Value *V : VL) {
	    auto *Inst = dyn_cast<Instruction>(V);
	    if (!Inst)
	      return {};
	    if (Inst->getOpcode() != MainOpcode)
	      SawOtherOpcode = true;
	  }
	  RawInstructionsData Summary;
	  Summary.HasAltOpcodes = SawOtherOpcode;
	  Summary.Opcode = MainOpcode;
	  return Summary;
	}

	namespace {

	/// Main data required for vectorization of instructions.
	struct InstructionsState {
	/// The very first instruction in the list with the main opcode.
	Value *OpValue = nullptr;

	/// The main opcode for the list of instructions.
	unsigned Opcode = 0;

	/// Some of the instructions in the list have alternate opcodes.
	bool IsAltShuffle = false;

	InstructionsState() = default;
	InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
	: OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
	};

	} // end anonymous namespace

	/// \returns analysis of the Instructions in \p VL described in
	/// InstructionsState, the Opcode that we suppose the whole list
	/// could be vectorized even if its structure is diverse.
	///
	/// If every element has the opcode reported by getMainOpcode, that opcode is
	/// returned with IsAltShuffle false. If the opcodes differ, the list is
	/// accepted only when even positions use the main opcode and odd positions
	/// use its alternate (see getAltOpcode); otherwise the returned Opcode is 0.
	static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
	  auto Res = getMainOpcode(VL);
	  unsigned Opcode = Res.Opcode;
	  if (!Res.HasAltOpcodes)
	    return InstructionsState(VL[0], Opcode, false);
	  // From here on the list is known to mix opcodes, and getMainOpcode has
	  // verified that every element is an Instruction.
	  auto *OpInst = cast<Instruction>(VL[0]);
	  unsigned AltOpcode = getAltOpcode(Opcode);
	  // Examine each element in the list instructions VL to determine
	  // if some operations there could be considered as an alternative
	  // (for example as subtraction relates to addition operation).
	  for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
	    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
	    if (InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode))
	      return InstructionsState(OpInst, 0, false);
	  }
	  return InstructionsState(OpInst, Opcode, /*IsAltShuffle=*/true);
	}

	/// \returns true when every value in \p VL has the same type as the first
	/// element, false otherwise.
	static bool allSameType(ArrayRef<Value *> VL) {
	  Type *Expected = VL[0]->getType();
	  return llvm::none_of(VL.drop_front(), [Expected](Value *V) {
	    return V->getType() != Expected;
	  });
	}

	/// \returns True if Extract{Value,Element} instruction extracts element Idx.
	///
	/// \p Opcode selects how \p E is interpreted and must be either
	/// Instruction::ExtractElement or Instruction::ExtractValue. For
	/// ExtractElement the index operand must be a ConstantInt equal to \p Idx;
	/// for ExtractValue the instruction must have exactly one index, equal to
	/// \p Idx.
	static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
	  assert((Opcode == Instruction::ExtractElement ||
	          Opcode == Instruction::ExtractValue) &&
	         "Expected an extractelement or extractvalue opcode");
	  if (Opcode == Instruction::ExtractElement) {
	    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
	    return CI && CI->getZExtValue() == Idx;
	  }
	  ExtractValueInst *EI = cast<ExtractValueInst>(E);
	  return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
	}

	/// \returns True if in-tree use also needs extract. This refers to
	/// possible scalar operand in vectorized instruction.
	static bool InTreeUserNeedToExtract(Value Scalar, Instruction UserInst,
	TargetLibraryInfo *TLI) {
	unsigned Opcode = UserInst->getOpcode();
	switch (Opcode) {
	case Instruction::Load: {
	LoadInst *LI = cast<LoadInst>(UserInst);
	return (LI->getPointerOperand() == Scalar);
	}
	case Instruction::Store: {
	StoreInst *SI = cast<StoreInst>(UserInst);
	return (SI->getPointerOperand() == Scalar);
	}
	case Instruction::Call: {
	CallInst *CI = cast<CallInst>(UserInst);
	Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	if (hasVectorInstrinsicScalarOpd(ID, 1)) {
	return (CI->getArgOperand(1) == Scalar);
	}
	LLVM_FALLTHROUGH;
	}
	default:
	return false;
	}
	}

	/// \returns the AA location that is being accessed by the instruction: the
	/// location of the store or load \p I, or a default-constructed
	/// MemoryLocation for any other instruction. \p AA is not consulted.
	static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
	  if (StoreInst *SI = dyn_cast<StoreInst>(I))
	    return MemoryLocation::get(SI);
	  if (LoadInst *LI = dyn_cast<LoadInst>(I))
	    return MemoryLocation::get(LI);
	  return MemoryLocation();
	}

	/// \returns True unless \p I is a load or store that is not simple, or a
	/// memory intrinsic that is volatile. Instructions of any other kind are
	/// always reported as simple.
	static bool isSimple(Instruction *I) {
	  if (auto *Load = dyn_cast<LoadInst>(I))
	    return Load->isSimple();
	  if (auto *Store = dyn_cast<StoreInst>(I))
	    return Store->isSimple();
	  auto *MemOp = dyn_cast<MemIntrinsic>(I);
	  return !MemOp || !MemOp->isVolatile();
	}

	namespace llvm {

	namespace slpvectorizer {

	/// Bottom Up SLP Vectorizer.
	class BoUpSLP {
	public:
	// Container aliases used throughout the vectorizer.
	using ValueList = SmallVector<Value *, 8>;
	using InstrList = SmallVector<Instruction *, 16>;
	using ValueSet = SmallPtrSet<Value *, 16>;
	using StoreList = SmallVector<StoreInst *, 8>;
	// Maps a value to a small list of instructions, preserving insertion order.
	using ExtraValueToDebugLocsMap =
	    MapVector<Value *, SmallVector<Instruction *, 2>>;

	/// Stores the analyses the vectorizer works with, collects the ephemeral
	/// values of \p Func and selects the maximum and minimum vector register
	/// sizes.
	BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
	        TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
	        DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
	        const DataLayout *DL, OptimizationRemarkEmitter *ORE)
	    : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
	      DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
	  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
	  // Use the vector register size specified by the target unless overridden
	  // by a command-line option.
	  // TODO: It would be better to limit the vectorization factor based on
	  //       data type rather than just register size. For example, x86 AVX has
	  //       256-bit registers, but it does not support integer operations
	  //       at that width (that requires AVX2).
	  if (MaxVectorRegSizeOption.getNumOccurrences())
	    MaxVecRegSize = MaxVectorRegSizeOption;
	  else
	    MaxVecRegSize = TTI->getRegisterBitWidth(true);

	  // The minimum width follows the same rule: command line first, then TTI.
	  if (MinVectorRegSizeOption.getNumOccurrences())
	    MinVecRegSize = MinVectorRegSizeOption;
	  else
	    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
	}

	/// \brief Vectorize the tree that starts with the elements in \p VL.
	/// Returns the vectorized root.
	Value *vectorizeTree();

	/// Vectorize the tree but with the list of externally used values \p
	/// ExternallyUsedValues. Values in this MapVector can be replaced by the
	/// generated extractvalue instructions.
	Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

	/// \returns the cost incurred by unwanted spills and fills, caused by
	/// holding live values over call sites.
	int getSpillCost();

	/// \returns the vectorization cost of the subtree that starts at \p VL.
	/// A negative number means that this is profitable.
	int getTreeCost();

	/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
	/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
	void buildTree(ArrayRef<Value *> Roots,
	ArrayRef<Value *> UserIgnoreLst = None);

	/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
	/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
	/// into account (and updating it, if required) list of externally used
	/// values stored in \p ExternallyUsedValues.
	void buildTree(ArrayRef<Value *> Roots,
	ExtraValueToDebugLocsMap &ExternallyUsedValues,
	ArrayRef<Value *> UserIgnoreLst = None);

	/// Clear the internal data structures that are created by 'buildTree'.
	void deleteTree() {
	VectorizableTree.clear();
	ScalarToTreeEntry.clear();
	MustGather.clear();
	ExternalUses.clear();
	NumLoadsWantToKeepOrder = 0;
	NumLoadsWantToChangeOrder = 0;
	for (auto &Iter : BlocksSchedules) {
	BlockScheduling *BS = Iter.second.get();
	BS->clear();
	}
	MinBWs.clear();
	}

	/// \returns the number of entries currently held in VectorizableTree.
	unsigned getTreeSize() const { return VectorizableTree.size(); }

	/// \brief Perform LICM and CSE on the newly generated gather sequences.
	void optimizeGatherSequence(Function &F);

	/// \returns true if it is beneficial to reverse the vector order, i.e. more
	/// load bundles were counted as wanting the reversed order than as wanting
	/// the current order.
	bool shouldReorder() const {
	return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
	}

	/// \return The vector element size in bits to use when vectorizing the
	/// expression tree ending at \p V. If V is a store, the size is the width of
	/// the stored value. Otherwise, the size is the width of the largest loaded
	/// value reaching V. This method is used by the vectorizer to calculate
	/// vectorization factors.
	unsigned getVectorElementSize(Value *V);

	/// Compute the minimum type sizes required to represent the entries in a
	/// vectorizable tree.
	void computeMinimumValueSizes();

	/// \returns maximum vector register size as set by TTI or overridden by
	/// cl::opt.
	unsigned getMaxVecRegSize() const {
	return MaxVecRegSize;
	}

	/// \returns minimum vector register size as set by cl::opt.
	unsigned getMinVecRegSize() const {
	return MinVecRegSize;
	}

	/// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
	///
	/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
	unsigned canMapToVector(Type *T, const DataLayout &DL) const;

	/// \returns True if the VectorizableTree is both tiny and not fully
	/// vectorizable. We do not vectorize such trees.
	bool isTreeTinyAndNotFullyVectorizable();

	/// \returns the optimization remark emitter stored in ORE.
	OptimizationRemarkEmitter *getORE() { return ORE; }

	private:
	struct TreeEntry;

	/// Checks if all users of \p I are the part of the vectorization tree.
	bool areAllUsersVectorized(Instruction *I) const;

	/// \returns the cost of the vectorizable entry.
	int getEntryCost(TreeEntry *E);

	/// This is the recursive part of buildTree.
	void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);

	/// \returns True if the ExtractElement/ExtractValue instructions in VL can
	/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
	bool canReuseExtract(ArrayRef<Value > VL, Value OpValue) const;

	/// Vectorize a single entry in the tree.
	Value vectorizeTree(TreeEntry E);

	/// Vectorize a single entry in the tree, starting in \p VL.
	Value vectorizeTree(ArrayRef<Value > VL);

	/// \returns the pointer to the vectorized value if \p VL is already
	/// vectorized, or NULL. They may happen in cycles.
	Value alreadyVectorized(ArrayRef<Value > VL, Value *OpValue) const;

	/// \returns the scalarization cost for this type. Scalarization in this
	/// context means the creation of vectors from a group of scalars.
	int getGatherCost(Type *Ty);

	/// \returns the scalarization cost for this list of values. Assuming that
	/// this subtree gets vectorized, we may need to extract the values from the
	/// roots. This method calculates the cost of extracting the values.
	int getGatherCost(ArrayRef<Value *> VL);

	/// \brief Set the Builder insert point to one after the last instruction in
	/// the bundle
	void setInsertPointAfterBundle(ArrayRef<Value > VL, Value OpValue);

	/// \returns a vector from a collection of scalars in \p VL.
	Value Gather(ArrayRef<Value > VL, VectorType *Ty);

	/// \returns whether the VectorizableTree is fully vectorizable and will
	/// be beneficial even the tree height is tiny.
	bool isFullyVectorizableTinyTree();

	/// \reorder commutative operands in alt shuffle if they result in
	/// vectorized code.
	void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
	SmallVectorImpl<Value *> &Left,
	SmallVectorImpl<Value *> &Right);

	/// \reorder commutative operands to get better probability of
	/// generating vectorized code.
	void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
	SmallVectorImpl<Value *> &Left,
	SmallVectorImpl<Value *> &Right);
	struct TreeEntry {
	TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}

	/// \returns true if the scalars in VL are equal to this entry.
	bool isSame(ArrayRef<Value *> VL) const {
	assert(VL.size() == Scalars.size() && "Invalid size");
	return std::equal(VL.begin(), VL.end(), Scalars.begin());
	}

	/// A vector of scalars.
	ValueList Scalars;

	/// The Scalars are vectorized into this value. It is initialized to Null.
	Value *VectorizedValue = nullptr;

	/// Do we need to gather this sequence ?
	bool NeedToGather = false;

	/// Points back to the VectorizableTree.
	///
	/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
	/// to be a pointer and needs to be able to initialize the child iterator.
	/// Thus we need a reference back to the container to translate the indices
	/// to entries.
	std::vector<TreeEntry> &Container;

	/// The TreeEntry index containing the user of this entry. We can actually
	/// have multiple users so the data structure is not truly a tree.
	SmallVector<int, 1> UserTreeIndices;
	};

	/// Create a new VectorizableTree entry holding the scalars in \p VL.
	///
	/// If \p Vectorized is true every scalar is registered in ScalarToTreeEntry;
	/// otherwise the entry is marked NeedToGather and the scalars are added to
	/// MustGather. When \p UserTreeIdx is non-negative it is recorded as a user
	/// of the new entry; on return \p UserTreeIdx holds the new entry's index.
	///
	/// \returns a pointer to the new entry. The pointer refers to an element of
	/// the std::vector VectorizableTree, so a later call that grows the vector
	/// may invalidate it.
	TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
	                        int &UserTreeIdx) {
	  VectorizableTree.emplace_back(VectorizableTree);
	  int idx = VectorizableTree.size() - 1;
	  TreeEntry *Last = &VectorizableTree[idx];
	  Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
	  Last->NeedToGather = !Vectorized;
	  if (Vectorized) {
	    for (Value *V : VL) {
	      assert(!getTreeEntry(V) && "Scalar already in tree!");
	      ScalarToTreeEntry[V] = idx;
	    }
	  } else {
	    MustGather.insert(VL.begin(), VL.end());
	  }

	  if (UserTreeIdx >= 0)
	    Last->UserTreeIndices.push_back(UserTreeIdx);
	  UserTreeIdx = idx;
	  return Last;
	}

	/// -- Vectorization State --
	/// Holds all of the tree entries.
	std::vector<TreeEntry> VectorizableTree;

	/// \returns the tree entry that \p V was registered under in
	/// ScalarToTreeEntry, or nullptr if \p V has no entry there.
	TreeEntry *getTreeEntry(Value *V) {
	  auto I = ScalarToTreeEntry.find(V);
	  if (I != ScalarToTreeEntry.end())
	    return &VectorizableTree[I->second];
	  return nullptr;
	}

	/// Const overload of getTreeEntry().
	const TreeEntry *getTreeEntry(Value *V) const {
	  auto I = ScalarToTreeEntry.find(V);
	  if (I != ScalarToTreeEntry.end())
	    return &VectorizableTree[I->second];
	  return nullptr;
	}

	/// Maps a specific scalar to its tree entry.
	SmallDenseMap<Value*, int> ScalarToTreeEntry;

	/// A list of scalars that we found that we need to keep as scalars.
	ValueSet MustGather;

	/// This POD struct describes one external user in the vectorized tree.
	struct ExternalUser {
	ExternalUser(Value S, llvm::User U, int L)
	: Scalar(S), User(U), Lane(L) {}

	// Which scalar in our function.
	Value *Scalar;

	// Which user that uses the scalar.
	llvm::User *User;

	// Which lane does the scalar belong to.
	int Lane;
	};
	using UserList = SmallVector<ExternalUser, 16>;

	/// Checks if two instructions may access the same memory.
	///
	/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
	/// is invariant in the calling loop.
	bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
	Instruction *Inst2) {
	// First check if the result is already in the cache.
	AliasCacheKey key = std::make_pair(Inst1, Inst2);
	Optional<bool> &result = AliasCache[key];
	if (result.hasValue()) {
	return result.getValue();
	}
	MemoryLocation Loc2 = getLocation(Inst2, AA);
	bool aliased = true;
	if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
	// Do the alias check.
	aliased = AA->alias(Loc1, Loc2);
	}
	// Store the result in the cache.
	result = aliased;
	return aliased;
	}

	/// Key of the alias cache: the (Inst1, Inst2) pair handed to isAliased().
	using AliasCacheKey = std::pair<Instruction *, Instruction *>;

	/// Cache for alias results.
	/// TODO: consider moving this to the AliasAnalysis itself.
	DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

	/// Removes an instruction from its block and eventually deletes it.
	/// It's like Instruction::eraseFromParent() except that the actual deletion
	/// is delayed until BoUpSLP is destructed.
	/// This is required to ensure that there are no incorrect collisions in the
	/// AliasCache, which can happen if a new instruction is allocated at the
	/// same address as a previously deleted instruction.
	void eraseInstruction(Instruction *I) {
	I->removeFromParent();
	I->dropAllReferences();
	// Park the detached instruction in DeletedInstructions instead of
	// deleting it now.
	DeletedInstructions.emplace_back(I);
	}

	/// Temporary store for deleted instructions. Instructions will be deleted
	/// eventually when the BoUpSLP is destructed.
	SmallVector<unique_value, 8> DeletedInstructions;

	/// A list of values that need to be extracted out of the tree.
	/// This list holds pairs of (Internal Scalar : External User). External User
	/// can be nullptr, it means that this Internal Scalar will be used later,
	/// after vectorization.
	UserList ExternalUses;

	/// Values used only by @llvm.assume calls.
	SmallPtrSet<const Value *, 32> EphValues;

	/// Holds all of the instructions that we gathered.
	SetVector<Instruction *> GatherSeq;

	/// A list of blocks that we are going to CSE.
	SetVector<BasicBlock *> CSEBlocks;

	/// Contains all scheduling relevant data for an instruction.
	/// A ScheduleData either represents a single instruction or a member of an
	/// instruction bundle (= a group of instructions which is combined into a
	/// vector instruction).
	struct ScheduleData {
	// The initial value for the dependency counters. It means that the
	// dependencies are not calculated yet.
	enum { InvalidDeps = -1 };

	ScheduleData() = default;

	/// (Re)initializes this node as a stand-alone, unscheduled entry that
	/// belongs to the region \p BlockSchedulingRegionID and has \p OpVal as its
	/// OpValue. All dependency information is cleared.
	void init(int BlockSchedulingRegionID, Value *OpVal) {
	// FirstInBundle must point at this node before clearDependencies() runs:
	// the counter update it performs goes through FirstInBundle.
	FirstInBundle = this;
	NextInBundle = nullptr;
	NextLoadStore = nullptr;
	IsScheduled = false;
	SchedulingRegionID = BlockSchedulingRegionID;
	UnscheduledDepsInBundle = UnscheduledDeps;
	clearDependencies();
	OpValue = OpVal;
	}

	/// Returns true if the dependency information has been calculated.
	bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

	/// Returns true for single instructions and for bundle representatives
	/// (= the head of a bundle).
	bool isSchedulingEntity() const { return FirstInBundle == this; }

	/// Returns true if it represents an instruction bundle and not only a
	/// single instruction, i.e. it either has a successor in the bundle list or
	/// is not its own bundle head.
	bool isPartOfBundle() const {
	  return NextInBundle != nullptr || FirstInBundle != this;
	}

	/// Returns true if it is ready for scheduling, i.e. it has no more
	/// unscheduled depending instructions/bundles.
	/// Must only be called on a scheduling entity (asserted).
	bool isReady() const {
	assert(isSchedulingEntity() &&
	"can't consider non-scheduling entity for ready list");
	return UnscheduledDepsInBundle == 0 && !IsScheduled;
	}

	/// Modifies the number of unscheduled dependencies, also updating it for
	/// the whole bundle.
	/// \returns the updated UnscheduledDepsInBundle of the bundle head.
	int incrementUnscheduledDeps(int Incr) {
	UnscheduledDeps += Incr;
	return FirstInBundle->UnscheduledDepsInBundle += Incr;
	}

	/// Sets the number of unscheduled dependencies to the number of
	/// dependencies.
	void resetUnscheduledDeps() {
	// Apply the difference so the bundle head's sum is adjusted by the same
	// amount as this node's own counter.
	incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
	}

	/// Clears all dependency information.
	void clearDependencies() {
	Dependencies = InvalidDeps;
	resetUnscheduledDeps();
	MemoryDependencies.clear();
	}

	/// Prints this node to \p os: "/ <inst>" for a non-head bundle member,
	/// "[<inst>;<inst>...]" for the head of a bundle, and just "<inst>" for a
	/// single instruction.
	void dump(raw_ostream &os) const {
	  // A member that is not the head of its bundle.
	  if (!isSchedulingEntity()) {
	    os << "/ " << *Inst;
	    return;
	  }

	  // A lone instruction.
	  if (!NextInBundle) {
	    os << *Inst;
	    return;
	  }

	  // The head of a bundle: list every member in chain order.
	  os << '[' << *Inst;
	  for (ScheduleData *Member = NextInBundle; Member;
	       Member = Member->NextInBundle)
	    os << ';' << *Member->Inst;
	  os << ']';
	}

	/// The instruction this ScheduleData describes.
	Instruction *Inst = nullptr;

	/// Points to the head in an instruction bundle (and always to this for
	/// single instructions).
	ScheduleData *FirstInBundle = nullptr;

	/// Single linked list of all instructions in a bundle. Null if it is a
	/// single instruction.
	ScheduleData *NextInBundle = nullptr;

	/// Single linked list of all memory instructions (e.g. load, store, call)
	/// in the block - until the end of the scheduling region.
	ScheduleData *NextLoadStore = nullptr;

	/// The dependent memory instructions.
	/// This list is derived on demand in calculateDependencies().
	SmallVector<ScheduleData *, 4> MemoryDependencies;

	/// This ScheduleData is in the current scheduling region if this matches
	/// the current SchedulingRegionID of BlockScheduling.
	int SchedulingRegionID = 0;

	/// Used for getting a "good" final ordering of instructions.
	int SchedulingPriority = 0;

	/// The number of dependencies. Constitutes of the number of users of the
	/// instruction plus the number of dependent memory instructions (if any).
	/// This value is calculated on demand.
	/// If InvalidDeps, the number of dependencies is not calculated yet.
	int Dependencies = InvalidDeps;

	/// The number of dependencies minus the number of dependencies of scheduled
	/// instructions. As soon as this is zero, the instruction/bundle gets ready
	/// for scheduling.
	/// Note that this is negative as long as Dependencies is not calculated.
	int UnscheduledDeps = InvalidDeps;

	/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
	/// single instructions.
	int UnscheduledDepsInBundle = InvalidDeps;

	/// True if this instruction is scheduled (or considered as scheduled in the
	/// dry-run).
	bool IsScheduled = false;

	/// The value passed to init() as OpVal.
	/// NOTE(review): the previous comment described this as the "opcode" of the
	/// instruction, but it is a Value pointer - confirm the intended wording.
	Value *OpValue = nullptr;
	};

	#ifndef NDEBUG
	/// Stream a ScheduleData by forwarding to ScheduleData::dump(). Only
	/// compiled when NDEBUG is not defined.
	friend inline raw_ostream &operator<<(raw_ostream &os,
	const BoUpSLP::ScheduleData &SD) {
	SD.dump(os);
	return os;
	}
	#endif

	// The graph-printing traits are friends so they can reach the private tree.
	friend struct GraphTraits<BoUpSLP *>;
	friend struct DOTGraphTraits<BoUpSLP *>;

	/// Contains all scheduling data for a basic block.
	struct BlockScheduling {
	BlockScheduling(BasicBlock *BB)
	: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

	/// Resets the scheduling region of this block.
	void clear() {
	  // Make a new scheduling region, i.e. all existing ScheduleData is not
	  // in the new region yet.
	  ++SchedulingRegionID;

	  // Forget the extent and contents of the previous region.
	  ReadyInsts.clear();
	  ScheduleStart = ScheduleEnd = nullptr;
	  FirstLoadStoreInRegion = LastLoadStoreInRegion = nullptr;

	  // Reduce the maximum schedule region size by the size of the previous
	  // scheduling run, but never below MinScheduleRegionSize.
	  ScheduleRegionSizeLimit -= ScheduleRegionSize;
	  ScheduleRegionSize = 0;
	  if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
	    ScheduleRegionSizeLimit = MinScheduleRegionSize;
	}

	/// \returns the ScheduleData attached to \p V if it belongs to the current
	/// scheduling region, nullptr otherwise.
	ScheduleData *getScheduleData(Value *V) {
	  // operator[] creates a null slot for an unknown V; the null check below
	  // covers that case.
	  ScheduleData *SD = ScheduleDataMap[V];
	  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
	    return SD;
	  return nullptr;
	}

	/// \returns the ScheduleData attached to \p V under the leading key \p Key
	/// if it belongs to the current scheduling region, nullptr otherwise.
	/// When \p V is its own key this is the same as getScheduleData(V).
	ScheduleData *getScheduleData(Value *V, Value *Key) {
	  if (V == Key)
	    return getScheduleData(V);
	  auto I = ExtraScheduleDataMap.find(V);
	  if (I != ExtraScheduleDataMap.end()) {
	    ScheduleData *SD = I->second[Key];
	    if (SD && SD->SchedulingRegionID == SchedulingRegionID)
	      return SD;
	  }
	  return nullptr;
	}

	/// \returns true if \p SD carries the current SchedulingRegionID.
	bool isInSchedulingRegion(ScheduleData *SD) {
	  return SD->SchedulingRegionID == SchedulingRegionID;
	}

	/// Marks an instruction as scheduled and puts all dependent ready
	/// instructions into the ready-list.
	///
	/// \p SD is walked along its NextInBundle chain; \p ReadyList only needs to
	/// provide insert(ScheduleData *).
	template <typename ReadyListType>
	void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
	SD->IsScheduled = true;
	DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

	ScheduleData *BundleMember = SD;
	while (BundleMember) {
	// Members whose instruction differs from their OpValue are skipped
	// without touching any dependency counters.
	if (BundleMember->Inst != BundleMember->OpValue) {
	BundleMember = BundleMember->NextInBundle;
	continue;
	}
	// Handle the def-use chain dependencies.
	for (Use &U : BundleMember->Inst->operands()) {
	auto *I = dyn_cast<Instruction>(U.get());
	if (!I)
	continue;
	doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
	if (OpDef && OpDef->hasValidDependencies() &&
	OpDef->incrementUnscheduledDeps(-1) == 0) {
	// There are no more unscheduled dependencies after
	// decrementing, so we can put the dependent instruction
	// into the ready list.
	ScheduleData *DepBundle = OpDef->FirstInBundle;
	assert(!DepBundle->IsScheduled &&
	"already scheduled bundle gets ready");
	ReadyList.insert(DepBundle);
	DEBUG(dbgs()
	<< "SLP: gets ready (def): " << *DepBundle << "\n");
	}
	});
	}
	// Handle the memory dependencies.
	for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
	if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
	// There are no more unscheduled dependencies after decrementing,
	// so we can put the dependent instruction into the ready list.
	ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
	assert(!DepBundle->IsScheduled &&
	"already scheduled bundle gets ready");
	ReadyList.insert(DepBundle);
	DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
	<< "\n");
	}
	}
	BundleMember = BundleMember->NextInBundle;
	}
	}

	/// Invokes \p Action on every ScheduleData attached to \p V that is in the
	/// current scheduling region: first the primary one, then each entry kept
	/// for \p V in ExtraScheduleDataMap.
	void doForAllOpcodes(Value *V,
	                     function_ref<void(ScheduleData *SD)> Action) {
	  if (ScheduleData *Primary = getScheduleData(V))
	    Action(Primary);

	  auto ExtraIt = ExtraScheduleDataMap.find(V);
	  if (ExtraIt == ExtraScheduleDataMap.end())
	    return;

	  for (auto &KeyAndData : ExtraIt->second) {
	    ScheduleData *Extra = KeyAndData.second;
	    if (Extra->SchedulingRegionID == SchedulingRegionID)
	      Action(Extra);
	  }
	}

	/// Walks the scheduling region from ScheduleStart up to (not including)
	/// ScheduleEnd and inserts every ready scheduling entity into \p ReadyList.
	template <typename ReadyListType>
	void initialFillReadyList(ReadyListType &ReadyList) {
	  auto *Cur = ScheduleStart;
	  while (Cur != ScheduleEnd) {
	    doForAllOpcodes(Cur, [&](ScheduleData *SD) {
	      // isReady() may only be asked of scheduling entities, so test that
	      // first.
	      if (!SD->isSchedulingEntity() || !SD->isReady())
	        return;
	      ReadyList.insert(SD);
	      DEBUG(dbgs() << "SLP: initially in ready list: " << *Cur << "\n");
	    });
	    Cur = Cur->getNextNode();
	  }
	}

	/// Checks if a bundle of instructions can be scheduled, i.e. has no
	/// cyclic dependencies. This is only a dry-run, no instructions are
	/// actually moved at this stage.
	bool tryScheduleBundle(ArrayRef<Value > VL, BoUpSLP SLP, Value *OpValue);

	/// Un-bundles a group of instructions.
	void cancelScheduling(ArrayRef<Value > VL, Value OpValue);

	/// Allocates schedule data chunk.
	ScheduleData *allocateScheduleDataChunks();

	/// Extends the scheduling region so that V is inside the region.
	/// \returns true if the region size is within the limit.
	bool extendSchedulingRegion(Value V, Value OpValue);

	/// Initialize the ScheduleData structures for new instructions in the
	/// scheduling region.
	void initScheduleData(Instruction FromI, Instruction ToI,
	ScheduleData *PrevLoadStore,
	ScheduleData *NextLoadStore);

	/// Updates the dependency information of a bundle and of all instructions/
	/// bundles which depend on the original bundle.
	void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
	BoUpSLP *SLP);

	/// Sets all instruction in the scheduling region to un-scheduled.
	void resetSchedule();

	BasicBlock *BB;

	/// Simple memory allocation for ScheduleData.
	std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

	/// The size of a ScheduleData array in ScheduleDataChunks.
	int ChunkSize;

	/// The allocator position in the current chunk, which is the last entry
	/// of ScheduleDataChunks.
	int ChunkPos;

	/// Attaches ScheduleData to Instruction.
	/// Note that the mapping survives during all vectorization iterations, i.e.
	/// ScheduleData structures are recycled.
	DenseMap<Value *, ScheduleData *> ScheduleDataMap;

	/// Attaches ScheduleData to Instruction with the leading key.
	DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
	    ExtraScheduleDataMap;

	struct ReadyList : SmallVector<ScheduleData *, 8> {
	void insert(ScheduleData *SD) { push_back(SD); }
	};

	/// The ready-list for scheduling (only used for the dry-run).
	ReadyList ReadyInsts;

	/// The first instruction of the scheduling region.
	Instruction *ScheduleStart = nullptr;

	/// The first instruction _after_ the scheduling region.
	Instruction *ScheduleEnd = nullptr;

	/// The first memory accessing instruction in the scheduling region
	/// (can be null).
	ScheduleData *FirstLoadStoreInRegion = nullptr;

	/// The last memory accessing instruction in the scheduling region
	/// (can be null).
	ScheduleData *LastLoadStoreInRegion = nullptr;

	/// The current size of the scheduling region.
	int ScheduleRegionSize = 0;

	/// The maximum size allowed for the scheduling region.
	int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

	/// The ID of the scheduling region. For a new vectorization iteration this
	/// is incremented which "removes" all ScheduleData from the region.
	// Make sure that the initial SchedulingRegionID is greater than the
	// initial SchedulingRegionID in ScheduleData (which is 0).
	int SchedulingRegionID = 1;
	};

	/// Attaches the BlockScheduling structures to basic blocks.
	MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

	/// Performs the "real" scheduling. Done before vectorization is actually
	/// performed in a basic block.
	void scheduleBlock(BlockScheduling *BS);

	/// List of users to ignore during scheduling and that don't need extracting.
	ArrayRef<Value *> UserIgnoreList;

	// Number of load bundles that contain consecutive loads.
	int NumLoadsWantToKeepOrder = 0;

	// Number of load bundles that contain consecutive loads in reversed order.
	int NumLoadsWantToChangeOrder = 0;

	// Analysis and block reference.
	Function *F;
	ScalarEvolution *SE;
	TargetTransformInfo *TTI;
	TargetLibraryInfo *TLI;
	AliasAnalysis *AA;
	LoopInfo *LI;
	DominatorTree *DT;
	AssumptionCache *AC;
	DemandedBits *DB;
	const DataLayout *DL;
	OptimizationRemarkEmitter *ORE;

	unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
	unsigned MinVecRegSize; // Set by cl::opt (default: 128).

	/// Instruction builder to construct the vectorized tree.
	IRBuilder<> Builder;

	/// A map of scalar integer values to the smallest bit width with which they
	/// can legally be represented. The values map to (width, signed) pairs,
	/// where "width" indicates the minimum bit width and "signed" is True if the
	/// value must be signed-extended, rather than zero-extended, back to its
	/// original width.
	MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
	};

	} // end namespace slpvectorizer

	/// Graph traits exposing the BoUpSLP vectorizable tree: nodes are TreeEntry
	/// pointers and a node's children are the entries listed in its
	/// UserTreeIndices.
	template <> struct GraphTraits<BoUpSLP *> {
	  using TreeEntry = BoUpSLP::TreeEntry;

	  /// NodeRef has to be a pointer per the GraphWriter.
	  using NodeRef = TreeEntry *;

	  /// \brief Add the VectorizableTree to the index iterator to be able to
	  /// return TreeEntry pointers.
	  struct ChildIteratorType
	      : public iterator_adaptor_base<ChildIteratorType,
	                                     SmallVector<int, 1>::iterator> {
	    std::vector<TreeEntry> &VectorizableTree;

	    ChildIteratorType(SmallVector<int, 1>::iterator W,
	                      std::vector<TreeEntry> &VT)
	        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

	    /// Translate the index the wrapped iterator points at into the
	    /// corresponding tree entry.
	    NodeRef operator*() { return &VectorizableTree[*I]; }
	  };

	  static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }

	  static ChildIteratorType child_begin(NodeRef N) {
	    return {N->UserTreeIndices.begin(), N->Container};
	  }

	  static ChildIteratorType child_end(NodeRef N) {
	    return {N->UserTreeIndices.end(), N->Container};
	  }

	  /// For the node iterator we just need to turn the TreeEntry iterator into a
	  /// TreeEntry* iterator so that it dereferences to NodeRef.
	  using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;

	  static nodes_iterator nodes_begin(BoUpSLP *R) {
	    return nodes_iterator(R->VectorizableTree.begin());
	  }

	  static nodes_iterator nodes_end(BoUpSLP *R) {
	    return nodes_iterator(R->VectorizableTree.end());
	  }

	  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
	};

	template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
	using TreeEntry = BoUpSLP::TreeEntry;

	DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}

	std::string getNodeLabel(const TreeEntry Entry, const BoUpSLP R) {
	std::string Str;
	raw_string_ostream OS(Str);
	if (isSplat(Entry->Scalars)) {
	OS << "<splat> " << *Entry->Scalars[0];
	return Str;
	}
	for (auto V : Entry->Scalars) {
	OS << *V;
	if (std::any_of(
	R->ExternalUses.begin(), R->ExternalUses.end(),
	[&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
	OS << " <extract>";
	OS << "\n";
	}
	return Str;
	}

	static std::string getNodeAttributes(const TreeEntry *Entry,
	const BoUpSLP *) {
	if (Entry->NeedToGather)
	return "color=red";
	return "";
	}
	};

	} // end namespace llvm

	/// Convenience overload: builds the tree with an empty, local map of
	/// externally used values and forwards to the three-argument buildTree().
	void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
	ArrayRef<Value *> UserIgnoreLst) {
	ExtraValueToDebugLocsMap ExternallyUsedValues;
	buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
	}

	/// Builds the vectorizable tree rooted at \p Roots and then collects, in
	/// ExternalUses, every vectorized scalar that has to be extracted again.
	///
	/// Any previous tree is deleted first. Nothing is built when the roots do
	/// not all share one type. A scalar is recorded as externally used when it
	/// is a key of \p ExternallyUsedValues (recorded with a null user), and once
	/// for each instruction user that neither disappears into the tree nor is
	/// listed in \p UserIgnoreLst.
	void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
	                        ExtraValueToDebugLocsMap &ExternallyUsedValues,
	                        ArrayRef<Value *> UserIgnoreLst) {
	  deleteTree();
	  UserIgnoreList = UserIgnoreLst;
	  if (!allSameType(Roots))
	    return;
	  buildTree_rec(Roots, 0, -1);

	  // Collect the values that we need to extract from the tree.
	  for (TreeEntry &EIdx : VectorizableTree) {
	    TreeEntry *Entry = &EIdx;

	    // No need to handle users of gathered values.
	    if (Entry->NeedToGather)
	      continue;

	    // For each lane:
	    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
	      Value *Scalar = Entry->Scalars[Lane];

	      // Check if the scalar is externally used as an extra arg.
	      auto ExtI = ExternallyUsedValues.find(Scalar);
	      if (ExtI != ExternallyUsedValues.end()) {
	        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
	              Lane << " from " << *Scalar << ".\n");
	        ExternalUses.emplace_back(Scalar, nullptr, Lane);
	        // No 'continue' here: the scalar's ordinary users are still
	        // examined below.
	      }
	      for (User *U : Scalar->users()) {
	        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

	        Instruction *UserInst = dyn_cast<Instruction>(U);
	        if (!UserInst)
	          continue;

	        // Skip in-tree scalars that become vectors
	        if (TreeEntry *UseEntry = getTreeEntry(U)) {
	          Value *UseScalar = UseEntry->Scalars[0];
	          // Some in-tree scalars will remain as scalar in vectorized
	          // instructions. If that is the case, the one in Lane 0 will
	          // be used.
	          if (UseScalar != U ||
	              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
	            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
	                         << ".\n");
	            assert(!UseEntry->NeedToGather && "Bad state");
	            continue;
	          }
	        }

	        // Ignore users in the user ignore list.
	        if (is_contained(UserIgnoreList, UserInst))
	          continue;

	        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
	              Lane << " from " << *Scalar << ".\n");
	        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
	      }
	    }
	  }
	}

	void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
	int UserTreeIdx) {
	assert((allConstant(VL) \|\| allSameType(VL)) && "Invalid types!");

	InstructionsState S = getSameOpcode(VL);
	if (Depth == RecursionMaxDepth) {
	DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}

	// Don't handle vectors.
	if (S.OpValue->getType()->isVectorTy()) {
	DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}

	if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
	if (SI->getValueOperand()->getType()->isVectorTy()) {
	DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}

	// If all of the operands are identical or constant we have a simple solution.
	if (allConstant(VL) \|\| isSplat(VL) \|\| !allSameBlock(VL) \|\| !S.Opcode) {
	DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}

	// We now know that this is a vector of instructions of the same type from
	// the same block.

	// Don't vectorize ephemeral values.
	for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	if (EphValues.count(VL[i])) {
	DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
	") is ephemeral.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	// Check if this is a duplicate of another entry.
	if (TreeEntry *E = getTreeEntry(S.OpValue)) {
	for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
	if (E->Scalars[i] != VL[i]) {
	DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}
	// Record the reuse of the tree node. FIXME, currently this is only used to
	// properly draw the graph rather than for the actual vectorization.
	E->UserTreeIndices.push_back(UserTreeIdx);
	DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
	return;
	}

	// Check that none of the instructions in the bundle are already in the tree.
	for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	auto *I = dyn_cast<Instruction>(VL[i]);
	if (!I)
	continue;
	if (getTreeEntry(I)) {
	DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
	") is already in tree.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	// If any of the scalars is marked as a value that needs to stay scalar, then
	// we need to gather the scalars.
	for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	if (MustGather.count(VL[i])) {
	DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	// Check that all of the users of the scalars that we want to vectorize are
	// schedulable.
	auto *VL0 = cast<Instruction>(S.OpValue);
	BasicBlock *BB = VL0->getParent();

	if (!DT->isReachableFromEntry(BB)) {
	// Don't go into unreachable blocks. They may contain instructions with
	// dependency cycles which confuse the final scheduling.
	DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}

	// Check that every instruction appears once in this bundle.
	for (unsigned i = 0, e = VL.size(); i < e; ++i)
	for (unsigned j = i + 1; j < e; ++j)
	if (VL[i] == VL[j]) {
	DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}

	auto &BSRef = BlocksSchedules[BB];
	if (!BSRef)
	BSRef = llvm::make_unique<BlockScheduling>(BB);

	BlockScheduling &BS = *BSRef.get();

	if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
	DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
	assert((!BS.getScheduleData(VL0) \|\|
	!BS.getScheduleData(VL0)->isPartOfBundle()) &&
	"tryScheduleBundle should cancelScheduling on failure");
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

	unsigned ShuffleOrOp = S.IsAltShuffle ?
	(unsigned) Instruction::ShuffleVector : S.Opcode;
	switch (ShuffleOrOp) {
	case Instruction::PHI: {
	PHINode *PH = dyn_cast<PHINode>(VL0);

	// Check for terminator values (e.g. invoke).
	for (unsigned j = 0; j < VL.size(); ++j)
	for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
	TerminatorInst *Term = dyn_cast<TerminatorInst>(
	cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
	if (Term) {
	DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

	for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL)
	Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
	PH->getIncomingBlock(i)));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;
	}
	case Instruction::ExtractValue:
	case Instruction::ExtractElement: {
	bool Reuse = canReuseExtract(VL, VL0);
	if (Reuse) {
	DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
	} else {
	BS.cancelScheduling(VL, VL0);
	}
	newTreeEntry(VL, Reuse, UserTreeIdx);
	return;
	}
	case Instruction::Load: {
	// Check that a vectorized load would load the same memory as a scalar
	// load. For example, we don't want to vectorize loads that are smaller
	// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
	// treats loading/storing it as an i8 struct. If we vectorize loads/stores
	// from such a struct, we read/write packed bits disagreeing with the
	// unvectorized version.
	Type *ScalarTy = VL0->getType();

	if (DL->getTypeSizeInBits(ScalarTy) !=
	DL->getTypeAllocSizeInBits(ScalarTy)) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
	return;
	}

	// Make sure all loads in the bundle are simple - we can't vectorize
	// atomic or volatile loads.
	for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
	LoadInst *L = cast<LoadInst>(VL[i]);
	if (!L->isSimple()) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
	return;
	}
	}

	// Check if the loads are consecutive, reversed, or neither.
	// TODO: What we really want is to sort the loads, but for now, check
	// the two likely directions.
	bool Consecutive = true;
	bool ReverseConsecutive = true;
	for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
	if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, SE)) {
	Consecutive = false;
	break;
	} else {
	ReverseConsecutive = false;
	}
	}

	if (Consecutive) {
	++NumLoadsWantToKeepOrder;
	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of loads.\n");
	return;
	}

	// If none of the load pairs were consecutive when checked in order,
	// check the reverse order.
	if (ReverseConsecutive)
	for (unsigned i = VL.size() - 1; i > 0; --i)
	if (!isConsecutiveAccess(VL[i], VL[i - 1], DL, SE)) {
	ReverseConsecutive = false;
	break;
	}

	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);

	if (ReverseConsecutive) {
	++NumLoadsWantToChangeOrder;
	DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
	} else {
	DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
	}
	return;
	}
	case Instruction::ZExt:
	case Instruction::SExt:
	case Instruction::FPToUI:
	case Instruction::FPToSI:
	case Instruction::FPExt:
	case Instruction::PtrToInt:
	case Instruction::IntToPtr:
	case Instruction::SIToFP:
	case Instruction::UIToFP:
	case Instruction::Trunc:
	case Instruction::FPTrunc:
	case Instruction::BitCast: {
	Type *SrcTy = VL0->getOperand(0)->getType();
	for (unsigned i = 0; i < VL.size(); ++i) {
	Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
	if (Ty != SrcTy \|\| !isValidElementType(Ty)) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
	return;
	}
	}
	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of casts.\n");

	for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL)
	Operands.push_back(cast<Instruction>(j)->getOperand(i));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;
	}
	case Instruction::ICmp:
	case Instruction::FCmp: {
	// Check that all of the compares have the same predicate.
	CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
	Type *ComparedTy = VL0->getOperand(0)->getType();
	for (unsigned i = 1, e = VL.size(); i < e; ++i) {
	CmpInst *Cmp = cast<CmpInst>(VL[i]);
	if (Cmp->getPredicate() != P0 \|\|
	Cmp->getOperand(0)->getType() != ComparedTy) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
	return;
	}
	}

	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of compares.\n");

	for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL)
	Operands.push_back(cast<Instruction>(j)->getOperand(i));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;
	}
	case Instruction::Select:
	case Instruction::Add:
	case Instruction::FAdd:
	case Instruction::Sub:
	case Instruction::FSub:
	case Instruction::Mul:
	case Instruction::FMul:
	case Instruction::UDiv:
	case Instruction::SDiv:
	case Instruction::FDiv:
	case Instruction::URem:
	case Instruction::SRem:
	case Instruction::FRem:
	case Instruction::Shl:
	case Instruction::LShr:
	case Instruction::AShr:
	case Instruction::And:
	case Instruction::Or:
	case Instruction::Xor:
	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

	// Sort operands of the instructions so that each side is more likely to
	// have the same opcode.
	if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
	ValueList Left, Right;
	reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
	buildTree_rec(Left, Depth + 1, UserTreeIdx);
	buildTree_rec(Right, Depth + 1, UserTreeIdx);
	return;
	}

	for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL)
	Operands.push_back(cast<Instruction>(j)->getOperand(i));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;

	case Instruction::GetElementPtr: {
	// We don't combine GEPs with complicated (nested) indexing.
	for (unsigned j = 0; j < VL.size(); ++j) {
	if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
	DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	// We can't combine several GEPs into one vector if they operate on
	// different types.
	Type *Ty0 = VL0->getOperand(0)->getType();
	for (unsigned j = 0; j < VL.size(); ++j) {
	Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
	if (Ty0 != CurTy) {
	DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	// We don't combine GEPs with non-constant indexes.
	for (unsigned j = 0; j < VL.size(); ++j) {
	auto Op = cast<Instruction>(VL[j])->getOperand(1);
	if (!isa<ConstantInt>(Op)) {
	DEBUG(
	dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	return;
	}
	}

	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
	for (unsigned i = 0, e = 2; i < e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL)
	Operands.push_back(cast<Instruction>(j)->getOperand(i));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;
	}
	case Instruction::Store: {
	// Check if the stores are consecutive or if we need to swizzle them.
	for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
	if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, SE)) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
	return;
	}

	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a vector of stores.\n");

	ValueList Operands;
	for (Value *j : VL)
	Operands.push_back(cast<Instruction>(j)->getOperand(0));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	return;
	}
	case Instruction::Call: {
	// Check if the calls are all to the same vectorizable intrinsic.
	CallInst *CI = cast<CallInst>(VL0);
	// Check if this is an Intrinsic call or something that can be
	// represented by an intrinsic call
	Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	if (!isTriviallyVectorizable(ID)) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
	return;
	}
	Function *Int = CI->getCalledFunction();
	Value *A1I = nullptr;
	if (hasVectorInstrinsicScalarOpd(ID, 1))
	A1I = CI->getArgOperand(1);
	for (unsigned i = 1, e = VL.size(); i != e; ++i) {
	CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
	if (!CI2 \|\| CI2->getCalledFunction() != Int \|\|
	getVectorIntrinsicIDForCall(CI2, TLI) != ID \|\|
	!CI->hasIdenticalOperandBundleSchema(*CI2)) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: mismatched calls:" << CI << "!=" << VL[i]
	<< "\n");
	return;
	}
	// ctlz,cttz and powi are special intrinsics whose second argument
	// should be same in order for them to be vectorized.
	if (hasVectorInstrinsicScalarOpd(ID, 1)) {
	Value *A1J = CI2->getArgOperand(1);
	if (A1I != A1J) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
	<< " argument "<< A1I<<"!=" << A1J
	<< "\n");
	return;
	}
	}
	// Verify that the bundle operands are identical between the two calls.
	if (CI->hasOperandBundles() &&
	!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
	CI->op_begin() + CI->getBundleOperandsEndIndex(),
	CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
	<< *VL[i] << '\n');
	return;
	}
	}

	newTreeEntry(VL, true, UserTreeIdx);
	for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL) {
	CallInst *CI2 = dyn_cast<CallInst>(j);
	Operands.push_back(CI2->getArgOperand(i));
	}
	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;
	}
	case Instruction::ShuffleVector:
	// If this is not an alternate sequence of opcode like add-sub
	// then do not vectorize this instruction.
	if (!S.IsAltShuffle) {
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
	return;
	}
	newTreeEntry(VL, true, UserTreeIdx);
	DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

	// Reorder operands if reordering would enable vectorization.
	if (isa<BinaryOperator>(VL0)) {
	ValueList Left, Right;
	reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
	buildTree_rec(Left, Depth + 1, UserTreeIdx);
	buildTree_rec(Right, Depth + 1, UserTreeIdx);
	return;
	}

	for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	ValueList Operands;
	// Prepare the operand vector.
	for (Value *j : VL)
	Operands.push_back(cast<Instruction>(j)->getOperand(i));

	buildTree_rec(Operands, Depth + 1, UserTreeIdx);
	}
	return;

	default:
	BS.cancelScheduling(VL, VL0);
	newTreeEntry(VL, false, UserTreeIdx);
	DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
	return;
	}
	}

	/// Check whether the aggregate type \p T can be handled as a vector.
	///
	/// \p T is expected to be a struct or an array type: anything that is not a
	/// StructType is unconditionally cast to ArrayType. The element type is taken
	/// from the first struct member, or from the array element type.
	///
	/// \returns the number of elements of \p T when all of the following hold,
	/// and 0 otherwise:
	///   - the element type passes isValidElementType();
	///   - the store size of the equivalent vector type lies within
	///     [MinVecRegSize, MaxVecRegSize] and equals the store size of \p T;
	///   - for structs, every member has the same type as the first one.
	unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
	  unsigned N;
	  Type *EltTy;
	  auto *ST = dyn_cast<StructType>(T);
	  if (ST) {
	    N = ST->getNumElements();
	    EltTy = *ST->element_begin();
	  } else {
	    N = cast<ArrayType>(T)->getNumElements();
	    EltTy = cast<ArrayType>(T)->getElementType();
	  }
	  if (!isValidElementType(EltTy))
	    return 0;
	  // The vector must fit the supported register range and occupy exactly the
	  // same number of stored bits as the aggregate it replaces.
	  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
	  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
	      VTSize != DL.getTypeStoreSizeInBits(T))
	    return 0;
	  if (ST) {
	    // Check that struct is homogeneous.
	    for (const auto *Ty : ST->elements())
	      if (Ty != EltTy)
	        return 0;
	  }
	  return N;
	}

	bool BoUpSLP::canReuseExtract(ArrayRef<Value > VL, Value OpValue) const {
	Instruction *E0 = cast<Instruction>(OpValue);
	assert(E0->getOpcode() == Instruction::ExtractElement \|\|
	E0->getOpcode() == Instruction::ExtractValue);
	assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
	// Check if all of the extracts come from the same vector and from the
	// correct offset.
	Value *Vec = E0->getOperand(0);

	// We have to extract from a vector/aggregate with the same number of elements.
	unsigned NElts;
	if (E0->getOpcode() == Instruction::ExtractValue) {
	const DataLayout &DL = E0->getModule()->getDataLayout();
	NElts = canMapToVector(Vec->getType(), DL);
	if (!NElts)
	return false;
	// Check if load can be rewritten as load of vector.
	LoadInst *LI = dyn_cast<LoadInst>(Vec);
	if (!LI \|\| !LI->isSimple() \|\| !LI->hasNUses(VL.size()))
	return false;
	} else {
	NElts = Vec->getType()->getVectorNumElements();
	}

	if (NElts != VL.size())
	return false;

	// Check that all of the indices extract from the correct offset.
	for (unsigned I = 0, E = VL.size(); I < E; ++I) {
	Instruction *Inst = cast<Instruction>(VL[I]);
	if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
	return false;
	if (Inst->getOperand(0) != Vec)
	return false;
	}

	return true;
	}

	/// \returns true if \p I has exactly one use, or if every user of \p I has an
	/// entry in ScalarToTreeEntry.
	bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
	  return I->hasOneUse() ||
	         std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
	           return ScalarToTreeEntry.count(U) > 0;
	         });
	}

	/// Compute the cost contribution of the single tree entry \p E.
	///
	/// For a gather entry (E->NeedToGather) the result is the cost of building
	/// the vector: 0 for all-constant bundles, a broadcast shuffle for splats, a
	/// shuffle (minus credit for extracts that become dead) for bundles of
	/// ExtractElement instructions recognised by isShuffle(), and otherwise the
	/// generic gather cost.
	///
	/// For a vectorizable entry the result is "vector cost - scalar cost" as
	/// reported by TTI for the bundle's opcode, so a negative value means the
	/// vector form is cheaper than the scalars it replaces.
	int BoUpSLP::getEntryCost(TreeEntry *E) {
	  ArrayRef<Value*> VL = E->Scalars;

	  // Stores and compares produce a type that differs from the type being
	  // operated on; cost them on the stored / compared operand type instead.
	  Type *ScalarTy = VL[0]->getType();
	  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
	    ScalarTy = SI->getValueOperand()->getType();
	  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
	    ScalarTy = CI->getOperand(0)->getType();
	  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

	  // If we have computed a smaller type for the expression, update VecTy so
	  // that the costs will be accurate.
	  if (MinBWs.count(VL[0]))
	    VecTy = VectorType::get(
	        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());

	  if (E->NeedToGather) {
	    if (allConstant(VL))
	      return 0;
	    if (isSplat(VL)) {
	      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
	    }
	    if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
	      Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
	      if (ShuffleKind.hasValue()) {
	        int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
	        for (auto *V : VL) {
	          // If all users of instruction are going to be vectorized and this
	          // instruction itself is not going to be vectorized, consider this
	          // instruction as dead and remove its cost from the final cost of the
	          // vectorized tree.
	          if (areAllUsersVectorized(cast<Instruction>(V)) &&
	              !ScalarToTreeEntry.count(V)) {
	            auto *IO = cast<ConstantInt>(
	                cast<ExtractElementInst>(V)->getIndexOperand());
	            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
	                                            IO->getZExtValue());
	          }
	        }
	        return Cost;
	      }
	    }
	    return getGatherCost(E->Scalars);
	  }
	  InstructionsState S = getSameOpcode(VL);
	  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
	  Instruction *VL0 = cast<Instruction>(S.OpValue);
	  // Alternating-opcode bundles are costed through the ShuffleVector case.
	  unsigned ShuffleOrOp = S.IsAltShuffle ?
	               (unsigned) Instruction::ShuffleVector : S.Opcode;
	  switch (ShuffleOrOp) {
	    case Instruction::PHI:
	      return 0;

	    case Instruction::ExtractValue:
	    case Instruction::ExtractElement:
	      if (canReuseExtract(VL, S.OpValue)) {
	        int DeadCost = 0;
	        for (unsigned i = 0, e = VL.size(); i < e; ++i) {
	          Instruction *Ext = cast<Instruction>(VL[i]);
	          // If all users are going to be vectorized, instruction can be
	          // considered as dead.
	          // The same, if have only one user, it will be vectorized for sure.
	          if (areAllUsersVectorized(Ext))
	            // Take credit for instruction that will become dead.
	            DeadCost +=
	                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
	        }
	        return -DeadCost;
	      }
	      return getGatherCost(VecTy);

	    case Instruction::ZExt:
	    case Instruction::SExt:
	    case Instruction::FPToUI:
	    case Instruction::FPToSI:
	    case Instruction::FPExt:
	    case Instruction::PtrToInt:
	    case Instruction::IntToPtr:
	    case Instruction::SIToFP:
	    case Instruction::UIToFP:
	    case Instruction::Trunc:
	    case Instruction::FPTrunc:
	    case Instruction::BitCast: {
	      Type *SrcTy = VL0->getOperand(0)->getType();

	      // Calculate the cost of this instruction.
	      int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
	                                                         VL0->getType(), SrcTy, VL0);

	      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
	      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
	      return VecCost - ScalarCost;
	    }
	    case Instruction::FCmp:
	    case Instruction::ICmp:
	    case Instruction::Select: {
	      // Calculate the cost of this instruction.
	      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
	      int ScalarCost = VecTy->getNumElements() *
	          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
	      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
	      return VecCost - ScalarCost;
	    }
	    case Instruction::Add:
	    case Instruction::FAdd:
	    case Instruction::Sub:
	    case Instruction::FSub:
	    case Instruction::Mul:
	    case Instruction::FMul:
	    case Instruction::UDiv:
	    case Instruction::SDiv:
	    case Instruction::FDiv:
	    case Instruction::URem:
	    case Instruction::SRem:
	    case Instruction::FRem:
	    case Instruction::Shl:
	    case Instruction::LShr:
	    case Instruction::AShr:
	    case Instruction::And:
	    case Instruction::Or:
	    case Instruction::Xor: {
	      // Certain instructions can be cheaper to vectorize if they have a
	      // constant second vector operand.
	      TargetTransformInfo::OperandValueKind Op1VK =
	          TargetTransformInfo::OK_AnyValue;
	      TargetTransformInfo::OperandValueKind Op2VK =
	          TargetTransformInfo::OK_UniformConstantValue;
	      TargetTransformInfo::OperandValueProperties Op1VP =
	          TargetTransformInfo::OP_None;
	      TargetTransformInfo::OperandValueProperties Op2VP =
	          TargetTransformInfo::OP_None;

	      // If all operands are exactly the same ConstantInt then set the
	      // operand kind to OK_UniformConstantValue.
	      // If instead not all operands are constants, then set the operand kind
	      // to OK_AnyValue. If all operands are constants but not the same,
	      // then set the operand kind to OK_NonUniformConstantValue.
	      ConstantInt *CInt = nullptr;
	      for (unsigned i = 0; i < VL.size(); ++i) {
	        const Instruction *I = cast<Instruction>(VL[i]);
	        if (!isa<ConstantInt>(I->getOperand(1))) {
	          Op2VK = TargetTransformInfo::OK_AnyValue;
	          break;
	        }
	        if (i == 0) {
	          CInt = cast<ConstantInt>(I->getOperand(1));
	          continue;
	        }
	        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
	            CInt != cast<ConstantInt>(I->getOperand(1)))
	          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
	      }
	      // FIXME: Currently cost of model modification for division by power of
	      // 2 is handled for X86 and AArch64. Add support for other targets.
	      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
	          CInt->getValue().isPowerOf2())
	        Op2VP = TargetTransformInfo::OP_PowerOf2;

	      SmallVector<const Value *, 4> Operands(VL0->operand_values());
	      int ScalarCost =
	          VecTy->getNumElements() *
	          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
	                                      Op2VP, Operands);
	      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
	                                                Op1VP, Op2VP, Operands);
	      return VecCost - ScalarCost;
	    }
	    case Instruction::GetElementPtr: {
	      // GEPs are costed as an add with a uniform constant second operand.
	      TargetTransformInfo::OperandValueKind Op1VK =
	          TargetTransformInfo::OK_AnyValue;
	      TargetTransformInfo::OperandValueKind Op2VK =
	          TargetTransformInfo::OK_UniformConstantValue;

	      int ScalarCost =
	          VecTy->getNumElements() *
	          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
	      int VecCost =
	          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

	      return VecCost - ScalarCost;
	    }
	    case Instruction::Load: {
	      // Cost of wide load - cost of scalar loads.
	      // The opcode guarantees VL0 is a load, so use the asserting cast rather
	      // than dereferencing an unchecked dyn_cast result.
	      unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
	      int ScalarLdCost = VecTy->getNumElements() *
	          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
	      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
	                                           VecTy, alignment, 0, VL0);
	      return VecLdCost - ScalarLdCost;
	    }
	    case Instruction::Store: {
	      // We know that we can merge the stores. Calculate the cost.
	      // The opcode guarantees VL0 is a store; see the Load case above.
	      unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
	      int ScalarStCost = VecTy->getNumElements() *
	          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
	      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
	                                           VecTy, alignment, 0, VL0);
	      return VecStCost - ScalarStCost;
	    }
	    case Instruction::Call: {
	      CallInst *CI = cast<CallInst>(VL0);
	      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

	      // Calculate the cost of the scalar and vector calls.
	      SmallVector<Type*, 4> ScalarTys;
	      for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
	        ScalarTys.push_back(CI->getArgOperand(op)->getType());

	      FastMathFlags FMF;
	      if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
	        FMF = FPMO->getFastMathFlags();

	      int ScalarCallCost = VecTy->getNumElements() *
	          TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);

	      SmallVector<Value *, 4> Args(CI->arg_operands());
	      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
	                                                   VecTy->getNumElements());

	      DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
	            << " (" << VecCallCost << "-" << ScalarCallCost << ")"
	            << " for " << *CI << "\n");

	      return VecCallCost - ScalarCallCost;
	    }
	    case Instruction::ShuffleVector: {
	      TargetTransformInfo::OperandValueKind Op1VK =
	          TargetTransformInfo::OK_AnyValue;
	      TargetTransformInfo::OperandValueKind Op2VK =
	          TargetTransformInfo::OK_AnyValue;
	      int ScalarCost = 0;
	      int VecCost = 0;
	      // cast<> never yields null, so every scalar in the bundle contributes.
	      for (Value *i : VL) {
	        Instruction *I = cast<Instruction>(i);
	        ScalarCost +=
	            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
	      }
	      // VecCost is equal to sum of the cost of creating 2 vectors
	      // and the cost of creating shuffle.
	      Instruction *I0 = cast<Instruction>(VL[0]);
	      VecCost =
	          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
	      Instruction *I1 = cast<Instruction>(VL[1]);
	      VecCost +=
	          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
	      VecCost +=
	          TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
	      return VecCost - ScalarCost;
	    }
	    default:
	      llvm_unreachable("Unknown instruction");
	  }
	}

	/// \returns true if the current tree has one or two entries and none of the
	/// entries that matter needs a gather: a single non-gather entry, or two
	/// entries where the first is not a gather and the second is either
	/// all-constant, a splat, or itself not a gather.
	bool BoUpSLP::isFullyVectorizableTinyTree() {
	  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
	        VectorizableTree.size() << " is fully vectorizable .\n");

	  // We only handle trees of heights 1 and 2.
	  if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
	    return true;

	  if (VectorizableTree.size() != 2)
	    return false;

	  // Handle splat and all-constants stores.
	  if (!VectorizableTree[0].NeedToGather &&
	      (allConstant(VectorizableTree[1].Scalars) ||
	       isSplat(VectorizableTree[1].Scalars)))
	    return true;

	  // Gathering cost would be too much for tiny trees.
	  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
	    return false;

	  return true;
	}

	/// \returns true when the tree must be rejected: it has fewer than
	/// MinTreeSize entries and isFullyVectorizableTinyTree() does not hold.
	bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
	  // A tree is acceptable either because it reaches the minimum size given by
	  // the MinTreeSize command line option, or because it is tiny but provably
	  // fully vectorizable. The size test is evaluated first, so the tiny-tree
	  // check only runs for trees below the threshold.
	  const bool Acceptable = VectorizableTree.size() >= MinTreeSize ||
	                          isFullyVectorizableTinyTree();
	  if (Acceptable)
	    return false;

	  assert(VectorizableTree.empty()
	             ? ExternalUses.empty()
	             : true && "We shouldn't have any external users");

	  // The tree is both tiny and not fully vectorizable.
	  return true;
	}

	/// Estimate the cost of keeping vectorized values live across calls.
	///
	/// Visits the tree entries in VectorizableTree order, using the first scalar
	/// of each entry as its representative instruction. Between two consecutive
	/// representatives it scans the instructions in between and, for every call
	/// other than the previous representative itself, adds
	/// TTI->getCostOfKeepingLiveOverCall() for the vector types of the values
	/// currently tracked as live.
	///
	/// \returns the accumulated cost.
	int BoUpSLP::getSpillCost() {
	  // Walk from the bottom of the tree to the top, tracking which values are
	  // live. When we see a call instruction that is not part of our tree,
	  // query TTI to see if there is a cost to keeping values live over it
	  // (for example, if spills and fills are required).
	  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
	  int Cost = 0;

	  SmallPtrSet<Instruction*, 4> LiveValues;
	  Instruction *PrevInst = nullptr;

	  for (const auto &N : VectorizableTree) {
	    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
	    if (!Inst)
	      continue;

	    if (!PrevInst) {
	      PrevInst = Inst;
	      continue;
	    }

	    // Update LiveValues.
	    LiveValues.erase(PrevInst);
	    for (auto &J : PrevInst->operands()) {
	      // J is a Use; &*J yields the Value it refers to.
	      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
	        LiveValues.insert(cast<Instruction>(&*J));
	    }

	    DEBUG(
	      dbgs() << "SLP: #LV: " << LiveValues.size();
	      for (auto *X : LiveValues)
	        dbgs() << " " << X->getName();
	      dbgs() << ", Looking at ";
	      Inst->dump();
	      );

	    // Now find the sequence of instructions between PrevInst and Inst.
	    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
	                                 PrevInstIt =
	                                     PrevInst->getIterator().getReverse();
	    while (InstIt != PrevInstIt) {
	      // Reaching the top of PrevInst's block: continue from the bottom of
	      // Inst's block.
	      if (PrevInstIt == PrevInst->getParent()->rend()) {
	        PrevInstIt = Inst->getParent()->rbegin();
	        continue;
	      }

	      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
	        SmallVector<Type*, 4> V;
	        for (auto *II : LiveValues)
	          V.push_back(VectorType::get(II->getType(), BundleWidth));
	        Cost += TTI->getCostOfKeepingLiveOverCall(V);
	      }

	      ++PrevInstIt;
	    }

	    PrevInst = Inst;
	  }

	  return Cost;
	}

	/// Sum up the cost of the whole vectorizable tree: the per-entry costs, the
	/// cost of extracting scalars for external users, and the spill cost.
	///
	/// \returns the total cost; the breakdown is also written to the debug
	/// stream and, when ViewSLPTree is set, shown alongside the tree graph.
	int BoUpSLP::getTreeCost() {
	  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
	        VectorizableTree.size() << ".\n");

	  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

	  // Per-entry costs.
	  int Cost = 0;
	  for (TreeEntry &TE : VectorizableTree) {
	    int C = getEntryCost(&TE);
	    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
	                 << *TE.Scalars[0] << ".\n");
	    Cost += C;
	  }

	  // Extraction costs for scalars that are used outside the tree.
	  int ExtractCost = 0;
	  SmallSet<Value *, 16> ExtractCostCalculated;
	  for (ExternalUser &EU : ExternalUses) {
	    // We only add extract cost once for the same scalar. Uses by ephemeral
	    // values are free (because the ephemeral value will be removed prior to
	    // code generation, and so the extraction will be removed as well). The
	    // scalar is recorded as seen before the ephemeral check is made.
	    if (!ExtractCostCalculated.insert(EU.Scalar).second ||
	        EphValues.count(EU.User))
	      continue;

	    // If we plan to rewrite the tree in a smaller type, we will need to sign
	    // extend the extracted value back to the original type. Here, we account
	    // for the extract and the added cost of the sign extend if needed.
	    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
	    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
	    if (!MinBWs.count(ScalarRoot)) {
	      ExtractCost +=
	          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
	      continue;
	    }

	    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
	    auto Extend =
	        MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
	    VecTy = VectorType::get(MinTy, BundleWidth);
	    ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
	                                                 VecTy, EU.Lane);
	  }

	  int SpillCost = getSpillCost();
	  Cost += SpillCost + ExtractCost;

	  // Build the summary in its own scope so the stream is destroyed (and the
	  // string complete) before it is used.
	  std::string Summary;
	  {
	    raw_string_ostream OS(Summary);
	    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
	       << "SLP: Extract Cost = " << ExtractCost << ".\n"
	       << "SLP: Total Cost = " << Cost << ".\n";
	  }
	  DEBUG(dbgs() << Summary);

	  if (ViewSLPTree)
	    ViewGraph(this, "SLP" + F->getName(), false, Summary);

	  return Cost;
	}

	/// \returns the cost of building a vector of type \p Ty (which must be a
	/// VectorType) by inserting each of its elements one at a time.
	int BoUpSLP::getGatherCost(Type *Ty) {
	  const unsigned NumElts = cast<VectorType>(Ty)->getNumElements();
	  int Total = 0;
	  for (unsigned Lane = 0; Lane < NumElts; ++Lane)
	    Total += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, Lane);
	  return Total;
	}

	/// \returns the cost of gathering the scalars in \p VL into a single vector.
	/// The element type is taken from the first scalar; for a store it is the
	/// type of the stored value.
	int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
	  Value *First = VL[0];
	  Type *EltTy = First->getType();
	  if (auto *Store = dyn_cast<StoreInst>(First))
	    EltTy = Store->getValueOperand()->getType();
	  // Cost of inserting every element into a vector of that type.
	  return getGatherCost(VectorType::get(EltTy, VL.size()));
	}

	// Reorder commutative operations in alternate shuffle if the resulting vectors
	// are consecutive loads. This would allow us to vectorize the tree.
	// If we have something like-
	// load a[0] - load b[0]
	// load b[1] + load a[1]
	// load a[2] - load b[2]
	// load a[3] + load b[3]
	// Reordering the second load b[1] load a[1] would allow us to vectorize this
	// code.
	void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
	SmallVectorImpl<Value *> &Left,
	SmallVectorImpl<Value *> &Right) {
	// Push left and right operands of binary operation into Left and Right
	unsigned AltOpcode = getAltOpcode(Opcode);
	(void)AltOpcode;
	for (Value *V : VL) {
	auto *I = cast<Instruction>(V);
	assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
	"Incorrect instruction in vector");
	Left.push_back(I->getOperand(0));
	Right.push_back(I->getOperand(1));
	}

	// Reorder if we have a commutative operation and consecutive access
	// are on either side of the alternate instructions.
	for (unsigned j = 0; j < VL.size() - 1; ++j) {
	if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
	if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
	Instruction *VL1 = cast<Instruction>(VL[j]);
	Instruction *VL2 = cast<Instruction>(VL[j + 1]);
	if (VL1->isCommutative() && isConsecutiveAccess(L, L1, DL, SE)) {
	std::swap(Left[j], Right[j]);
	continue;
	} else if (VL2->isCommutative() &&
	isConsecutiveAccess(L, L1, DL, SE)) {
	std::swap(Left[j + 1], Right[j + 1]);
	continue;
	}
	// else unchanged
	}
	}
	if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
	if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
	Instruction *VL1 = cast<Instruction>(VL[j]);
	Instruction *VL2 = cast<Instruction>(VL[j + 1]);
	if (VL1->isCommutative() && isConsecutiveAccess(L, L1, DL, SE)) {
	std::swap(Left[j], Right[j]);
	continue;
	} else if (VL2->isCommutative() &&
	isConsecutiveAccess(L, L1, DL, SE)) {
	std::swap(Left[j + 1], Right[j + 1]);
	continue;
	}
	// else unchanged
	}
	}
	}
	}

	// Return true if I should be commuted before adding its left and right
	// operands to the arrays Left and Right.
	//
	// The vectorizer is trying to either have all elements one side being
	// instruction with the same opcode to enable further vectorization, or having
	// a splat to lower the vectorizing cost.
	//
	// i is the index the operands of I are about to get in Left/Right; entries
	// i - 1 of both arrays are read, so i must be at least 1. VLeft and VRight
	// are out-parameters that receive operand 0 and operand 1 of I, uncommuted.
	static bool shouldReorderOperands(
	    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
	    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
	    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
	  VLeft = I.getOperand(0);
	  VRight = I.getOperand(1);
	  // If we have "SplatRight", try to see if commuting is needed to preserve it.
	  if (SplatRight) {
	    if (VRight == Right[i - 1])
	      // Preserve SplatRight
	      return false;
	    if (VLeft == Right[i - 1]) {
	      // Commuting would preserve SplatRight, but we don't want to break
	      // SplatLeft either, i.e. preserve the original order if possible.
	      // (FIXME: why do we care?)
	      if (SplatLeft && VLeft == Left[i - 1])
	        return false;
	      return true;
	    }
	  }
	  // Symmetrically handle Left side.
	  if (SplatLeft) {
	    if (VLeft == Left[i - 1])
	      // Preserve SplatLeft
	      return false;
	    if (VRight == Left[i - 1])
	      return true;
	  }

	  Instruction *ILeft = dyn_cast<Instruction>(VLeft);
	  Instruction *IRight = dyn_cast<Instruction>(VRight);

	  // If we have "AllSameOpcodeRight", try to see if the left operands preserves
	  // it and not the right, in this case we want to commute.
	  if (AllSameOpcodeRight) {
	    unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
	    if (IRight && RightPrevOpcode == IRight->getOpcode())
	      // Do not commute, a match on the right preserves AllSameOpcodeRight
	      return false;
	    if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
	      // We have a match and may want to commute, but first check if there is
	      // not also a match on the existing operands on the Left to preserve
	      // AllSameOpcodeLeft, i.e. preserve the original order if possible.
	      // (FIXME: why do we care?)
	      // ILeft is known to be non-null in this branch.
	      if (AllSameOpcodeLeft &&
	          cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
	        return false;
	      return true;
	    }
	  }
	  // Symmetrically handle Left side.
	  if (AllSameOpcodeLeft) {
	    unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
	    if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
	      return false;
	    if (IRight && LeftPrevOpcode == IRight->getOpcode())
	      return true;
	  }
	  return false;
	}

	/// Split the operands of the commutative instructions in \p VL into \p Left
	/// and \p Right, commuting individual lanes so that one side ends up as a
	/// splat, or each side keeps a single opcode, or consecutive loads line up
	/// on the same side.
	///
	/// Operands are appended to \p Left / \p Right in \p VL order. The code
	/// indexes both vectors by lane, which presumably relies on callers passing
	/// them in empty (the callers visible in this file do). An empty \p VL
	/// leaves both vectors untouched.
	void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
	                                             ArrayRef<Value *> VL,
	                                             SmallVectorImpl<Value *> &Left,
	                                             SmallVectorImpl<Value *> &Right) {
	  // Everything below indexes lane 0, so bail out early on an empty bundle
	  // instead of reading Left[0]/Right[0] out of bounds.
	  if (VL.empty())
	    return;

	  {
	    // Peel the first iteration out of the loop since there's nothing
	    // interesting to do anyway and it simplifies the checks in the loop.
	    auto *I = cast<Instruction>(VL[0]);
	    Value *VLeft = I->getOperand(0);
	    Value *VRight = I->getOperand(1);
	    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
	      // Favor having instruction to the right. FIXME: why?
	      std::swap(VLeft, VRight);
	    Left.push_back(VLeft);
	    Right.push_back(VRight);
	  }

	  // Keep track if we have instructions with all the same opcode on one side.
	  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
	  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
	  // Keep track if we have one side with all the same value (broadcast).
	  bool SplatLeft = true;
	  bool SplatRight = true;

	  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
	    Instruction *I = cast<Instruction>(VL[i]);
	    assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
	            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
	           "Can only process commutative instruction");
	    // Commute to favor either a splat or maximizing having the same opcodes on
	    // one side.
	    Value *VLeft;
	    Value *VRight;
	    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
	                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
	                              VRight)) {
	      Left.push_back(VRight);
	      Right.push_back(VLeft);
	    } else {
	      Left.push_back(VLeft);
	      Right.push_back(VRight);
	    }
	    // Update Splat* and AllSameOpcode* after the insertion.
	    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
	    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
	    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
	                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
	                         cast<Instruction>(Left[i])->getOpcode());
	    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
	                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
	                          cast<Instruction>(Right[i])->getOpcode());
	  }

	  // If one operand end up being broadcast, return this operand order.
	  if (SplatRight || SplatLeft)
	    return;

	  // Finally check if we can get longer vectorizable chain by reordering
	  // without breaking the good operand order detected above.
	  // E.g. If we have something like-
	  // load a[0] load b[0]
	  // load b[1] load a[1]
	  // load a[2] load b[2]
	  // load a[3] load b[3]
	  // Reordering the second load b[1] load a[1] would allow us to vectorize
	  // this code and we still retain AllSameOpcode property.
	  // FIXME: This load reordering might break AllSameOpcode in some rare cases
	  // such as-
	  // add a[0],c[0] load b[0]
	  // add a[1],c[2] load b[1]
	  // b[2] load b[2]
	  // add a[3],c[3] load b[3]
	  for (unsigned j = 0; j < VL.size() - 1; ++j) {
	    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
	      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
	        if (isConsecutiveAccess(L, L1, *DL, *SE)) {
	          std::swap(Left[j + 1], Right[j + 1]);
	          continue;
	        }
	      }
	    }
	    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
	      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
	        if (isConsecutiveAccess(L, L1, *DL, *SE)) {
	          std::swap(Left[j + 1], Right[j + 1]);
	          continue;
	        }
	      }
	    }
	    // else unchanged
	  }
	}

	void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value > VL, Value OpValue) {
	// Get the basic block this bundle is in. All instructions in the bundle
	// should be in this block.
	auto *Front = cast<Instruction>(OpValue);
	auto *BB = Front->getParent();
	const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
	const unsigned AltOpcode = getAltOpcode(Opcode);
	assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
	return !sameOpcodeOrAlt(Opcode, AltOpcode,
	cast<Instruction>(V)->getOpcode()) \|\|
	cast<Instruction>(V)->getParent() == BB;
	}));

	// The last instruction in the bundle in program order.
	Instruction *LastInst = nullptr;

	// Find the last instruction. The common case should be that BB has been
	// scheduled, and the last instruction is VL.back(). So we start with
	// VL.back() and iterate over schedule data until we reach the end of the
	// bundle. The end of the bundle is marked by null ScheduleData.
	if (BlocksSchedules.count(BB)) {
	auto *Bundle =
	BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
	if (Bundle && Bundle->isPartOfBundle())
	for (; Bundle; Bundle = Bundle->NextInBundle)
	if (Bundle->OpValue == Bundle->Inst)
	LastInst = Bundle->Inst;
	}

	// LastInst can still be null at this point if there's either not an entry
	// for BB in BlocksSchedules or there's no ScheduleData available for
	// VL.back(). This can be the case if buildTree_rec aborts for various
	// reasons (e.g., the maximum recursion depth is reached, the maximum region
	// size is reached, etc.). ScheduleData is initialized in the scheduling
	// "dry-run".
	//
	// If this happens, we can still find the last instruction by brute force. We
	// iterate forwards from Front (inclusive) until we either see all
	// instructions in the bundle or reach the end of the block. If Front is the
	// last instruction in program order, LastInst will be set to Front, and we
	// will visit all the remaining instructions in the block.
	//
	// One of the reasons we exit early from buildTree_rec is to place an upper
	// bound on compile-time. Thus, taking an additional compile-time hit here is
	// not ideal. However, this should be exceedingly rare since it requires that
	// we both exit early from buildTree_rec and that the bundle be out-of-order
	// (causing us to iterate all the way to the end of the block).
	if (!LastInst) {
	SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
	for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
	if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
	LastInst = &I;
	if (Bundle.empty())
	break;
	}
	}

	// Set the insertion point after the last instruction in the bundle. Set the
	// debug location to Front.
	Builder.SetInsertPoint(BB, ++LastInst->getIterator());
	Builder.SetCurrentDebugLocation(Front->getDebugLoc());
	}

	// Builds a vector of type Ty out of the scalars in VL by chaining one
	// insertelement per lane onto an undef vector, and returns the final vector.
	//
	// Every insertelement that materialises as an Instruction is recorded in
	// GatherSeq and its block in CSEBlocks. When the inserted scalar belongs to a
	// tree entry, the insert is also registered in ExternalUses together with the
	// scalar's lane in that entry.
	Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
	  Value *Vec = UndefValue::get(Ty);
	  // Generate the 'InsertElement' instruction.
	  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
	    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
	    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
	      GatherSeq.insert(Insrt);
	      CSEBlocks.insert(Insrt->getParent());

	      // Add to our 'need-to-extract' list.
	      if (TreeEntry *E = getTreeEntry(VL[i])) {
	        // Find which lane we need to extract.
	        int FoundLane = -1;
	        for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
	          // Is this the lane of the scalar that we are looking for ?
	          if (E->Scalars[Lane] == VL[i]) {
	            FoundLane = Lane;
	            break;
	          }
	        }
	        assert(FoundLane >= 0 && "Could not find the correct lane");
	        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
	      }
	    }
	  }

	  return Vec;
	}

	// Returns the vector value previously generated for the bundle VL, or nullptr
	// when there is none. The tree entry is looked up through OpValue and must
	// hold exactly the scalars in VL (TreeEntry::isSame) to count as a match.
	Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
	  if (const TreeEntry *En = getTreeEntry(OpValue)) {
	    if (En->isSame(VL) && En->VectorizedValue)
	      return En->VectorizedValue;
	  }
	  return nullptr;
	}

	// Produces a vector value for the bundle VL.
	//
	// If the scalars share an opcode and a tree entry holding exactly these
	// scalars exists, the entry is vectorized (or its cached value reused) via
	// vectorizeTree(TreeEntry *). Otherwise the scalars are gathered into a vector
	// with insertelement instructions at the builder's current insertion point.
	Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
	  InstructionsState S = getSameOpcode(VL);
	  if (S.Opcode) {
	    if (TreeEntry *E = getTreeEntry(S.OpValue)) {
	      if (E->isSame(VL))
	        return vectorizeTree(E);
	    }
	  }

	  // A store produces no value, so the element type is that of the stored
	  // operand rather than the (void) type of the instruction itself.
	  Type *ScalarTy = S.OpValue->getType();
	  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
	    ScalarTy = SI->getValueOperand()->getType();
	  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

	  return Gather(VL, VecTy);
	}

	// Emits the vector code for a single tree entry and returns the resulting
	// vector value, which is also cached in E->VectorizedValue.
	//
	// An entry that already has a vectorized value returns it unchanged. An entry
	// marked NeedToGather is materialised with Gather(). Otherwise the operand
	// bundles are vectorized recursively and one vector instruction matching the
	// entry's opcode (or, for alternating opcodes, two binary operations blended
	// by a shufflevector) is created. The builder's insertion point is restored
	// when the function returns.
	Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
	  IRBuilder<>::InsertPointGuard Guard(Builder);

	  if (E->VectorizedValue) {
	    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
	    return E->VectorizedValue;
	  }

	  InstructionsState S = getSameOpcode(E->Scalars);
	  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
	  // For stores the vector element type is the type of the stored value.
	  Type *ScalarTy = VL0->getType();
	  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
	    ScalarTy = SI->getValueOperand()->getType();
	  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());

	  if (E->NeedToGather) {
	    setInsertPointAfterBundle(E->Scalars, VL0);
	    auto *V = Gather(E->Scalars, VecTy);
	    E->VectorizedValue = V;
	    return V;
	  }

	  // Alternating-opcode bundles are dispatched to the ShuffleVector case.
	  unsigned ShuffleOrOp = S.IsAltShuffle ?
	      (unsigned) Instruction::ShuffleVector : S.Opcode;
	  switch (ShuffleOrOp) {
	    case Instruction::PHI: {
	      // VL0 is known to be a PHI here, so use an asserting cast instead of
	      // dereferencing an unchecked dyn_cast result.
	      PHINode *PH = cast<PHINode>(VL0);
	      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
	      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
	      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
	      // Publish the new PHI before recursing so that operands referring back
	      // to this entry find it already vectorized.
	      E->VectorizedValue = NewPhi;

	      // PHINodes may have multiple entries from the same block. We want to
	      // visit every block once.
	      SmallSet<BasicBlock*, 4> VisitedBBs;

	      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
	        ValueList Operands;
	        BasicBlock *IBB = PH->getIncomingBlock(i);

	        if (!VisitedBBs.insert(IBB).second) {
	          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
	          continue;
	        }

	        // Prepare the operand vector.
	        for (Value *V : E->Scalars)
	          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));

	        Builder.SetInsertPoint(IBB->getTerminator());
	        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
	        Value *Vec = vectorizeTree(Operands);
	        NewPhi->addIncoming(Vec, IBB);
	      }

	      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
	             "Invalid number of incoming values");
	      return NewPhi;
	    }

	    case Instruction::ExtractElement: {
	      // Reuse the source vector directly when the extracts allow it.
	      if (canReuseExtract(E->Scalars, VL0)) {
	        Value *V = VL0->getOperand(0);
	        E->VectorizedValue = V;
	        return V;
	      }
	      setInsertPointAfterBundle(E->Scalars, VL0);
	      auto *V = Gather(E->Scalars, VecTy);
	      E->VectorizedValue = V;
	      return V;
	    }
	    case Instruction::ExtractValue: {
	      if (canReuseExtract(E->Scalars, VL0)) {
	        // Reload the aggregate's memory as a vector, at the position of the
	        // original aggregate load.
	        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
	        Builder.SetInsertPoint(LI);
	        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
	        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
	        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
	        E->VectorizedValue = V;
	        return propagateMetadata(V, E->Scalars);
	      }
	      setInsertPointAfterBundle(E->Scalars, VL0);
	      auto *V = Gather(E->Scalars, VecTy);
	      E->VectorizedValue = V;
	      return V;
	    }
	    case Instruction::ZExt:
	    case Instruction::SExt:
	    case Instruction::FPToUI:
	    case Instruction::FPToSI:
	    case Instruction::FPExt:
	    case Instruction::PtrToInt:
	    case Instruction::IntToPtr:
	    case Instruction::SIToFP:
	    case Instruction::UIToFP:
	    case Instruction::Trunc:
	    case Instruction::FPTrunc:
	    case Instruction::BitCast: {
	      ValueList INVL;
	      for (Value *V : E->Scalars)
	        INVL.push_back(cast<Instruction>(V)->getOperand(0));

	      setInsertPointAfterBundle(E->Scalars, VL0);

	      Value *InVec = vectorizeTree(INVL);

	      // Vectorizing the operand may already have produced this entry.
	      if (Value *V = alreadyVectorized(E->Scalars, VL0))
	        return V;

	      // Every opcode handled by this case is a cast, so assert it.
	      CastInst *CI = cast<CastInst>(VL0);
	      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::FCmp:
	    case Instruction::ICmp: {
	      ValueList LHSV, RHSV;
	      for (Value *V : E->Scalars) {
	        LHSV.push_back(cast<Instruction>(V)->getOperand(0));
	        RHSV.push_back(cast<Instruction>(V)->getOperand(1));
	      }

	      setInsertPointAfterBundle(E->Scalars, VL0);

	      Value *L = vectorizeTree(LHSV);
	      Value *R = vectorizeTree(RHSV);

	      if (Value *V = alreadyVectorized(E->Scalars, VL0))
	        return V;

	      // The predicate of the first scalar is used for the whole vector.
	      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
	      Value *V;
	      if (S.Opcode == Instruction::FCmp)
	        V = Builder.CreateFCmp(P0, L, R);
	      else
	        V = Builder.CreateICmp(P0, L, R);

	      E->VectorizedValue = V;
	      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::Select: {
	      ValueList TrueVec, FalseVec, CondVec;
	      for (Value *V : E->Scalars) {
	        CondVec.push_back(cast<Instruction>(V)->getOperand(0));
	        TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
	        FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
	      }

	      setInsertPointAfterBundle(E->Scalars, VL0);

	      Value *Cond = vectorizeTree(CondVec);
	      Value *True = vectorizeTree(TrueVec);
	      Value *False = vectorizeTree(FalseVec);

	      if (Value *V = alreadyVectorized(E->Scalars, VL0))
	        return V;

	      Value *V = Builder.CreateSelect(Cond, True, False);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::Add:
	    case Instruction::FAdd:
	    case Instruction::Sub:
	    case Instruction::FSub:
	    case Instruction::Mul:
	    case Instruction::FMul:
	    case Instruction::UDiv:
	    case Instruction::SDiv:
	    case Instruction::FDiv:
	    case Instruction::URem:
	    case Instruction::SRem:
	    case Instruction::FRem:
	    case Instruction::Shl:
	    case Instruction::LShr:
	    case Instruction::AShr:
	    case Instruction::And:
	    case Instruction::Or:
	    case Instruction::Xor: {
	      ValueList LHSVL, RHSVL;
	      // Commutative operations may have their operands reordered per lane;
	      // everything else keeps operand 0 on the left and operand 1 on the right.
	      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
	        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
	                                       RHSVL);
	      else
	        for (Value *V : E->Scalars) {
	          auto *I = cast<Instruction>(V);
	          LHSVL.push_back(I->getOperand(0));
	          RHSVL.push_back(I->getOperand(1));
	        }

	      setInsertPointAfterBundle(E->Scalars, VL0);

	      Value *LHS = vectorizeTree(LHSVL);
	      Value *RHS = vectorizeTree(RHSVL);

	      if (Value *V = alreadyVectorized(E->Scalars, VL0))
	        return V;

	      Value *V = Builder.CreateBinOp(
	          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
	      E->VectorizedValue = V;
	      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
	      ++NumVectorInstructions;

	      // The builder may have folded the operation to a non-instruction value.
	      if (Instruction *I = dyn_cast<Instruction>(V))
	        return propagateMetadata(I, E->Scalars);

	      return V;
	    }
	    case Instruction::Load: {
	      // Loads are inserted at the head of the tree because we don't want to
	      // sink them all the way down past store instructions.
	      setInsertPointAfterBundle(E->Scalars, VL0);

	      LoadInst *LI = cast<LoadInst>(VL0);
	      Type *ScalarLoadTy = LI->getType();
	      unsigned AS = LI->getPointerAddressSpace();

	      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
	                                            VecTy->getPointerTo(AS));

	      // The pointer operand uses an in-tree scalar so we add the new BitCast to
	      // ExternalUses list to make sure that an extract will be generated in the
	      // future.
	      Value *PO = LI->getPointerOperand();
	      if (getTreeEntry(PO))
	        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));

	      // Read the scalar load's alignment before LI is rebound to the new
	      // vector load; an alignment of 0 falls back to the ABI alignment of the
	      // scalar type.
	      unsigned Alignment = LI->getAlignment();
	      LI = Builder.CreateLoad(VecPtr);
	      if (!Alignment) {
	        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
	      }
	      LI->setAlignment(Alignment);
	      E->VectorizedValue = LI;
	      ++NumVectorInstructions;
	      return propagateMetadata(LI, E->Scalars);
	    }
	    case Instruction::Store: {
	      StoreInst *SI = cast<StoreInst>(VL0);
	      unsigned Alignment = SI->getAlignment();
	      unsigned AS = SI->getPointerAddressSpace();

	      ValueList ScalarStoreValues;
	      for (Value *V : E->Scalars)
	        ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());

	      setInsertPointAfterBundle(E->Scalars, VL0);

	      Value *VecValue = vectorizeTree(ScalarStoreValues);
	      Value *ScalarPtr = SI->getPointerOperand();
	      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
	      StoreInst *S = Builder.CreateStore(VecValue, VecPtr);

	      // The pointer operand uses an in-tree scalar, so add the new BitCast to
	      // ExternalUses to make sure that an extract will be generated in the
	      // future.
	      if (getTreeEntry(ScalarPtr))
	        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));

	      // An alignment of 0 falls back to the ABI alignment of the stored type.
	      if (!Alignment)
	        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());

	      S->setAlignment(Alignment);
	      E->VectorizedValue = S;
	      ++NumVectorInstructions;
	      return propagateMetadata(S, E->Scalars);
	    }
	    case Instruction::GetElementPtr: {
	      setInsertPointAfterBundle(E->Scalars, VL0);

	      // Vectorize the base pointers first, then every index operand.
	      ValueList Op0VL;
	      for (Value *V : E->Scalars)
	        Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));

	      Value *Op0 = vectorizeTree(Op0VL);

	      std::vector<Value *> OpVecs;
	      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
	           ++j) {
	        ValueList OpVL;
	        for (Value *V : E->Scalars)
	          OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));

	        Value *OpVec = vectorizeTree(OpVL);
	        OpVecs.push_back(OpVec);
	      }

	      Value *V = Builder.CreateGEP(
	          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;

	      if (Instruction *I = dyn_cast<Instruction>(V))
	        return propagateMetadata(I, E->Scalars);

	      return V;
	    }
	    case Instruction::Call: {
	      CallInst *CI = cast<CallInst>(VL0);
	      setInsertPointAfterBundle(E->Scalars, VL0);
	      Intrinsic::ID IID = Intrinsic::not_intrinsic;
	      Value *ScalarArg = nullptr;
	      // CI comes from cast<> and is never null; only the callee may be
	      // missing (indirect call).
	      if (Function *FI = CI->getCalledFunction())
	        IID = FI->getIntrinsicID();
	      std::vector<Value *> OpVecs;
	      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
	        ValueList OpVL;
	        // ctlz,cttz and powi are special intrinsics whose second argument is
	        // a scalar. This argument should not be vectorized.
	        if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
	          ScalarArg = CI->getArgOperand(j);
	          OpVecs.push_back(ScalarArg);
	          continue;
	        }
	        for (Value *V : E->Scalars) {
	          CallInst *CEI = cast<CallInst>(V);
	          OpVL.push_back(CEI->getArgOperand(j));
	        }

	        Value *OpVec = vectorizeTree(OpVL);
	        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
	        OpVecs.push_back(OpVec);
	      }

	      // Call the vector overload of the intrinsic, keeping operand bundles.
	      Module *M = F->getParent();
	      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
	      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
	      SmallVector<OperandBundleDef, 1> OpBundles;
	      CI->getOperandBundlesAsDefs(OpBundles);
	      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

	      // The scalar argument uses an in-tree scalar so we add the new vectorized
	      // call to ExternalUses list to make sure that an extract will be
	      // generated in the future.
	      if (ScalarArg && getTreeEntry(ScalarArg))
	        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

	      E->VectorizedValue = V;
	      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::ShuffleVector: {
	      ValueList LHSVL, RHSVL;
	      assert(Instruction::isBinaryOp(S.Opcode) &&
	             "Invalid Shuffle Vector Operand");
	      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
	      setInsertPointAfterBundle(E->Scalars, VL0);

	      Value *LHS = vectorizeTree(LHSVL);
	      Value *RHS = vectorizeTree(RHSVL);

	      if (Value *V = alreadyVectorized(E->Scalars, VL0))
	        return V;

	      // Create a vector of LHS op1 RHS
	      Value *V0 = Builder.CreateBinOp(
	          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);

	      unsigned AltOpcode = getAltOpcode(S.Opcode);
	      // Create a vector of LHS op2 RHS
	      Value *V1 = Builder.CreateBinOp(
	          static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);

	      // Create shuffle to take alternate operations from the vector.
	      // Also, gather up odd and even scalar ops to propagate IR flags to
	      // each vector operation.
	      ValueList OddScalars, EvenScalars;
	      unsigned e = E->Scalars.size();
	      SmallVector<Constant *, 8> Mask(e);
	      for (unsigned i = 0; i < e; ++i) {
	        if (isOdd(i)) {
	          // Mask indices >= e select from the second shuffle operand (V1).
	          Mask[i] = Builder.getInt32(e + i);
	          OddScalars.push_back(E->Scalars[i]);
	        } else {
	          Mask[i] = Builder.getInt32(i);
	          EvenScalars.push_back(E->Scalars[i]);
	        }
	      }

	      Value *ShuffleMask = ConstantVector::get(Mask);
	      propagateIRFlags(V0, EvenScalars);
	      propagateIRFlags(V1, OddScalars);

	      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;
	      if (Instruction *I = dyn_cast<Instruction>(V))
	        return propagateMetadata(I, E->Scalars);

	      return V;
	    }
	    default:
	    llvm_unreachable("unknown inst");
	  }
	  return nullptr;
	}

	// Convenience overload: vectorizes the tree with an initially empty map of
	// extra externally used values and returns the resulting root vector value.
	Value *BoUpSLP::vectorizeTree() {
	  ExtraValueToDebugLocsMap NoExtraExternalUses;
	  Value *Root = vectorizeTree(NoExtraExternalUses);
	  return Root;
	}

	// Vectorizes the whole tree rooted at VectorizableTree[0] and rewires the
	// function to use the vector code.
	//
	// Steps: schedule every block, emit vector code for the root entry
	// (recursively covering the tree), truncate the root when MinBWs records a
	// narrower width for it, emit an extractelement for every entry in
	// ExternalUses, and finally replace the remaining uses of the vectorized
	// scalars with undef and erase the scalars.
	//
	// \param ExternallyUsedValues  scalars used as extra arguments (external uses
	//        with a null user). Each such scalar's record is re-keyed to the
	//        extract (possibly extended) that replaces it.
	// \returns the vectorized value of the root entry.
	Value *
	BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
	  // All blocks must be scheduled before any instructions are inserted.
	  for (auto &BSIter : BlocksSchedules) {
	    scheduleBlock(BSIter.second.get());
	  }

	  Builder.SetInsertPoint(&F->getEntryBlock().front());
	  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);

	  // If the vectorized tree can be rewritten in a smaller type, we truncate the
	  // vectorized root. InstCombine will then rewrite the entire expression. We
	  // sign extend the extracted values below.
	  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
	  if (MinBWs.count(ScalarRoot)) {
	    if (auto *I = dyn_cast<Instruction>(VectorRoot))
	      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
	    auto BundleWidth = VectorizableTree[0].Scalars.size();
	    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
	    auto *VecTy = VectorType::get(MinTy, BundleWidth);
	    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
	    VectorizableTree[0].VectorizedValue = Trunc;
	  }

	  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

	  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
	  // specified by ScalarType.
	  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
	    if (!MinBWs.count(ScalarRoot))
	      return Ex;
	    if (MinBWs[ScalarRoot].second)
	      return Builder.CreateSExt(Ex, ScalarType);
	    return Builder.CreateZExt(Ex, ScalarType);
	  };

	  // Extract all of the elements with the external uses.
	  for (const auto &ExternalUse : ExternalUses) {
	    Value *Scalar = ExternalUse.Scalar;
	    llvm::User *User = ExternalUse.User;

	    // Skip users that we already RAUW. This happens when one instruction
	    // has multiple uses of the same value.
	    if (User && !is_contained(Scalar->users(), User))
	      continue;
	    TreeEntry *E = getTreeEntry(Scalar);
	    assert(E && "Invalid scalar");
	    assert(!E->NeedToGather && "Extracting from a gather list");

	    Value *Vec = E->VectorizedValue;
	    assert(Vec && "Can't find vectorizable value");

	    Value *Lane = Builder.getInt32(ExternalUse.Lane);
	    // If User == nullptr, the Scalar is used as extra arg. Generate
	    // ExtractElement instruction and update the record for this scalar in
	    // ExternallyUsedValues.
	    if (!User) {
	      assert(ExternallyUsedValues.count(Scalar) &&
	             "Scalar with nullptr as an external user must be registered in "
	             "ExternallyUsedValues map");
	      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
	        Builder.SetInsertPoint(VecI->getParent(),
	                               std::next(VecI->getIterator()));
	      } else {
	        Builder.SetInsertPoint(&F->getEntryBlock().front());
	      }
	      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	      Ex = extend(ScalarRoot, Ex, Scalar->getType());
	      CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
	      // Move the record from the scalar to the value that replaces it.
	      auto &Locs = ExternallyUsedValues[Scalar];
	      ExternallyUsedValues.insert({Ex, Locs});
	      ExternallyUsedValues.erase(Scalar);
	      continue;
	    }

	    // Generate extracts for out-of-tree users.
	    // Find the insertion point for the extractelement lane.
	    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
	      if (PHINode *PH = dyn_cast<PHINode>(User)) {
	        for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
	          if (PH->getIncomingValue(i) == Scalar) {
	            TerminatorInst *IncomingTerminator =
	                PH->getIncomingBlock(i)->getTerminator();
	            // For a catchswitch terminator the extract is placed right
	            // after the vector instruction instead of before the terminator.
	            if (isa<CatchSwitchInst>(IncomingTerminator)) {
	              Builder.SetInsertPoint(VecI->getParent(),
	                                     std::next(VecI->getIterator()));
	            } else {
	              Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
	            }
	            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	            Ex = extend(ScalarRoot, Ex, Scalar->getType());
	            CSEBlocks.insert(PH->getIncomingBlock(i));
	            PH->setOperand(i, Ex);
	          }
	        }
	      } else {
	        Builder.SetInsertPoint(cast<Instruction>(User));
	        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	        Ex = extend(ScalarRoot, Ex, Scalar->getType());
	        CSEBlocks.insert(cast<Instruction>(User)->getParent());
	        User->replaceUsesOfWith(Scalar, Ex);
	      }
	    } else {
	      // The vector value is not an instruction; extract in the entry block.
	      Builder.SetInsertPoint(&F->getEntryBlock().front());
	      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	      Ex = extend(ScalarRoot, Ex, Scalar->getType());
	      CSEBlocks.insert(&F->getEntryBlock());
	      User->replaceUsesOfWith(Scalar, Ex);
	    }

	    DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
	  }

	  // For each vectorized value:
	  for (TreeEntry &EIdx : VectorizableTree) {
	    TreeEntry *Entry = &EIdx;

	    // No need to handle users of gathered values.
	    if (Entry->NeedToGather)
	      continue;

	    assert(Entry->VectorizedValue && "Can't find vectorizable value");

	    // For each lane:
	    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
	      Value *Scalar = Entry->Scalars[Lane];

	      Type *Ty = Scalar->getType();
	      if (!Ty->isVoidTy()) {
	#ifndef NDEBUG
	        for (User *U : Scalar->users()) {
	          DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

	          // It is legal to replace users in the ignorelist by undef.
	          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
	                 "Replacing out-of-tree value with undef");
	        }
	#endif
	        Value *Undef = UndefValue::get(Ty);
	        Scalar->replaceAllUsesWith(Undef);
	      }
	      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
	      eraseInstruction(cast<Instruction>(Scalar));
	    }
	  }

	  Builder.ClearInsertionPoint();

	  return VectorizableTree[0].VectorizedValue;
	}

	// Cleans up the insertelement/extractelement instructions recorded while
	// vectorizing.
	//
	// First, every insertelement in GatherSeq whose vector and element operands
	// are not defined inside the enclosing loop is moved to the loop preheader.
	// Second, the blocks in CSEBlocks are walked in reverse post-order and each
	// insertelement/extractelement that is identical to an already visited one in
	// a dominating block is replaced by it and erased. Both CSEBlocks and
	// GatherSeq are cleared at the end.
	void BoUpSLP::optimizeGatherSequence(Function &F) {
	  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
	               << " gather sequences instructions.\n");
	  // LICM InsertElementInst sequences.
	  for (Instruction *it : GatherSeq) {
	    InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);

	    if (!Insert)
	      continue;

	    // Check if this block is inside a loop.
	    Loop *L = LI->getLoopFor(Insert->getParent());
	    if (!L)
	      continue;

	    // Check if it has a preheader.
	    BasicBlock *PreHeader = L->getLoopPreheader();
	    if (!PreHeader)
	      continue;

	    // If the vector or the element that we insert into it are
	    // instructions that are defined in this basic block then we can't
	    // hoist this instruction.
	    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
	    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
	    if (CurrVec && L->contains(CurrVec))
	      continue;
	    if (NewElem && L->contains(NewElem))
	      continue;

	    // We can hoist this instruction. Move it to the pre-header.
	    Insert->moveBefore(PreHeader->getTerminator());
	  }

	  // Perform O(N^2) search over the gather sequences and merge identical
	  // instructions. TODO: We can further optimize this scan if we split the
	  // instructions into different buckets based on the insert lane.
	  SmallVector<Instruction *, 16> Visited;
	  ReversePostOrderTraversal<Function *> RPOT(&F);
	  for (auto BB : RPOT) {
	    // Traverse CSEBlocks by RPOT order.
	    if (!CSEBlocks.count(BB))
	      continue;

	    // For all instructions in blocks containing gather sequences:
	    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
	      // Advance the iterator before the instruction can be erased below.
	      Instruction *In = &*it++;
	      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
	        continue;

	      // Check if we can replace this instruction with any of the
	      // visited instructions.
	      for (Instruction *v : Visited) {
	        if (In->isIdenticalTo(v) &&
	            DT->dominates(v->getParent(), In->getParent())) {
	          In->replaceAllUsesWith(v);
	          eraseInstruction(In);
	          In = nullptr;
	          break;
	        }
	      }
	      if (In) {
	        assert(!is_contained(Visited, In));
	        Visited.push_back(In);
	      }
	    }
	  }
	  CSEBlocks.clear();
	  GatherSeq.clear();
	}

	// Groups the instructions to a bundle (which is then a single scheduling entity)
	// and schedules instructions until the bundle gets ready.
	bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
	BoUpSLP SLP, Value OpValue) {
	if (isa<PHINode>(OpValue))
	return true;

	// Initialize the instruction bundle.
	Instruction *OldScheduleEnd = ScheduleEnd;
	ScheduleData *PrevInBundle = nullptr;
	ScheduleData *Bundle = nullptr;
	bool ReSchedule = false;
	DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");

	// Make sure that the scheduling region contains all
	// instructions of the bundle.
	for (Value *V : VL) {
	if (!extendSchedulingRegion(V, OpValue))
	return false;
	}

	for (Value *V : VL) {
	ScheduleData *BundleMember = getScheduleData(V);
	assert(BundleMember &&
	"no ScheduleData for bundle member (maybe not in same basic block)");
	if (BundleMember->IsScheduled) {
	// A bundle member was scheduled as single instruction before and now
	// needs to be scheduled as part of the bundle. We just get rid of the
	// existing schedule.
	DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
	<< " was already scheduled\n");
	ReSchedule = true;
	}
	assert(BundleMember->isSchedulingEntity() &&
	"bundle member already part of other bundle");
	if (PrevInBundle) {
	PrevInBundle->NextInBundle = BundleMember;
	} else {
	Bundle = BundleMember;
	}
	BundleMember->UnscheduledDepsInBundle = 0;
	Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;

	// Group the instructions to a bundle.
	BundleMember->FirstInBundle = Bundle;
	PrevInBundle = BundleMember;
	}
	if (ScheduleEnd != OldScheduleEnd) {
	// The scheduling region got new instructions at the lower end (or it is a
	// new region for the first bundle). This makes it necessary to
	// recalculate all dependencies.
	// It is seldom that this needs to be done a second time after adding the
	// initial bundle to the region.
	for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
	doForAllOpcodes(I, [](ScheduleData *SD) {
	SD->clearDependencies();
	});
	}
	ReSchedule = true;
	}
	if (ReSchedule) {
	resetSchedule();
	initialFillReadyList(ReadyInsts);
	}

	DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
	<< BB->getName() << "\n");

	calculateDependencies(Bundle, true, SLP);

	// Now try to schedule the new bundle. As soon as the bundle is "ready" it
	// means that there are no cyclic dependencies and we can schedule it.
	// Note that's important that we don't "schedule" the bundle yet (see
	// cancelScheduling).
	while (!Bundle->isReady() && !ReadyInsts.empty()) {

	ScheduleData *pickedSD = ReadyInsts.back();
	ReadyInsts.pop_back();

	if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
	schedule(pickedSD, ReadyInsts);
	}
	}
	if (!Bundle->isReady()) {
	cancelScheduling(VL, OpValue);
	return false;
	}
	return true;
	}

	// Dissolves the (not yet scheduled) bundle identified by OpValue back into
	// stand-alone scheduling entities. Members left without unscheduled
	// dependencies are put on the ready list. PHI bundles are never scheduled, so
	// there is nothing to undo for them.
	void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
	                                                Value *OpValue) {
	  if (isa<PHINode>(OpValue))
	    return;

	  ScheduleData *Bundle = getScheduleData(OpValue);
	  DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
	  assert(!Bundle->IsScheduled &&
	         "Can't cancel bundle which is already scheduled");
	  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
	         "tried to unbundle something which is not a bundle");

	  // Un-bundle: make single instructions out of the bundle.
	  for (ScheduleData *Member = Bundle, *Following = nullptr; Member;
	       Member = Following) {
	    assert(Member->FirstInBundle == Bundle && "corrupt bundle links");
	    // Remember the successor before the link is cut.
	    Following = Member->NextInBundle;

	    // Each member becomes its own one-element bundle again.
	    Member->FirstInBundle = Member;
	    Member->NextInBundle = nullptr;
	    Member->UnscheduledDepsInBundle = Member->UnscheduledDeps;

	    if (Member->UnscheduledDepsInBundle == 0)
	      ReadyInsts.insert(Member);
	  }
	}

	// Hands out the next unused ScheduleData slot. Slots live in chunks of
	// ChunkSize elements; a fresh chunk is appended once the current one is used
	// up.
	BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
	  const bool CurrentChunkExhausted = ChunkPos >= ChunkSize;
	  if (CurrentChunkExhausted) {
	    // Start a new chunk and hand out its slots from the beginning.
	    ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
	    ChunkPos = 0;
	  }

	  ScheduleData *Slot = &(ScheduleDataChunks.back()[ChunkPos]);
	  ++ChunkPos;
	  return Slot;
	}

	// Makes sure the scheduling region [ScheduleStart, ScheduleEnd) contains the
	// instruction V, growing the region upwards or downwards as needed and
	// creating ScheduleData for every instruction that enters the region.
	//
	// \param V        the bundle member to cover; must be a non-PHI instruction.
	// \param OpValue  the value identifying the bundle V belongs to.
	// \returns true on success, false when growing the region would exceed
	//          ScheduleRegionSizeLimit.
	bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
	                                                      Value *OpValue) {
	  if (getScheduleData(V, isOneOf(OpValue, V)))
	    return true;
	  Instruction *I = dyn_cast<Instruction>(V);
	  assert(I && "bundle member must be an instruction");
	  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
	  // If I already has ScheduleData, allocate an additional ScheduleData for it
	  // keyed by OpValue in ExtraScheduleDataMap. Returns false when I has no
	  // ScheduleData yet.
	  auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
	    ScheduleData *ISD = getScheduleData(I);
	    if (!ISD)
	      return false;
	    assert(isInSchedulingRegion(ISD) &&
	           "ScheduleData not in scheduling region");
	    ScheduleData *SD = allocateScheduleDataChunks();
	    SD->Inst = I;
	    SD->init(SchedulingRegionID, OpValue);
	    ExtraScheduleDataMap[I][OpValue] = SD;
	    return true;
	  };
	  if (CheckSheduleForI(I))
	    return true;
	  if (!ScheduleStart) {
	    // It's the first instruction in the new region.
	    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
	    ScheduleStart = I;
	    ScheduleEnd = I->getNextNode();
	    if (isOneOf(OpValue, I) != I)
	      CheckSheduleForI(I);
	    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
	    DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
	    return true;
	  }
	  // Search up and down at the same time, because we don't know if the new
	  // instruction is above or below the existing scheduling region.
	  BasicBlock::reverse_iterator UpIter =
	      ++ScheduleStart->getIterator().getReverse();
	  BasicBlock::reverse_iterator UpperEnd = BB->rend();
	  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
	  BasicBlock::iterator LowerEnd = BB->end();
	  while (true) {
	    // Every search step counts against the region size budget.
	    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
	      DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
	      return false;
	    }

	    if (UpIter != UpperEnd) {
	      if (&*UpIter == I) {
	        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
	        ScheduleStart = I;
	        if (isOneOf(OpValue, I) != I)
	          CheckSheduleForI(I);
	        DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
	        return true;
	      }
	      UpIter++;
	    }
	    if (DownIter != LowerEnd) {
	      if (&*DownIter == I) {
	        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
	                         nullptr);
	        ScheduleEnd = I->getNextNode();
	        if (isOneOf(OpValue, I) != I)
	          CheckSheduleForI(I);
	        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
	        DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
	        return true;
	      }
	      DownIter++;
	    }
	    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
	           "instruction not found in block");
	  }
	  return true;
	}

	// Creates (or re-initializes) the ScheduleData of every instruction in
	// [FromI, ToI) for the current scheduling region and threads the memory
	// accessing instructions among them into the region's load/store list.
	//
	// \param FromI          first instruction to initialize.
	// \param ToI            one past the last instruction to initialize.
	// \param PrevLoadStore  memory access preceding the range in the list, or
	//                       nullptr if the range starts the list.
	// \param NextLoadStore  memory access following the range in the list, or
	//                       nullptr if the range ends the list.
	void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
	                                                Instruction *ToI,
	                                                ScheduleData *PrevLoadStore,
	                                                ScheduleData *NextLoadStore) {
	  ScheduleData *CurrentLoadStore = PrevLoadStore;
	  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
	    ScheduleData *SD = ScheduleDataMap[I];
	    if (!SD) {
	      SD = allocateScheduleDataChunks();
	      ScheduleDataMap[I] = SD;
	      SD->Inst = I;
	    }
	    assert(!isInSchedulingRegion(SD) &&
	           "new ScheduleData already in scheduling region");
	    SD->init(SchedulingRegionID, I);

	    // llvm.sideeffect intrinsics are deliberately kept out of the memory
	    // access list.
	    if (I->mayReadOrWriteMemory() &&
	        (!isa<IntrinsicInst>(I) ||
	         cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
	      // Update the linked list of memory accessing instructions.
	      if (CurrentLoadStore) {
	        CurrentLoadStore->NextLoadStore = SD;
	      } else {
	        FirstLoadStoreInRegion = SD;
	      }
	      CurrentLoadStore = SD;
	    }
	  }
	  // Splice the tail: either connect to the access that follows the range or
	  // record the new end of the list.
	  if (NextLoadStore) {
	    if (CurrentLoadStore)
	      CurrentLoadStore->NextLoadStore = NextLoadStore;
	  } else {
	    LastLoadStoreInRegion = CurrentLoadStore;
	  }
	}

	/// Compute the dependency counters of the bundle headed by \p SD and, through
	/// a work list, of every bundle it depends on whose dependencies are not
	/// valid yet.
	///
	/// For each bundle member without valid dependencies this counts def-use
	/// dependencies (users inside the scheduling region) and memory dependencies
	/// (later entries of the NextLoadStore chain), updating both Dependencies and
	/// the unscheduled-dependency counter.
	///
	/// \param SD Bundle to start from; must be a scheduling entity.
	/// \param InsertInReadyList If true, every processed bundle that reports
	///        isReady() is appended to ReadyInsts.
	/// \param SLP Vectorizer instance supplying alias analysis (AA, isAliased).
	void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
	                                                     bool InsertInReadyList,
	                                                     BoUpSLP *SLP) {
	  assert(SD->isSchedulingEntity());

	  SmallVector<ScheduleData *, 10> WorkList;
	  WorkList.push_back(SD);

	  while (!WorkList.empty()) {
	    // This SD shadows the parameter: from here on it is the bundle that was
	    // just taken off the work list.
	    ScheduleData *SD = WorkList.back();
	    WorkList.pop_back();

	    ScheduleData *BundleMember = SD;
	    while (BundleMember) {
	      assert(isInSchedulingRegion(BundleMember));
	      if (!BundleMember->hasValidDependencies()) {

	        DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
	        BundleMember->Dependencies = 0;
	        BundleMember->resetUnscheduledDeps();

	        // Handle def-use chain dependencies.
	        if (BundleMember->OpValue != BundleMember->Inst) {
	          // The entry's OpValue differs from its instruction: it depends on
	          // the ScheduleData registered for the instruction itself.
	          ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
	          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
	            BundleMember->Dependencies++;
	            ScheduleData *DestBundle = UseSD->FirstInBundle;
	            if (!DestBundle->IsScheduled)
	              BundleMember->incrementUnscheduledDeps(1);
	            if (!DestBundle->hasValidDependencies())
	              WorkList.push_back(DestBundle);
	          }
	        } else {
	          for (User *U : BundleMember->Inst->users()) {
	            if (isa<Instruction>(U)) {
	              ScheduleData *UseSD = getScheduleData(U);
	              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
	                BundleMember->Dependencies++;
	                ScheduleData *DestBundle = UseSD->FirstInBundle;
	                if (!DestBundle->IsScheduled)
	                  BundleMember->incrementUnscheduledDeps(1);
	                if (!DestBundle->hasValidDependencies())
	                  WorkList.push_back(DestBundle);
	              }
	            } else {
	              // I'm not sure if this can ever happen. But we need to be safe.
	              // This lets the instruction/bundle never be scheduled and
	              // eventually disable vectorization.
	              BundleMember->Dependencies++;
	              BundleMember->incrementUnscheduledDeps(1);
	            }
	          }
	        }

	        // Handle the memory dependencies.
	        ScheduleData *DepDest = BundleMember->NextLoadStore;
	        if (DepDest) {
	          Instruction *SrcInst = BundleMember->Inst;
	          MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
	          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
	          unsigned numAliased = 0;
	          unsigned DistToSrc = 1;

	          while (DepDest) {
	            assert(isInSchedulingRegion(DepDest));

	            // We have two limits to reduce the complexity:
	            // 1) AliasedCheckLimit: It's a small limit to reduce calls to
	            //    SLP->isAliased (which is the expensive part in this loop).
	            // 2) MaxMemDepDistance: It's for very large blocks and it aborts
	            //    the whole loop (even if the loop is fast, it's quadratic).
	            //    It's important for the loop break condition (see below) to
	            //    check this limit even between two read-only instructions.
	            if (DistToSrc >= MaxMemDepDistance ||
	                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
	                 (numAliased >= AliasedCheckLimit ||
	                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

	              // We increment the counter only if the locations are aliased
	              // (instead of counting all alias checks). This gives a better
	              // balance between reduced runtime and accurate dependencies.
	              numAliased++;

	              DepDest->MemoryDependencies.push_back(BundleMember);
	              BundleMember->Dependencies++;
	              ScheduleData *DestBundle = DepDest->FirstInBundle;
	              if (!DestBundle->IsScheduled) {
	                BundleMember->incrementUnscheduledDeps(1);
	              }
	              if (!DestBundle->hasValidDependencies()) {
	                WorkList.push_back(DestBundle);
	              }
	            }
	            DepDest = DepDest->NextLoadStore;

	            // Example, explaining the loop break condition: Let's assume our
	            // starting instruction is i0 and MaxMemDepDistance = 3.
	            //
	            //                      +--------v--v--v
	            //             i0,i1,i2,i3,i4,i5,i6,i7,i8
	            //             +--------^--^--^
	            //
	            // MaxMemDepDistance let us stop alias-checking at i3 and we add
	            // dependencies from i0 to i3,i4,.. (even if they are not aliased).
	            // Previously we already added dependencies from i3 to i6,i7,i8
	            // (because of MaxMemDepDistance). As we added a dependency from
	            // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
	            // and we can abort this loop at i6.
	            if (DistToSrc >= 2 * MaxMemDepDistance)
	              break;
	            DistToSrc++;
	          }
	        }
	      }
	      BundleMember = BundleMember->NextInBundle;
	    }
	    if (InsertInReadyList && SD->isReady()) {
	      ReadyInsts.push_back(SD);
	      DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
	    }
	  }
	}

	void BoUpSLP::BlockScheduling::resetSchedule() {
	assert(ScheduleStart &&
	"tried to reset schedule on block which has not been scheduled");
	for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
	doForAllOpcodes(I, [&](ScheduleData *SD) {
	assert(isInSchedulingRegion(SD) &&
	"ScheduleData not in scheduling region");
	SD->IsScheduled = false;
	SD->resetUnscheduledDeps();
	});
	}
	ReadyInsts.clear();
	}

	/// Perform the final scheduling of the region recorded in \p BS: reset the
	/// schedule, assign priorities in original instruction order, compute
	/// dependencies, then repeatedly pick a ready bundle and move its
	/// instructions into place. Does nothing if \p BS has no scheduling region;
	/// clears BS->ScheduleStart when done so the block is not scheduled twice.
	void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
	  if (!BS->ScheduleStart)
	    return;

	  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

	  BS->resetSchedule();

	  // For the real scheduling we use a more sophisticated ready-list: it is
	  // sorted by the original instruction location. This lets the final schedule
	  // be as close as possible to the original instruction order.
	  struct ScheduleDataCompare {
	    // The set below stores ScheduleData pointers, so the comparator must take
	    // pointers; a larger SchedulingPriority sorts first.
	    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
	      return SD2->SchedulingPriority < SD1->SchedulingPriority;
	    }
	  };
	  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

	  // Ensure that all dependency data is updated and fill the ready-list with
	  // initial instructions.
	  int Idx = 0;
	  int NumToSchedule = 0;
	  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
	       I = I->getNextNode()) {
	    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
	      assert(SD->isPartOfBundle() ==
	                 (getTreeEntry(SD->Inst) != nullptr) &&
	             "scheduler and vectorizer bundle mismatch");
	      SD->FirstInBundle->SchedulingPriority = Idx++;
	      if (SD->isSchedulingEntity()) {
	        BS->calculateDependencies(SD, false, this);
	        NumToSchedule++;
	      }
	    });
	  }
	  BS->initialFillReadyList(ReadyInsts);

	  Instruction *LastScheduledInst = BS->ScheduleEnd;

	  // Do the "real" scheduling.
	  while (!ReadyInsts.empty()) {
	    // Take the first element of the ordered ready set (dereference the
	    // iterator to obtain the stored pointer).
	    ScheduleData *picked = *ReadyInsts.begin();
	    ReadyInsts.erase(ReadyInsts.begin());

	    // Move the scheduled instruction(s) to their dedicated places, if not
	    // there yet.
	    ScheduleData *BundleMember = picked;
	    while (BundleMember) {
	      Instruction *pickedInst = BundleMember->Inst;
	      if (LastScheduledInst->getNextNode() != pickedInst) {
	        BS->BB->getInstList().remove(pickedInst);
	        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
	                                     pickedInst);
	      }
	      LastScheduledInst = pickedInst;
	      BundleMember = BundleMember->NextInBundle;
	    }

	    BS->schedule(picked, ReadyInsts);
	    NumToSchedule--;
	  }
	  assert(NumToSchedule == 0 && "could not schedule all instructions");

	  // Avoid duplicate scheduling of the block.
	  BS->ScheduleStart = nullptr;
	}

	/// Return the element width, in bits, to use when vectorizing \p V.
	///
	/// For a store this is the width of the stored value. Otherwise the
	/// expression feeding \p V is walked and the widest load found determines
	/// the result; if no load is found, or an unhandled or vector-typed
	/// instruction is met, the width of V's own type is returned.
	unsigned BoUpSLP::getVectorElementSize(Value *V) {
	  // If V is a store, just return the width of the stored value without
	  // traversing the expression tree. This is the common case.
	  if (auto *Store = dyn_cast<StoreInst>(V))
	    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

	  // If V is not a store, we can traverse the expression tree to find loads
	  // that feed it. The type of the loaded value may indicate a more suitable
	  // width than V's type. We want to base the vector element size on the width
	  // of memory operations where possible.
	  SmallVector<Instruction *, 16> Worklist;
	  SmallPtrSet<Instruction *, 16> Visited;
	  if (auto *I = dyn_cast<Instruction>(V)) {
	    Worklist.push_back(I);
	    Visited.insert(I);
	  }

	  // Traverse the expression tree in bottom-up order looking for loads. If we
	  // encounter an instruction we don't yet handle, we give up.
	  auto MaxWidth = 0u;
	  auto FoundUnknownInst = false;
	  while (!Worklist.empty() && !FoundUnknownInst) {
	    auto *I = Worklist.pop_back_val();

	    // We should only be looking at scalar instructions here. If the current
	    // instruction has a vector type, give up.
	    auto *Ty = I->getType();
	    if (isa<VectorType>(Ty))
	      FoundUnknownInst = true;

	    // If the current instruction is a load, update MaxWidth to reflect the
	    // width of the loaded value.
	    else if (isa<LoadInst>(I))
	      MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));

	    // Otherwise, we need to visit the operands of the instruction. We only
	    // handle the interesting cases from buildTree here. If an operand is an
	    // instruction we haven't yet visited, we add it to the worklist.
	    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
	             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
	      // Mark operands as visited when they are enqueued so that an
	      // instruction used several times is processed only once.
	      for (Use &U : I->operands())
	        if (auto *J = dyn_cast<Instruction>(U.get()))
	          if (Visited.insert(J).second)
	            Worklist.push_back(J);
	    }

	    // If we don't yet handle the instruction, give up.
	    else
	      FoundUnknownInst = true;
	  }

	  // If we didn't encounter a memory access in the expression tree, or if we
	  // gave up for some reason, just return the width of V.
	  if (!MaxWidth || FoundUnknownInst)
	    return DL->getTypeSizeInBits(V->getType());

	  // Otherwise, return the maximum width we found.
	  return MaxWidth;
	}

	// Determine if a value V in a vectorizable expression Expr can be demoted to a
	// smaller type with a truncation. We collect the values that will be demoted
	// in ToDemote and additional roots that require investigating in Roots.
	static bool collectValuesToDemote(Value V, SmallPtrSetImpl<Value > &Expr,
	SmallVectorImpl<Value *> &ToDemote,
	SmallVectorImpl<Value *> &Roots) {
	// We can always demote constants.
	if (isa<Constant>(V)) {
	ToDemote.push_back(V);
	return true;
	}

	// If the value is not an instruction in the expression with only one use, it
	// cannot be demoted.
	auto *I = dyn_cast<Instruction>(V);
	if (!I \|\| !I->hasOneUse() \|\| !Expr.count(I))
	return false;

	switch (I->getOpcode()) {

	// We can always demote truncations and extensions. Since truncations can
	// seed additional demotion, we save the truncated value.
	case Instruction::Trunc:
	Roots.push_back(I->getOperand(0));
	break;
	case Instruction::ZExt:
	case Instruction::SExt:
	break;

	// We can demote certain binary operations if we can demote both of their
	// operands.
	case Instruction::Add:
	case Instruction::Sub:
	case Instruction::Mul:
	case Instruction::And:
	case Instruction::Or:
	case Instruction::Xor:
	if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) \|\|
	!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
	return false;
	break;

	// We can demote selects if we can demote their true and false values.
	case Instruction::Select: {
	SelectInst *SI = cast<SelectInst>(I);
	if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) \|\|
	!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
	return false;
	break;
	}

	// We can demote phis if we can demote all their incoming operands. Note that
	// we don't need to worry about cycles since we ensure single use above.
	case Instruction::PHI: {
	PHINode *PN = cast<PHINode>(I);
	for (Value *IncValue : PN->incoming_values())
	if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
	return false;
	break;
	}

	// Otherwise, conservatively give up.
	default:
	return false;
	}

	// Record the value that we can demote.
	ToDemote.push_back(V);
	return true;
	}

	/// Determine whether the integer expression tree can be computed in a
	/// narrower type and, if so, record in MinBWs, for every demotable scalar,
	/// the pair (bit width, needs sign extension).
	///
	/// Returns without recording anything if the tree has no external uses, is
	/// not rooted in integers, has externally used non-root entries, or cannot
	/// be narrowed below the width of the root type.
	void BoUpSLP::computeMinimumValueSizes() {
	  // If there are no external uses, the expression tree must be rooted by a
	  // store. We can't demote in-memory values, so there is nothing to do here.
	  if (ExternalUses.empty())
	    return;

	  // We only attempt to truncate integer expressions.
	  auto &TreeRoot = VectorizableTree[0].Scalars;
	  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
	  if (!TreeRootIT)
	    return;

	  // If the expression is not rooted by a store, these roots should have
	  // external uses. We will rely on InstCombine to rewrite the expression in
	  // the narrower type. However, InstCombine only rewrites single-use values.
	  // This means that if a tree entry other than a root is used externally, it
	  // must have multiple uses and InstCombine will not rewrite it. The code
	  // below ensures that only the roots are used externally.
	  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
	  for (auto &EU : ExternalUses)
	    if (!Expr.erase(EU.Scalar))
	      return;
	  if (!Expr.empty())
	    return;

	  // Collect the scalar values of the vectorizable expression. We will use this
	  // context to determine which values can be demoted. If we see a truncation,
	  // we mark it as seeding another demotion.
	  for (auto &Entry : VectorizableTree)
	    Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());

	  // Ensure the roots of the vectorizable tree don't form a cycle. They must
	  // have a single external user that is not in the vectorizable tree.
	  for (auto *Root : TreeRoot)
	    if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
	      return;

	  // Conservatively determine if we can actually truncate the roots of the
	  // expression. Collect the values that can be demoted in ToDemote and
	  // additional roots that require investigating in Roots.
	  SmallVector<Value *, 32> ToDemote;
	  SmallVector<Value *, 4> Roots;
	  for (auto *Root : TreeRoot)
	    if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
	      return;

	  // The maximum bit width required to represent all the values that can be
	  // demoted without loss of precision. It would be safe to truncate the roots
	  // of the expression to this width.
	  auto MaxBitWidth = 8u;

	  // We first check if all the bits of the roots are demanded. If they're not,
	  // we can truncate the roots to this narrower type.
	  for (auto *Root : TreeRoot) {
	    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
	    MaxBitWidth = std::max<unsigned>(
	        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
	  }

	  // True if the roots can be zero-extended back to their original type, rather
	  // than sign-extended. We know that if the leading bits are not demanded, we
	  // can safely zero-extend. So we initialize IsKnownPositive to True.
	  bool IsKnownPositive = true;

	  // If all the bits of the roots are demanded, we can try a little harder to
	  // compute a narrower type. This can happen, for example, if the roots are
	  // getelementptr indices. InstCombine promotes these indices to the pointer
	  // width. Thus, all their bits are technically demanded even though the
	  // address computation might be vectorized in a smaller type.
	  //
	  // We start by looking at each entry that can be demoted. We compute the
	  // maximum bit width required to store the scalar by using ValueTracking to
	  // compute the number of high-order bits we can truncate.
	  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
	    MaxBitWidth = 8u;

	    // Determine if the sign bit of all the roots is known to be zero. If not,
	    // IsKnownPositive is set to False.
	    IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
	      KnownBits Known = computeKnownBits(R, *DL);
	      return Known.isNonNegative();
	    });

	    // Determine the maximum number of bits required to store the scalar
	    // values.
	    for (auto *Scalar : ToDemote) {
	      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
	      auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
	      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
	    }

	    // If we can't prove that the sign bit is zero, we must add one to the
	    // maximum bit width to account for the unknown sign bit. This preserves
	    // the existing sign bit so we can safely sign-extend the root back to the
	    // original type. Otherwise, if we know the sign bit is zero, we will
	    // zero-extend the root instead.
	    //
	    // FIXME: This is somewhat suboptimal, as there will be cases where adding
	    //        one to the maximum bit width will yield a larger-than-necessary
	    //        type. In general, we need to add an extra bit only if we can't
	    //        prove that the upper bit of the original type is equal to the
	    //        upper bit of the proposed smaller type. If these two bits are the
	    //        same (either zero or one) we know that sign-extending from the
	    //        smaller type will result in the same value. Here, since we can't
	    //        yet prove this, we are just making the proposed smaller type
	    //        larger to ensure correctness.
	    if (!IsKnownPositive)
	      ++MaxBitWidth;
	  }

	  // Round MaxBitWidth up to the next power-of-two.
	  if (!isPowerOf2_64(MaxBitWidth))
	    MaxBitWidth = NextPowerOf2(MaxBitWidth);

	  // If the maximum bit width we compute is less than the width of the roots'
	  // type, we can proceed with the narrowing. Otherwise, do nothing.
	  if (MaxBitWidth >= TreeRootIT->getBitWidth())
	    return;

	  // If we can truncate the root, we must collect additional values that might
	  // be demoted as a result. That is, those seeded by truncations we will
	  // modify.
	  while (!Roots.empty())
	    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);

	  // Finally, map the values we can demote to the maximum bit width we
	  // computed. The second member of the pair is true when the value must be
	  // sign-extended (rather than zero-extended) back to its original type.
	  for (auto *Scalar : ToDemote)
	    MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
	}

	namespace {

	/// The SLPVectorizer Pass.
	struct SLPVectorizer : public FunctionPass {
	SLPVectorizerPass Impl;

	/// Pass identification, replacement for typeid
	static char ID;

	explicit SLPVectorizer() : FunctionPass(ID) {
	initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
	}

	bool doInitialization(Module &M) override {
	return false;
	}

	bool runOnFunction(Function &F) override {
	if (skipFunction(F))
	return false;

	auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
	auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
	auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
	auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
	auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
	auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
	auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
	auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

	return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	FunctionPass::getAnalysisUsage(AU);
	AU.addRequired<AssumptionCacheTracker>();
	AU.addRequired<ScalarEvolutionWrapperPass>();
	AU.addRequired<AAResultsWrapperPass>();
	AU.addRequired<TargetTransformInfoWrapperPass>();
	AU.addRequired<LoopInfoWrapperPass>();
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<DemandedBitsWrapperPass>();
	AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
	AU.addPreserved<LoopInfoWrapperPass>();
	AU.addPreserved<DominatorTreeWrapperPass>();
	AU.addPreserved<AAResultsWrapperPass>();
	AU.addPreserved<GlobalsAAWrapperPass>();
	AU.setPreservesCFG();
	}
	};

	} // end anonymous namespace

	/// Entry point taking a FunctionAnalysisManager: fetch the analyses for
	/// \p F, run the vectorizer, and report which analyses remain valid.
	PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
	  auto &SERef = AM.getResult<ScalarEvolutionAnalysis>(F);
	  auto &TTIRef = AM.getResult<TargetIRAnalysis>(F);
	  // Queried with getCachedResult rather than getResult, unlike the others.
	  auto *CachedTLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
	  auto &AARef = AM.getResult<AAManager>(F);
	  auto &LIRef = AM.getResult<LoopAnalysis>(F);
	  auto &DTRef = AM.getResult<DominatorTreeAnalysis>(F);
	  auto &ACRef = AM.getResult<AssumptionAnalysis>(F);
	  auto &DBRef = AM.getResult<DemandedBitsAnalysis>(F);
	  auto &ORERef = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

	  // Nothing changed: every analysis is still valid.
	  if (!runImpl(F, &SERef, &TTIRef, CachedTLI, &AARef, &LIRef, &DTRef, &ACRef,
	               &DBRef, &ORERef))
	    return PreservedAnalyses::all();

	  PreservedAnalyses Preserved;
	  Preserved.preserveSet<CFGAnalyses>();
	  Preserved.preserve<AAManager>();
	  Preserved.preserve<GlobalsAA>();
	  return Preserved;
	}

	bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
	TargetTransformInfo *TTI_,
	TargetLibraryInfo TLI_, AliasAnalysis AA_,
	LoopInfo LI_, DominatorTree DT_,
	AssumptionCache AC_, DemandedBits DB_,
	OptimizationRemarkEmitter *ORE_) {
	SE = SE_;
	TTI = TTI_;
	TLI = TLI_;
	AA = AA_;
	LI = LI_;
	DT = DT_;
	AC = AC_;
	DB = DB_;
	DL = &F.getParent()->getDataLayout();

	Stores.clear();
	GEPs.clear();
	bool Changed = false;

	// If the target claims to have no vector registers don't attempt
	// vectorization.
	if (!TTI->getNumberOfRegisters(true))
	return false;

	// Don't vectorize when the attribute NoImplicitFloat is used.
	if (F.hasFnAttribute(Attribute::NoImplicitFloat))
	return false;

	DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

	// Use the bottom up slp vectorizer to construct chains that start with
	// store instructions.
	BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

	// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
	// delete instructions.

	// Scan the blocks in the function in post order.
	for (auto BB : post_order(&F.getEntryBlock())) {
	collectSeedInstructions(BB);

	// Vectorize trees that end at stores.
	if (!Stores.empty()) {
	DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
	<< " underlying objects.\n");
	Changed \|= vectorizeStoreChains(R);
	}

	// Vectorize trees that end at reductions.
	Changed \|= vectorizeChainsInBlock(BB, R);

	// Vectorize the index computations of getelementptr instructions. This
	// is primarily intended to catch gather-like idioms ending at
	// non-consecutive loads.
	if (!GEPs.empty()) {
	DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
	<< " underlying objects.\n");
	Changed \|= vectorizeGEPIndices(BB, R);
	}
	}

	if (Changed) {
	R.optimizeGatherSequence(F);
	DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
	DEBUG(verifyFunction(F));
	}
	return Changed;
	}

	/// \brief Report whether any Value in a slice of \p VL no longer matches the
	/// value tracked at the same position of \p VH.
	///
	/// Vectorizing part of the VL array may invalidate later values in it; the
	/// WeakTrackingVH array records when that has happened.
	///
	/// \param SliceBegin Index of the first element to compare.
	/// \param SliceSize Number of elements to compare.
	/// \returns true if at least one position differs.
	static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
	                               ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
	                               unsigned SliceSize) {
	  ArrayRef<Value *> Values = VL.slice(SliceBegin, SliceSize);
	  ArrayRef<WeakTrackingVH> Handles = VH.slice(SliceBegin, SliceSize);

	  for (unsigned Pos = 0, End = Values.size(); Pos != End; ++Pos) {
	    // The handle converts to the Value it currently tracks.
	    Value *Tracked = Handles[Pos];
	    if (Values[Pos] != Tracked)
	      return true;
	  }
	  return false;
	}

	/// Try to vectorize windows of a chain of consecutive stores.
	///
	/// The vectorization factor is VecRegSize divided by the element size of the
	/// first store. Every window of VF stores is costed, starting at offset
	/// zero, and vectorized when the tree cost is below -SLPCostThreshold.
	///
	/// \param Chain The stores, in chain order.
	/// \param R The vectorizer used to build, cost and emit the tree.
	/// \param VecRegSize Vector register size to target, in the same unit as
	///        getVectorElementSize() (bits).
	/// \returns true if at least one window was vectorized.
	bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
	                                            unsigned VecRegSize) {
	  unsigned ChainLen = Chain.size();
	  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
	               << "\n");
	  unsigned Sz = R.getVectorElementSize(Chain[0]);

	  // Validate the element size before dividing by it, so a zero size is
	  // rejected here instead of causing a division by zero.
	  if (!isPowerOf2_32(Sz))
	    return false;

	  unsigned VF = VecRegSize / Sz;
	  if (VF < 2)
	    return false;

	  // Keep track of values that were deleted by vectorizing in the loop below.
	  SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());

	  bool Changed = false;
	  // Look for profitable vectorizable trees at all offsets, starting at zero.
	  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
	    if (i + VF > e)
	      break;

	    // Check that a previous iteration of this loop did not delete the Value.
	    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
	      continue;

	    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
	                 << "\n");
	    ArrayRef<Value *> Operands = Chain.slice(i, VF);

	    R.buildTree(Operands);
	    if (R.isTreeTinyAndNotFullyVectorizable())
	      continue;

	    R.computeMinimumValueSizes();

	    int Cost = R.getTreeCost();

	    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
	    if (Cost < -SLPCostThreshold) {
	      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");

	      using namespace ore;

	      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
	                                          cast<StoreInst>(Chain[i]))
	                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
	                       << " and with tree size "
	                       << NV("TreeSize", R.getTreeSize()));

	      R.vectorizeTree();

	      // Move to the next bundle.
	      i += VF - 1;
	      Changed = true;
	    }
	  }

	  return Changed;
	}

	/// Group the given stores into chains of consecutive accesses and try to
	/// vectorize each chain, from the maximum vector register size down to the
	/// minimum, halving the size after each failed attempt.
	///
	/// \param Stores Candidate stores to pair up.
	/// \param R The vectorizer used for the individual chains.
	/// \returns true if any chain was vectorized.
	bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
	                                        BoUpSLP &R) {
	  // Fewer than two stores cannot form a chain. Returning early also keeps
	  // "E - 1" below from wrapping around when the list is empty.
	  unsigned E = Stores.size();
	  if (E < 2)
	    return false;

	  SetVector<StoreInst *> Heads;
	  SmallDenseSet<StoreInst *> Tails;
	  // Maps a store to the store that follows it in its chain.
	  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

	  // We may run into multiple chains that merge into a single chain. We mark the
	  // stores that we vectorized so that we don't visit the same store twice.
	  BoUpSLP::ValueSet VectorizedStores;
	  bool Changed = false;

	  // Do a quadratic search on all of the given stores in reverse order and find
	  // all of the pairs of stores that follow each other.
	  SmallVector<unsigned, 16> IndexQueue;
	  IndexQueue.resize(E - 1);
	  for (unsigned I = E; I > 0; --I) {
	    unsigned Idx = I - 1;
	    // If a store has multiple consecutive store candidates, search Stores
	    // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
	    // This is because usually pairing with immediate succeeding or preceding
	    // candidate create the best chance to find slp vectorization opportunity.
	    unsigned Offset = 1;
	    unsigned Cnt = 0;
	    for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
	      if (Idx >= Offset) {
	        IndexQueue[Cnt] = Idx - Offset;
	        ++Cnt;
	      }
	      if (Idx + Offset < E) {
	        IndexQueue[Cnt] = Idx + Offset;
	        ++Cnt;
	      }
	    }

	    // Link Stores[Idx] behind the first candidate it directly follows.
	    for (auto K : IndexQueue) {
	      if (isConsecutiveAccess(Stores[K], Stores[Idx], DL, SE)) {
	        Tails.insert(Stores[Idx]);
	        Heads.insert(Stores[K]);
	        ConsecutiveChain[Stores[K]] = Stores[Idx];
	        break;
	      }
	    }
	  }

	  // For stores that start but don't end a link in the chain:
	  for (auto *SI : llvm::reverse(Heads)) {
	    if (Tails.count(SI))
	      continue;

	    // We found a store instr that starts a chain. Now follow the chain and try
	    // to vectorize it.
	    BoUpSLP::ValueList Operands;
	    StoreInst *I = SI;
	    // Collect the chain into a list.
	    while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
	      Operands.push_back(I);
	      // Move to the next value in the chain.
	      I = ConsecutiveChain[I];
	    }

	    // FIXME: Is division-by-2 the correct step? Should we assert that the
	    // register size is a power-of-2?
	    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
	         Size /= 2) {
	      if (vectorizeStoreChain(Operands, R, Size)) {
	        // Mark the vectorized stores so that we don't vectorize them again.
	        VectorizedStores.insert(Operands.begin(), Operands.end());
	        Changed = true;
	        break;
	      }
	    }
	  }

	  return Changed;
	}

	/// Rebuild the Stores and GEPs seed collections from the instructions of
	/// \p BB. Both collections are cleared first; entries are keyed by the
	/// underlying object of the instruction's pointer operand.
	void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
	  // Initialize the collections. We will make a single pass over the block.
	  Stores.clear();
	  GEPs.clear();

	  // Visit the store and getelementptr instructions in BB and organize them in
	  // Stores and GEPs according to the underlying objects of their pointer
	  // operands.
	  for (Instruction &I : *BB) {
	    // Ignore store instructions that are not simple (e.g. volatile) or whose
	    // stored value is not of a valid element type.
	    if (auto *SI = dyn_cast<StoreInst>(&I)) {
	      if (!SI->isSimple())
	        continue;
	      if (!isValidElementType(SI->getValueOperand()->getType()))
	        continue;
	      Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
	    }

	    // Ignore getelementptr instructions that do not have exactly one index,
	    // have a constant index, an index of an invalid element type, or a
	    // vector result type.
	    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
	      // Check the index count before touching idx_begin(): for a GEP
	      // without indices idx_begin() must not be dereferenced.
	      if (GEP->getNumIndices() != 1)
	        continue;
	      auto Idx = GEP->idx_begin()->get();
	      if (isa<Constant>(Idx))
	        continue;
	      if (!isValidElementType(Idx->getType()))
	        continue;
	      if (GEP->getType()->isVectorTy())
	        continue;
	      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
	    }
	  }
	}

	bool SLPVectorizerPass::tryToVectorizePair(Value A, Value B, BoUpSLP &R) {
	if (!A \|\| !B)
	return false;
	Value *VL[] = { A, B };
	- return tryToVectorizeList(VL, R, None, true);
	+ return tryToVectorizeList(VL, R, true);
	}

	bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
	- ArrayRef<Value *> BuildVector,
	- bool AllowReorder,
	- bool NeedExtraction) {
	+ bool AllowReorder) {
	if (VL.size() < 2)
	return false;

	DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
	<< ".\n");

	// Check that all of the parts are scalar instructions of the same type.
	Instruction *I0 = dyn_cast<Instruction>(VL[0]);
	if (!I0)
	return false;

	unsigned Opcode0 = I0->getOpcode();

	unsigned Sz = R.getVectorElementSize(I0);
	unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
	unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
	if (MaxVF < 2) {
	R.getORE()->emit([&]() {
	return OptimizationRemarkMissed(
	SV_NAME, "SmallVF", I0)
	<< "Cannot SLP vectorize list: vectorization factor "
	<< "less than 2 is not supported";
	});
	return false;
	}

	for (Value *V : VL) {
	Type *Ty = V->getType();
	if (!isValidElementType(Ty)) {
	// NOTE: the following will give user internal llvm type name, which may not be useful
	R.getORE()->emit([&]() {
	std::string type_str;
	llvm::raw_string_ostream rso(type_str);
	Ty->print(rso);
	return OptimizationRemarkMissed(
	SV_NAME, "UnsupportedType", I0)
	<< "Cannot SLP vectorize list: type "
	<< rso.str() + " is unsupported by vectorizer";
	});
	return false;
	}
	Instruction *Inst = dyn_cast<Instruction>(V);

	if (!Inst)
	return false;
	if (Inst->getOpcode() != Opcode0) {
	R.getORE()->emit([&]() {
	return OptimizationRemarkMissed(
	SV_NAME, "InequableTypes", I0)
	<< "Cannot SLP vectorize list: not all of the "
	<< "parts of scalar instructions are of the same type: "
	<< ore::NV("Instruction1Opcode", I0) << " and "
	<< ore::NV("Instruction2Opcode", Inst);
	});
	return false;
	}
	}

	bool Changed = false;
	bool CandidateFound = false;
	int MinCost = SLPCostThreshold;

	// Keep track of values that were deleted by vectorizing in the loop below.
	SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());

	unsigned NextInst = 0, MaxInst = VL.size();
	for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
	VF /= 2) {
	// No actual vectorization should happen, if number of parts is the same as
	// provided vectorization factor (i.e. the scalar type is used for vector
	// code during codegen).
	auto *VecTy = VectorType::get(VL[0]->getType(), VF);
	if (TTI->getNumberOfParts(VecTy) == VF)
	continue;
	for (unsigned I = NextInst; I < MaxInst; ++I) {
	unsigned OpsWidth = 0;

	if (I + VF > MaxInst)
	OpsWidth = MaxInst - I;
	else
	OpsWidth = VF;

	if (!isPowerOf2_32(OpsWidth) \|\| OpsWidth < 2)
	break;

	// Check that a previous iteration of this loop did not delete the Value.
	if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
	continue;

	DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
	<< "\n");
	ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

	- ArrayRef<Value *> EmptyArray;
	- ArrayRef<Value *> BuildVectorSlice;
	- if (!BuildVector.empty())
	- BuildVectorSlice = BuildVector.slice(I, OpsWidth);
	-
	- R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
	+ R.buildTree(Ops);
	// TODO: check if we can allow reordering for more cases.
	if (AllowReorder && R.shouldReorder()) {
	// Conceptually, there is nothing actually preventing us from trying to
	// reorder a larger list. In fact, we do exactly this when vectorizing
	// reductions. However, at this point, we only expect to get here when
	// there are exactly two operations.
	assert(Ops.size() == 2);
	- assert(BuildVectorSlice.empty());
	Value *ReorderedOps[] = {Ops[1], Ops[0]};
	R.buildTree(ReorderedOps, None);
	}
	if (R.isTreeTinyAndNotFullyVectorizable())
	continue;

	R.computeMinimumValueSizes();
	int Cost = R.getTreeCost();
	CandidateFound = true;
	MinCost = std::min(MinCost, Cost);

	if (Cost < -SLPCostThreshold) {
	DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
	R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
	cast<Instruction>(Ops[0]))
	<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
	<< " and with tree size "
	<< ore::NV("TreeSize", R.getTreeSize()));

	- Value *VectorizedRoot = R.vectorizeTree();
	-
	- // Reconstruct the build vector by extracting the vectorized root. This
	- // way we handle the case where some elements of the vector are
	- // undefined.
	- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
	- if (!BuildVectorSlice.empty()) {
	- // The insert point is the last build vector instruction. The
	- // vectorized root will precede it. This guarantees that we get an
	- // instruction. The vectorized tree could have been constant folded.
	- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
	- unsigned VecIdx = 0;
	- for (auto &V : BuildVectorSlice) {
	- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
	- ++BasicBlock::iterator(InsertAfter));
	- Instruction *I = cast<Instruction>(V);
	- assert(isa<InsertElementInst>(I) \|\| isa<InsertValueInst>(I));
	- Instruction *Extract =
	- cast<Instruction>(Builder.CreateExtractElement(
	- VectorizedRoot, Builder.getInt32(VecIdx++)));
	- I->setOperand(1, Extract);
	- I->moveAfter(Extract);
	- InsertAfter = I;
	- }
	- }
	+ R.vectorizeTree();
	// Move to the next bundle.
	I += VF - 1;
	NextInst = I + 1;
	Changed = true;
	}
	}
	}

	if (!Changed && CandidateFound) {
	R.getORE()->emit([&]() {
	return OptimizationRemarkMissed(
	SV_NAME, "NotBeneficial", I0)
	<< "List vectorization was possible but not beneficial with cost "
	<< ore::NV("Cost", MinCost) << " >= "
	<< ore::NV("Treshold", -SLPCostThreshold);
	});
	} else if (!Changed) {
	R.getORE()->emit([&]() {
	return OptimizationRemarkMissed(
	SV_NAME, "NotPossible", I0)
	<< "Cannot SLP vectorize list: vectorization was impossible"
	<< " with available vectorization factors";
	});
	}
	return Changed;
	}

	/// Try to vectorize the operands of the binary operator or compare \p I.
	///
	/// Both operands of \p I must be instructions located in the same basic block
	/// as \p I. The pair (Op0, Op1) is tried first. If that fails, a single-use
	/// binary-operator operand is skipped on either side and each of its own
	/// binary-operator operands (from the same block) is paired with the other
	/// side instead.
	///
	/// \returns true as soon as tryToVectorizePair succeeds for one of the
	/// attempted pairs, false otherwise.
	bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
	  if (!I)
	    return false;

	  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
	    return false;

	  Value *P = I->getParent();

	  // Vectorize in current basic block only.
	  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
	  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
	  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
	    return false;

	  // Try to vectorize the two operands directly.
	  if (tryToVectorizePair(Op0, Op1, R))
	    return true;

	  // Either of these may be null when the operand is not a binary operator.
	  // NOTE(review): a null A/B can be forwarded to tryToVectorizePair below;
	  // this relies on that helper rejecting null operands - confirm.
	  auto *A = dyn_cast<BinaryOperator>(Op0);
	  auto *B = dyn_cast<BinaryOperator>(Op1);
	  // Try to skip B.
	  if (B && B->hasOneUse()) {
	    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
	    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
	    if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
	      return true;
	    if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
	      return true;
	  }

	  // Try to skip A.
	  if (A && A->hasOneUse()) {
	    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
	    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
	    if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
	      return true;
	    if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
	      return true;
	  }
	  return false;
	}

	/// \brief Generate a shuffle mask to be used in a reduction tree.
	///
	/// The mask has \p VecLen lanes. The first \p NumEltsToRdx lanes are filled
	/// with constant i32 indices; every remaining lane stays undef.
	///
	/// \param VecLen The length of the vector to be reduced.
	/// \param NumEltsToRdx The number of elements that should be reduced in the
	///        vector.
	/// \param IsPairwise Whether the reduction is a pairwise or splitting
	///        reduction. A pairwise reduction will generate a mask of
	///        <0,2,...> or <1,3,..> while a splitting reduction will generate
	///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
	/// \param IsLeft True will generate a mask of even elements, odd otherwise.
	///        Must be false for a splitting (non-pairwise) reduction.
	/// \param Builder Used only to create the i32 type and the index constants.
	/// \returns The mask as a constant vector.
	static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
	                                   bool IsPairwise, bool IsLeft,
	                                   IRBuilder<> &Builder) {
	  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

	  // Start with an all-undef mask and overwrite the leading lanes below.
	  SmallVector<Constant *, 32> ShuffleMask(
	      VecLen, UndefValue::get(Builder.getInt32Ty()));

	  if (IsPairwise)
	    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
	    for (unsigned i = 0; i != NumEltsToRdx; ++i)
	      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
	  else
	    // Move the upper half of the vector to the lower half.
	    for (unsigned i = 0; i != NumEltsToRdx; ++i)
	      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

	  return ConstantVector::get(ShuffleMask);
	}

	namespace {

	/// Model horizontal reductions.
	///
	/// A horizontal reduction is a tree of reduction operations (currently add,
	/// fadd and min/max) that has operations that can be put into a vector as its
	/// leaf. For example, this tree:
	///
	/// mul mul mul mul
	///  \  /    \  /
	///   +       +
	///    \     /
	///       +
	/// This tree has "mul" as its reduced values and "+" as its reduction
	/// operations. A reduction might be feeding into a store or a binary operation
	/// feeding a phi.
	///    ...
	///    \  /
	///     +
	///     |
	///  phi +=
	///
	///  Or:
	///    ...
	///    \  /
	///     +
	///     |
	///   *p =
	///
	class HorizontalReduction {
	/// A flat list of instructions that take part in the reduction.
	using ReductionOpsType = SmallVector<Value *, 16>;
	/// One list per instruction making up a reduction step (a single list for
	/// arithmetic reductions, two lists - compares and selects - for min/max).
	using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
	/// The reduction operations collected by matchAssociativeReduction.
	ReductionOpsListType ReductionOps;
	/// The leaf values that are being reduced.
	SmallVector<Value *, 32> ReducedVals;
	// Use map vector to make stable output.
	/// Maps a reduction operation to its extra (non-reduced) argument. A null
	/// mapped value marks the whole operation as an extra argument itself.
	MapVector<Instruction *, Value *> ExtraArgs;

	/// Kind of the reduction data.
	enum ReductionKind {
	RK_None, ///< Not a reduction.
	RK_Arithmetic, ///< Binary reduction data.
	RK_Min, ///< Minimum reduction data.
	RK_UMin, ///< Unsigned minimum reduction data.
	RK_Max, ///< Maximum reduction data.
	RK_UMax, ///< Unsigned maximum reduction data.
	};

	/// Contains info about operation, like its opcode, left and right operands.
	class OperationData {
	/// Opcode of the instruction.
	unsigned Opcode = 0;

	/// Left operand of the reduction operation.
	Value *LHS = nullptr;

	/// Right operand of the reduction operation.
	Value *RHS = nullptr;

	/// Kind of the reduction operation.
	ReductionKind Kind = RK_None;

	/// True if float point min/max reduction has no NaNs.
	bool NoNaN = false;

	/// Checks if the reduction operation can be vectorized.
	bool isVectorizable() const {
	return LHS && RHS &&
	// We currently only support adds && min/max reductions.
	((Kind == RK_Arithmetic &&
	(Opcode == Instruction::Add \|\| Opcode == Instruction::FAdd)) \|\|
	((Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) &&
	(Kind == RK_Min \|\| Kind == RK_Max)) \|\|
	(Opcode == Instruction::ICmp &&
	(Kind == RK_UMin \|\| Kind == RK_UMax)));
	}

	/// Creates reduction operation with the current opcode.
	Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
	assert(isVectorizable() &&
	"Expected add\|fadd or min/max reduction operation.");
	Value *Cmp;
	switch (Kind) {
	case RK_Arithmetic:
	return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
	Name);
	case RK_Min:
	Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
	: Builder.CreateFCmpOLT(LHS, RHS);
	break;
	case RK_Max:
	Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
	: Builder.CreateFCmpOGT(LHS, RHS);
	break;
	case RK_UMin:
	assert(Opcode == Instruction::ICmp && "Expected integer types.");
	Cmp = Builder.CreateICmpULT(LHS, RHS);
	break;
	case RK_UMax:
	assert(Opcode == Instruction::ICmp && "Expected integer types.");
	Cmp = Builder.CreateICmpUGT(LHS, RHS);
	break;
	case RK_None:
	llvm_unreachable("Unknown reduction operation.");
	}
	return Builder.CreateSelect(Cmp, LHS, RHS, Name);
	}

	public:
	explicit OperationData() = default;

	/// Construction for reduced values. They are identified by opcode only and
	/// don't have associated LHS/RHS values.
	explicit OperationData(Value *V) {
	if (auto *I = dyn_cast<Instruction>(V))
	Opcode = I->getOpcode();
	}

	/// Constructor for reduction operations with opcode and its left and
	/// right operands.
	OperationData(unsigned Opcode, Value LHS, Value RHS, ReductionKind Kind,
	bool NoNaN = false)
	: Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
	assert(Kind != RK_None && "One of the reduction operations is expected.");
	}

	explicit operator bool() const { return Opcode; }

	/// Get the index of the first operand.
	unsigned getFirstOperandIndex() const {
	assert(!!*this && "The opcode is not set.");
	switch (Kind) {
	case RK_Min:
	case RK_UMin:
	case RK_Max:
	case RK_UMax:
	return 1;
	case RK_Arithmetic:
	case RK_None:
	break;
	}
	return 0;
	}

	/// Total number of operands in the reduction operation.
	unsigned getNumberOfOperands() const {
	assert(Kind != RK_None && !!*this && LHS && RHS &&
	"Expected reduction operation.");
	switch (Kind) {
	case RK_Arithmetic:
	return 2;
	case RK_Min:
	case RK_UMin:
	case RK_Max:
	case RK_UMax:
	return 3;
	case RK_None:
	break;
	}
	llvm_unreachable("Reduction kind is not set");
	}

	/// Checks if the operation has the same parent as \p P.
	bool hasSameParent(Instruction I, Value P, bool IsRedOp) const {
	assert(Kind != RK_None && !!*this && LHS && RHS &&
	"Expected reduction operation.");
	if (!IsRedOp)
	return I->getParent() == P;
	switch (Kind) {
	case RK_Arithmetic:
	// Arithmetic reduction operation must be used once only.
	return I->getParent() == P;
	case RK_Min:
	case RK_UMin:
	case RK_Max:
	case RK_UMax: {
	// SelectInst must be used twice while the condition op must have single
	// use only.
	auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
	return I->getParent() == P && Cmp && Cmp->getParent() == P;
	}
	case RK_None:
	break;
	}
	llvm_unreachable("Reduction kind is not set");
	}
	/// Expected number of uses for reduction operations/reduced values.
	bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
	assert(Kind != RK_None && !!*this && LHS && RHS &&
	"Expected reduction operation.");
	switch (Kind) {
	case RK_Arithmetic:
	return I->hasOneUse();
	case RK_Min:
	case RK_UMin:
	case RK_Max:
	case RK_UMax:
	return I->hasNUses(2) &&
	(!IsReductionOp \|\|
	cast<SelectInst>(I)->getCondition()->hasOneUse());
	case RK_None:
	break;
	}
	llvm_unreachable("Reduction kind is not set");
	}

	/// Initializes the list of reduction operations.
	void initReductionOps(ReductionOpsListType &ReductionOps) {
	assert(Kind != RK_None && !!*this && LHS && RHS &&
	"Expected reduction operation.");
	switch (Kind) {
	case RK_Arithmetic:
	ReductionOps.assign(1, ReductionOpsType());
	break;
	case RK_Min:
	case RK_UMin:
	case RK_Max:
	case RK_UMax:
	ReductionOps.assign(2, ReductionOpsType());
	break;
	case RK_None:
	llvm_unreachable("Reduction kind is not set");
	}
	}
	/// Add all reduction operations for the reduction instruction \p I.
	void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
	assert(Kind != RK_None && !!*this && LHS && RHS &&
	"Expected reduction operation.");
	switch (Kind) {
	case RK_Arithmetic:
	ReductionOps[0].emplace_back(I);
	break;
	case RK_Min:
	case RK_UMin:
	case RK_Max:
	case RK_UMax:
	ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
	ReductionOps[1].emplace_back(I);
	break;
	case RK_None:
	llvm_unreachable("Reduction kind is not set");
	}
	}

	/// Checks if instruction is associative and can be vectorized.
	bool isAssociative(Instruction *I) const {
	assert(Kind != RK_None && *this && LHS && RHS &&
	"Expected reduction operation.");
	switch (Kind) {
	case RK_Arithmetic:
	return I->isAssociative();
	case RK_Min:
	case RK_Max:
	return Opcode == Instruction::ICmp \|\|
	cast<Instruction>(I->getOperand(0))->isFast();
	case RK_UMin:
	case RK_UMax:
	assert(Opcode == Instruction::ICmp &&
	"Only integer compare operation is expected.");
	return true;
	case RK_None:
	break;
	}
	llvm_unreachable("Reduction kind is not set");
	}

	/// Checks if the reduction operation can be vectorized.
	bool isVectorizable(Instruction *I) const {
	return isVectorizable() && isAssociative(I);
	}

	/// Checks if two operation data are both a reduction op or both a reduced
	/// value.
	bool operator==(const OperationData &OD) {
	assert(((Kind != OD.Kind) \|\| ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
	"One of the comparing operations is incorrect.");
	return this == &OD \|\| (Kind == OD.Kind && Opcode == OD.Opcode);
	}
	bool operator!=(const OperationData &OD) { return !(*this == OD); }
	void clear() {
	Opcode = 0;
	LHS = nullptr;
	RHS = nullptr;
	Kind = RK_None;
	NoNaN = false;
	}

	/// Get the opcode of the reduction operation.
	unsigned getOpcode() const {
	assert(isVectorizable() && "Expected vectorizable operation.");
	return Opcode;
	}

	/// Get kind of reduction data.
	ReductionKind getKind() const { return Kind; }
	Value *getLHS() const { return LHS; }
	Value *getRHS() const { return RHS; }
	Type *getConditionType() const {
	switch (Kind) {
	case RK_Arithmetic:
	return nullptr;
	case RK_Min:
	case RK_Max:
	case RK_UMin:
	case RK_UMax:
	return CmpInst::makeCmpResultType(LHS->getType());
	case RK_None:
	break;
	}
	llvm_unreachable("Reduction kind is not set");
	}

	/// Creates reduction operation with the current opcode with the IR flags
	/// from \p ReductionOps.
	Value *createOp(IRBuilder<> &Builder, const Twine &Name,
	const ReductionOpsListType &ReductionOps) const {
	assert(isVectorizable() &&
	"Expected add\|fadd or min/max reduction operation.");
	auto *Op = createOp(Builder, Name);
	switch (Kind) {
	case RK_Arithmetic:
	propagateIRFlags(Op, ReductionOps[0]);
	return Op;
	case RK_Min:
	case RK_Max:
	case RK_UMin:
	case RK_UMax:
	if (auto *SI = dyn_cast<SelectInst>(Op))
	propagateIRFlags(SI->getCondition(), ReductionOps[0]);
	propagateIRFlags(Op, ReductionOps[1]);
	return Op;
	case RK_None:
	break;
	}
	llvm_unreachable("Unknown reduction operation.");
	}
	/// Creates reduction operation with the current opcode with the IR flags
	/// from \p I.
	Value *createOp(IRBuilder<> &Builder, const Twine &Name,
	Instruction *I) const {
	assert(isVectorizable() &&
	"Expected add\|fadd or min/max reduction operation.");
	auto *Op = createOp(Builder, Name);
	switch (Kind) {
	case RK_Arithmetic:
	propagateIRFlags(Op, I);
	return Op;
	case RK_Min:
	case RK_Max:
	case RK_UMin:
	case RK_UMax:
	if (auto *SI = dyn_cast<SelectInst>(Op)) {
	propagateIRFlags(SI->getCondition(),
	cast<SelectInst>(I)->getCondition());
	}
	propagateIRFlags(Op, I);
	return Op;
	case RK_None:
	break;
	}
	llvm_unreachable("Unknown reduction operation.");
	}

	TargetTransformInfo::ReductionFlags getFlags() const {
	TargetTransformInfo::ReductionFlags Flags;
	Flags.NoNaN = NoNaN;
	switch (Kind) {
	case RK_Arithmetic:
	break;
	case RK_Min:
	Flags.IsSigned = Opcode == Instruction::ICmp;
	Flags.IsMaxOp = false;
	break;
	case RK_Max:
	Flags.IsSigned = Opcode == Instruction::ICmp;
	Flags.IsMaxOp = true;
	break;
	case RK_UMin:
	Flags.IsSigned = false;
	Flags.IsMaxOp = false;
	break;
	case RK_UMax:
	Flags.IsSigned = false;
	Flags.IsMaxOp = true;
	break;
	case RK_None:
	llvm_unreachable("Reduction kind is not set");
	}
	return Flags;
	}
	};

	Instruction *ReductionRoot = nullptr;

	/// The operation data of the reduction operation.
	OperationData ReductionData;

	/// The operation data of the values we perform a reduction on.
	OperationData ReducedValueData;

	/// Should we model this reduction as a pairwise reduction tree or a tree that
	/// splits the vector in halves and adds those halves.
	bool IsPairwiseReduction = false;

	/// Registers \p ExtraArg for the operation held in \p ParentStackElem.
	///
	/// The first extra argument seen for an operation is simply remembered. If
	/// the operation already has an entry, the whole operation is treated as an
	/// extra argument instead: its entry is nulled and its operand cursor is
	/// moved past the last operand so that no further operands are analyzed.
	void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
	                  Value *ExtraArg) {
	  Instruction *Parent = ParentStackElem.first;
	  if (!ExtraArgs.count(Parent)) {
	    // Something like:
	    //   Parent += ... + ExtraArg + ...
	    ExtraArgs[Parent] = ExtraArg;
	    return;
	  }
	  // Something like:
	  //   Parent = ExtraArgs[Parent] + ExtraArg.
	  // Parent as a whole becomes the extra value, so skip the analysis of its
	  // remaining operands.
	  ExtraArgs[Parent] = nullptr;
	  ParentStackElem.second = Parent->getNumOperands();
	}

	/// Classifies \p V as a reduction operation or a reduced value.
	///
	/// \returns an empty OperationData for a null \p V; arithmetic reduction data
	/// for a binary operator; min/max reduction data for a select that matches
	/// one of the min/max patterns; otherwise opcode-only data for \p V.
	static OperationData getOperationData(Value *V) {
	  if (!V)
	    return OperationData();

	  Value *LHS;
	  Value *RHS;
	  if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
	    return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
	                         RK_Arithmetic);
	  }
	  if (auto *Select = dyn_cast<SelectInst>(V)) {
	    // Look for a min/max pattern.
	    if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
	      return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
	    } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
	      return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
	    } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
	               m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
	      // The NoNaN flag is taken from the compare feeding the select.
	      return OperationData(
	          Instruction::FCmp, LHS, RHS, RK_Min,
	          cast<Instruction>(Select->getCondition())->hasNoNaNs());
	    } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
	      return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
	    } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
	      return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
	    } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
	               m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
	      return OperationData(
	          Instruction::FCmp, LHS, RHS, RK_Max,
	          cast<Instruction>(Select->getCondition())->hasNoNaNs());
	    }
	  }
	  return OperationData(V);
	}

	public:
	HorizontalReduction() = default;

	/// \brief Try to find a reduction tree.
	bool matchAssociativeReduction(PHINode Phi, Instruction B) {
	assert((!Phi \|\| is_contained(Phi->operands(), B)) &&
	"Thi phi needs to use the binary operator");

	ReductionData = getOperationData(B);

	// We could have a initial reductions that is not an add.
	// r *= v1 + v2 + v3 + v4
	// In such a case start looking for a tree rooted in the first '+'.
	if (Phi) {
	if (ReductionData.getLHS() == Phi) {
	Phi = nullptr;
	B = dyn_cast<Instruction>(ReductionData.getRHS());
	ReductionData = getOperationData(B);
	} else if (ReductionData.getRHS() == Phi) {
	Phi = nullptr;
	B = dyn_cast<Instruction>(ReductionData.getLHS());
	ReductionData = getOperationData(B);
	}
	}

	if (!ReductionData.isVectorizable(B))
	return false;

	Type *Ty = B->getType();
	if (!isValidElementType(Ty))
	return false;

	ReducedValueData.clear();
	ReductionRoot = B;

	// Post order traverse the reduction tree starting at B. We only handle true
	// trees containing only binary operators.
	SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
	Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
	ReductionData.initReductionOps(ReductionOps);
	while (!Stack.empty()) {
	Instruction *TreeN = Stack.back().first;
	unsigned EdgeToVist = Stack.back().second++;
	OperationData OpData = getOperationData(TreeN);
	bool IsReducedValue = OpData != ReductionData;

	// Postorder vist.
	if (IsReducedValue \|\| EdgeToVist == OpData.getNumberOfOperands()) {
	if (IsReducedValue)
	ReducedVals.push_back(TreeN);
	else {
	auto I = ExtraArgs.find(TreeN);
	if (I != ExtraArgs.end() && !I->second) {
	// Check if TreeN is an extra argument of its parent operation.
	if (Stack.size() <= 1) {
	// TreeN can't be an extra argument as it is a root reduction
	// operation.
	return false;
	}
	// Yes, TreeN is an extra argument, do not add it to a list of
	// reduction operations.
	// Stack[Stack.size() - 2] always points to the parent operation.
	markExtraArg(Stack[Stack.size() - 2], TreeN);
	ExtraArgs.erase(TreeN);
	} else
	ReductionData.addReductionOps(TreeN, ReductionOps);
	}
	// Retract.
	Stack.pop_back();
	continue;
	}

	// Visit left or right.
	Value *NextV = TreeN->getOperand(EdgeToVist);
	if (NextV != Phi) {
	auto *I = dyn_cast<Instruction>(NextV);
	OpData = getOperationData(I);
	// Continue analysis if the next operand is a reduction operation or
	// (possibly) a reduced value. If the reduced value opcode is not set,
	// the first met operation != reduction operation is considered as the
	// reduced value class.
	if (I && (!ReducedValueData \|\| OpData == ReducedValueData \|\|
	OpData == ReductionData)) {
	const bool IsReductionOperation = OpData == ReductionData;
	// Only handle trees in the current basic block.
	if (!ReductionData.hasSameParent(I, B->getParent(),
	IsReductionOperation)) {
	// I is an extra argument for TreeN (its parent operation).
	markExtraArg(Stack.back(), I);
	continue;
	}

	// Each tree node needs to have minimal number of users except for the
	// ultimate reduction.
	if (!ReductionData.hasRequiredNumberOfUses(I,
	OpData == ReductionData) &&
	I != B) {
	// I is an extra argument for TreeN (its parent operation).
	markExtraArg(Stack.back(), I);
	continue;
	}

	if (IsReductionOperation) {
	// We need to be able to reassociate the reduction operations.
	if (!OpData.isAssociative(I)) {
	// I is an extra argument for TreeN (its parent operation).
	markExtraArg(Stack.back(), I);
	continue;
	}
	} else if (ReducedValueData &&
	ReducedValueData != OpData) {
	// Make sure that the opcodes of the operations that we are going to
	// reduce match.
	// I is an extra argument for TreeN (its parent operation).
	markExtraArg(Stack.back(), I);
	continue;
	} else if (!ReducedValueData)
	ReducedValueData = OpData;

	Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
	continue;
	}
	}
	// NextV is an extra argument for TreeN (its parent operation).
	markExtraArg(Stack.back(), NextV);
	}
	return true;
	}

	/// \brief Attempt to vectorize the tree found by
	/// matchAssociativeReduction.
	///
	/// Reduced values are consumed in power-of-two sized groups. Each group is
	/// vectorized and reduced if the cost model accepts it; the partial results,
	/// any leftover reduced values and the extra arguments are then chained with
	/// scalar reduction operations, and all uses of ReductionRoot are replaced
	/// with the final value.
	///
	/// \returns true if at least one group was vectorized.
	bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
	  if (ReducedVals.empty())
	    return false;

	  // If there is a sufficient number of reduction values, reduce
	  // to a nearby power-of-2. Can safely generate oversized
	  // vectors and rely on the backend to split them to legal sizes.
	  unsigned NumReducedVals = ReducedVals.size();
	  if (NumReducedVals < 4)
	    return false;

	  unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);

	  Value *VectorizedTree = nullptr;
	  IRBuilder<> Builder(ReductionRoot);
	  FastMathFlags Unsafe;
	  Unsafe.setFast();
	  Builder.setFastMathFlags(Unsafe);
	  unsigned i = 0;

	  BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
	  // The same extra argument may be used several time, so log each attempt
	  // to use it.
	  for (auto &Pair : ExtraArgs)
	    ExternallyUsedValues[Pair.second].push_back(Pair.first);
	  // The reduction operations themselves are passed to buildTree as the list
	  // of values to ignore.
	  SmallVector<Value *, 16> IgnoreList;
	  for (auto &Ops : ReductionOps)
	    IgnoreList.append(Ops.begin(), Ops.end());
	  while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
	    auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
	    V.buildTree(VL, ExternallyUsedValues, IgnoreList);
	    if (V.shouldReorder()) {
	      SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
	      V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
	    }
	    if (V.isTreeTinyAndNotFullyVectorizable())
	      break;

	    V.computeMinimumValueSizes();

	    // Estimate cost.
	    int Cost =
	        V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
	    if (Cost >= -SLPCostThreshold) {
	      V.getORE()->emit([&]() {
	        return OptimizationRemarkMissed(
	                   SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
	               << "Vectorizing horizontal reduction is possible "
	               << "but not beneficial with cost "
	               << ore::NV("Cost", Cost) << " and threshold "
	               << ore::NV("Threshold", -SLPCostThreshold);
	      });
	      break;
	    }

	    DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
	                 << ". (HorRdx)\n");
	    V.getORE()->emit([&]() {
	      return OptimizationRemark(
	                 SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
	             << "Vectorized horizontal reduction with cost "
	             << ore::NV("Cost", Cost) << " and with tree size "
	             << ore::NV("TreeSize", V.getTreeSize());
	    });

	    // Vectorize a tree.
	    // Capture the debug location before vectorizeTree runs.
	    DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
	    Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);

	    // Emit a reduction.
	    Value *ReducedSubTree =
	        emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
	    if (VectorizedTree) {
	      // Combine this group's result with the results of earlier groups.
	      Builder.SetCurrentDebugLocation(Loc);
	      OperationData VectReductionData(ReductionData.getOpcode(),
	                                      VectorizedTree, ReducedSubTree,
	                                      ReductionData.getKind());
	      VectorizedTree =
	          VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
	    } else
	      VectorizedTree = ReducedSubTree;
	    i += ReduxWidth;
	    ReduxWidth = PowerOf2Floor(NumReducedVals - i);
	  }

	  if (VectorizedTree) {
	    // Finish the reduction.
	    for (; i < NumReducedVals; ++i) {
	      auto *I = cast<Instruction>(ReducedVals[i]);
	      Builder.SetCurrentDebugLocation(I->getDebugLoc());
	      OperationData VectReductionData(ReductionData.getOpcode(),
	                                      VectorizedTree, I,
	                                      ReductionData.getKind());
	      VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
	    }
	    for (auto &Pair : ExternallyUsedValues) {
	      assert(!Pair.second.empty() &&
	             "At least one DebugLoc must be inserted");
	      // Add each externally used value to the final reduction.
	      for (auto *I : Pair.second) {
	        Builder.SetCurrentDebugLocation(I->getDebugLoc());
	        OperationData VectReductionData(ReductionData.getOpcode(),
	                                        VectorizedTree, Pair.first,
	                                        ReductionData.getKind());
	        VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
	      }
	    }
	    // Update users.
	    ReductionRoot->replaceAllUsesWith(VectorizedTree);
	  }
	  return VectorizedTree != nullptr;
	}

	/// Number of reduced values collected so far.
	unsigned numReductionValues() const {
	  const unsigned Count = ReducedVals.size();
	  return Count;
	}

	private:
	/// \brief Calculate the cost of a reduction.
	int getReductionCost(TargetTransformInfo TTI, Value FirstReducedVal,
	unsigned ReduxWidth) {
	Type *ScalarTy = FirstReducedVal->getType();
	Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

	int PairwiseRdxCost;
	int SplittingRdxCost;
	switch (ReductionData.getKind()) {
	case RK_Arithmetic:
	PairwiseRdxCost =
	TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
	/IsPairwiseForm=/true);
	SplittingRdxCost =
	TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
	/IsPairwiseForm=/false);
	break;
	case RK_Min:
	case RK_Max:
	case RK_UMin:
	case RK_UMax: {
	Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
	bool IsUnsigned = ReductionData.getKind() == RK_UMin \|\|
	ReductionData.getKind() == RK_UMax;
	PairwiseRdxCost =
	TTI->getMinMaxReductionCost(VecTy, VecCondTy,
	/IsPairwiseForm=/true, IsUnsigned);
	SplittingRdxCost =
	TTI->getMinMaxReductionCost(VecTy, VecCondTy,
	/IsPairwiseForm=/false, IsUnsigned);
	break;
	}
	case RK_None:
	llvm_unreachable("Expected arithmetic or min/max reduction operation");
	}

	IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
	int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

	int ScalarReduxCost;
	switch (ReductionData.getKind()) {
	case RK_Arithmetic:
	ScalarReduxCost =
	TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
	break;
	case RK_Min:
	case RK_Max:
	case RK_UMin:
	case RK_UMax:
	ScalarReduxCost =
	TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
	TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
	CmpInst::makeCmpResultType(ScalarTy));
	break;
	case RK_None:
	llvm_unreachable("Expected arithmetic or min/max reduction operation");
	}
	ScalarReduxCost *= (ReduxWidth - 1);

	DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
	<< " for reduction that starts with " << *FirstReducedVal
	<< " (It is a "
	<< (IsPairwiseReduction ? "pairwise" : "splitting")
	<< " reduction)\n");

	return VecReduxCost - ScalarReduxCost;
	}

	/// \brief Emit a horizontal reduction of the vectorized value.
	///
	/// When IsPairwiseReduction is false the work is delegated to
	/// createSimpleTargetReduction. Otherwise a pairwise shuffle tree is emitted
	/// and element 0 of the final vector is extracted.
	///
	/// \param VectorizedValue The vector to reduce; must not be null.
	/// \param Builder Builder used to emit the reduction.
	/// \param ReduxWidth Number of lanes being reduced; must be a power of two.
	/// \param TTI Forwarded to createSimpleTargetReduction.
	/// \returns The value produced by the emitted reduction.
	Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
	                     unsigned ReduxWidth, const TargetTransformInfo *TTI) {
	  assert(VectorizedValue && "Need to have a vectorized tree node");
	  assert(isPowerOf2_32(ReduxWidth) &&
	         "We only handle power-of-two reductions for now");

	  if (!IsPairwiseReduction)
	    return createSimpleTargetReduction(
	        Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
	        ReductionData.getFlags(), ReductionOps.back());

	  Value *TmpVec = VectorizedValue;
	  // Halve the number of live lanes on every step.
	  for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
	    Value *LeftMask =
	        createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
	    Value *RightMask =
	        createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

	    Value *LeftShuf = Builder.CreateShuffleVector(
	        TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
	    Value *RightShuf = Builder.CreateShuffleVector(
	        TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
	        "rdx.shuf.r");
	    OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
	                                    RightShuf, ReductionData.getKind());
	    TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
	  }

	  // The result is in the first element of the vector.
	  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
	}
	};

	} // end anonymous namespace

	/// \brief Recognize construction of vectors like
	/// %ra = insertelement <4 x float> undef, float %s0, i32 0
	/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
	/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
	/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
	/// starting from the last insertelement instruction.
	///
	/// The inserted scalars are appended to \p BuildVectorOpds while walking the
	/// chain backwards and are reversed before a successful return. On failure
	/// \p BuildVectorOpds keeps whatever was appended so far, unreversed.
	///
	/// \returns true if the chain ends in an undef vector and every earlier
	/// insertelement reached through operand 0 has exactly one use.
	static bool findBuildVector(InsertElementInst *LastInsertElem,
	- SmallVectorImpl<Value *> &BuildVector,
	SmallVectorImpl<Value *> &BuildVectorOpds) {
	Value *V = nullptr;
	do {
	- BuildVector.push_back(LastInsertElem);
	BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
	V = LastInsertElem->getOperand(0);
	if (isa<UndefValue>(V))
	break;
	LastInsertElem = dyn_cast<InsertElementInst>(V);
	if (!LastInsertElem || !LastInsertElem->hasOneUse())
	return false;
	} while (true);
	- std::reverse(BuildVector.begin(), BuildVector.end());
	std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
	return true;
	}

	/// \brief Like findBuildVector, but looks for construction of aggregate.
	///
	/// Walks a chain of insertvalue instructions backwards from \p IV, appending
	/// each inserted value to \p BuildVectorOpds; the list is reversed before a
	/// successful return. On failure it keeps whatever was appended so far,
	/// unreversed.
	///
	/// \return true if the chain ends in an undef aggregate and every earlier
	/// insertvalue reached through the aggregate operand has exactly one use.
	static bool findBuildAggregate(InsertValueInst *IV,
	- SmallVectorImpl<Value *> &BuildVector,
	SmallVectorImpl<Value *> &BuildVectorOpds) {
	Value *V;
	do {
	- BuildVector.push_back(IV);
	BuildVectorOpds.push_back(IV->getInsertedValueOperand());
	V = IV->getAggregateOperand();
	if (isa<UndefValue>(V))
	break;
	IV = dyn_cast<InsertValueInst>(V);
	if (!IV || !IV->hasOneUse())
	return false;
	} while (true);
	- std::reverse(BuildVector.begin(), BuildVector.end());
	std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
	return true;
	}

	static bool PhiTypeSorterFunc(Value V, Value V2) {
	return V->getType() < V2->getType();
	}

	/// \brief Try and get a reduction value from a phi node.
	///
	/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
	/// if they come from either \p ParentBB or a containing loop latch.
	///
	/// Only incoming values 0 and 1 of \p P are inspected.
	/// NOTE(review): this presumably expects callers to pass two-input phis only
	/// - confirm against the call sites.
	///
	/// \returns A candidate reduction value if possible, or \code nullptr \endcode
	/// if not possible.
	static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
	                                BasicBlock *ParentBB, LoopInfo *LI) {
	  // There are situations where the reduction value is not dominated by the
	  // reduction phi. Vectorizing such cases has been reported to cause
	  // miscompiles. See PR25787.
	  auto DominatedReduxValue = [&](Value *R) {
	    auto *RdxInst = dyn_cast<Instruction>(R);
	    return RdxInst && DT->dominates(P->getParent(), RdxInst->getParent());
	  };

	  Value *Rdx = nullptr;

	  // Return the incoming value if it comes from the same BB as the phi node.
	  if (P->getIncomingBlock(0) == ParentBB) {
	    Rdx = P->getIncomingValue(0);
	  } else if (P->getIncomingBlock(1) == ParentBB) {
	    Rdx = P->getIncomingValue(1);
	  }

	  if (Rdx && DominatedReduxValue(Rdx))
	    return Rdx;

	  // Otherwise, check whether we have a loop latch to look at.
	  Loop *BBL = LI->getLoopFor(ParentBB);
	  if (!BBL)
	    return nullptr;
	  BasicBlock *BBLatch = BBL->getLoopLatch();
	  if (!BBLatch)
	    return nullptr;

	  // There is a loop latch, return the incoming value if it comes from
	  // that. This reduction pattern occasionally turns up.
	  if (P->getIncomingBlock(0) == BBLatch) {
	    Rdx = P->getIncomingValue(0);
	  } else if (P->getIncomingBlock(1) == BBLatch) {
	    Rdx = P->getIncomingValue(1);
	  }

	  if (Rdx && DominatedReduxValue(Rdx))
	    return Rdx;

	  return nullptr;
	}

	/// Attempt to reduce a horizontal reduction.
	/// If it is legal to match a horizontal reduction feeding the phi node \a P
	/// with reduction operators \a Root (or one of its operands) in a basic block
	/// \a BB, then check if it can be done. If horizontal reduction is not found
	/// and root instruction is a binary operation, vectorization of the operands is
	/// attempted.
	/// \returns true if a horizontal reduction was matched and reduced or operands
	/// of one of the binary instruction were vectorized.
	/// \returns false if a horizontal reduction was not matched (or not possible)
	/// or no vectorization of any binary operation feeding \a Root instruction was
	/// performed.
	static bool tryToVectorizeHorReductionOrInstOperands(
	PHINode P, Instruction Root, BasicBlock *BB, BoUpSLP &R,
	TargetTransformInfo *TTI,
	const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
	if (!ShouldVectorizeHor)
	return false;

	if (!Root)
	return false;

	if (Root->getParent() != BB \|\| isa<PHINode>(Root))
	return false;
	// Start analysis starting from Root instruction. If horizontal reduction is
	// found, try to vectorize it. If it is not a horizontal reduction or
	// vectorization is not possible or not effective, and currently analyzed
	// instruction is a binary operation, try to vectorize the operands, using
	// pre-order DFS traversal order. If the operands were not vectorized, repeat
	// the same procedure considering each operand as a possible root of the
	// horizontal reduction.
	// Interrupt the process if the Root instruction itself was vectorized or all
	// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
	SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
	SmallSet<Value *, 8> VisitedInstrs;
	bool Res = false;
	while (!Stack.empty()) {
	Value *V;
	unsigned Level;
	std::tie(V, Level) = Stack.pop_back_val();
	if (!V)
	continue;
	auto *Inst = dyn_cast<Instruction>(V);
	if (!Inst)
	continue;
	auto *BI = dyn_cast<BinaryOperator>(Inst);
	auto *SI = dyn_cast<SelectInst>(Inst);
	if (BI \|\| SI) {
	HorizontalReduction HorRdx;
	if (HorRdx.matchAssociativeReduction(P, Inst)) {
	if (HorRdx.tryToReduce(R, TTI)) {
	Res = true;
	// Set P to nullptr to avoid re-analysis of phi node in
	// matchAssociativeReduction function unless this is the root node.
	P = nullptr;
	continue;
	}
	}
	if (P && BI) {
	Inst = dyn_cast<Instruction>(BI->getOperand(0));
	if (Inst == P)
	Inst = dyn_cast<Instruction>(BI->getOperand(1));
	if (!Inst) {
	// Set P to nullptr to avoid re-analysis of phi node in
	// matchAssociativeReduction function unless this is the root node.
	P = nullptr;
	continue;
	}
	}
	}
	// Set P to nullptr to avoid re-analysis of phi node in
	// matchAssociativeReduction function unless this is the root node.
	P = nullptr;
	if (Vectorize(Inst, R)) {
	Res = true;
	continue;
	}

	// Try to vectorize operands.
	// Continue analysis for the instruction from the same basic block only to
	// save compile time.
	if (++Level < RecursionMaxDepth)
	for (auto *Op : Inst->operand_values())
	if (VisitedInstrs.insert(Op).second)
	if (auto *I = dyn_cast<Instruction>(Op))
	if (!isa<PHINode>(I) && I->getParent() == BB)
	Stack.emplace_back(Op, Level);
	}
	return Res;
	}

	bool SLPVectorizerPass::vectorizeRootInstruction(PHINode P, Value V,
	BasicBlock *BB, BoUpSLP &R,
	TargetTransformInfo *TTI) {
	if (!V)
	return false;
	auto *I = dyn_cast<Instruction>(V);
	if (!I)
	return false;

	if (!isa<BinaryOperator>(I))
	P = nullptr;
	// Try to match and vectorize a horizontal reduction.
	auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
	return tryToVectorize(I, R);
	};
	return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
	ExtraVectorization);
	}

	bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
	BasicBlock *BB, BoUpSLP &R) {
	const DataLayout &DL = BB->getModule()->getDataLayout();
	if (!R.canMapToVector(IVI->getType(), DL))
	return false;

	- SmallVector<Value *, 16> BuildVector;
	SmallVector<Value *, 16> BuildVectorOpds;
	- if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
	+ if (!findBuildAggregate(IVI, BuildVectorOpds))
	return false;

	DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
	// Aggregate value is unlikely to be processed in vector register, we need to
	// extract scalars into scalar registers, so NeedExtraction is set true.
	- return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
	+ return tryToVectorizeList(BuildVectorOpds, R);
	}

	bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
	BasicBlock *BB, BoUpSLP &R) {
	- SmallVector<Value *, 16> BuildVector;
	SmallVector<Value *, 16> BuildVectorOpds;
	- if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
	+ if (!findBuildVector(IEI, BuildVectorOpds))
	return false;

	// Vectorize starting with the build vector operands ignoring the BuildVector
	// instructions for the purpose of scheduling and user extraction.
	- return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
	+ return tryToVectorizeList(BuildVectorOpds, R);
	}

	bool SLPVectorizerPass::vectorizeCmpInst(CmpInst CI, BasicBlock BB,
	BoUpSLP &R) {
	if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
	return true;

	bool OpsChanged = false;
	for (int Idx = 0; Idx < 2; ++Idx) {
	OpsChanged \|=
	vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
	}
	return OpsChanged;
	}

	bool SLPVectorizerPass::vectorizeSimpleInstructions(
	SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
	bool OpsChanged = false;
	for (auto &VH : reverse(Instructions)) {
	auto *I = dyn_cast_or_null<Instruction>(VH);
	if (!I)
	continue;
	if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
	OpsChanged \|= vectorizeInsertValueInst(LastInsertValue, BB, R);
	else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
	OpsChanged \|= vectorizeInsertElementInst(LastInsertElem, BB, R);
	else if (auto *CI = dyn_cast<CmpInst>(I))
	OpsChanged \|= vectorizeCmpInst(CI, BB, R);
	}
	Instructions.clear();
	return OpsChanged;
	}

	bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
	bool Changed = false;
	SmallVector<Value *, 4> Incoming;
	SmallSet<Value *, 16> VisitedInstrs;

	bool HaveVectorizedPhiNodes = true;
	while (HaveVectorizedPhiNodes) {
	HaveVectorizedPhiNodes = false;

	// Collect the incoming values from the PHIs.
	Incoming.clear();
	for (Instruction &I : *BB) {
	PHINode *P = dyn_cast<PHINode>(&I);
	if (!P)
	break;

	if (!VisitedInstrs.count(P))
	Incoming.push_back(P);
	}

	// Sort by type.
	std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

	// Try to vectorize elements base on their type.
	for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
	E = Incoming.end();
	IncIt != E;) {

	// Look for the next elements with the same type.
	SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
	while (SameTypeIt != E &&
	(SameTypeIt)->getType() == (IncIt)->getType()) {
	VisitedInstrs.insert(*SameTypeIt);
	++SameTypeIt;
	}

	// Try to vectorize them.
	unsigned NumElts = (SameTypeIt - IncIt);
	DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
	// The order in which the phi nodes appear in the program does not matter.
	// So allow tryToVectorizeList to reorder them if it is beneficial. This
	// is done when there are exactly two elements since tryToVectorizeList
	// asserts that there are only two values when AllowReorder is true.
	bool AllowReorder = NumElts == 2;
	- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
	- None, AllowReorder)) {
	+ if (NumElts > 1 &&
	+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
	// Success start over because instructions might have been changed.
	HaveVectorizedPhiNodes = true;
	Changed = true;
	break;
	}

	// Start over at the next instruction of a different type (or the end).
	IncIt = SameTypeIt;
	}
	}

	VisitedInstrs.clear();

	SmallVector<WeakVH, 8> PostProcessInstructions;
	SmallDenseSet<Instruction *, 4> KeyNodes;
	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
	// We may go through BB multiple times so skip the one we have checked.
	if (!VisitedInstrs.insert(&*it).second) {
	if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
	vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
	// We would like to start over since some instructions are deleted
	// and the iterator may become invalid value.
	Changed = true;
	it = BB->begin();
	e = BB->end();
	}
	continue;
	}

	if (isa<DbgInfoIntrinsic>(it))
	continue;

	// Try to vectorize reductions that use PHINodes.
	if (PHINode *P = dyn_cast<PHINode>(it)) {
	// Check that the PHI is a reduction PHI.
	if (P->getNumIncomingValues() != 2)
	return Changed;

	// Try to match and vectorize a horizontal reduction.
	if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
	TTI)) {
	Changed = true;
	it = BB->begin();
	e = BB->end();
	continue;
	}
	continue;
	}

	// Ran into an instruction without users, like terminator, or function call
	// with ignored return value, store. Ignore unused instructions (basing on
	// instruction type, except for CallInst and InvokeInst).
	if (it->use_empty() && (it->getType()->isVoidTy() \|\| isa<CallInst>(it) \|\|
	isa<InvokeInst>(it))) {
	KeyNodes.insert(&*it);
	bool OpsChanged = false;
	if (ShouldStartVectorizeHorAtStore \|\| !isa<StoreInst>(it)) {
	for (auto *V : it->operand_values()) {
	// Try to match and vectorize a horizontal reduction.
	OpsChanged \|= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
	}
	}
	// Start vectorization of post-process list of instructions from the
	// top-tree instructions to try to vectorize as many instructions as
	// possible.
	OpsChanged \|= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
	if (OpsChanged) {
	// We would like to start over since some instructions are deleted
	// and the iterator may become invalid value.
	Changed = true;
	it = BB->begin();
	e = BB->end();
	continue;
	}
	}

	if (isa<InsertElementInst>(it) \|\| isa<CmpInst>(it) \|\|
	isa<InsertValueInst>(it))
	PostProcessInstructions.push_back(&*it);

	}

	return Changed;
	}

	bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
	auto Changed = false;
	for (auto &Entry : GEPs) {
	// If the getelementptr list has fewer than two elements, there's nothing
	// to do.
	if (Entry.second.size() < 2)
	continue;

	DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
	<< Entry.second.size() << ".\n");

	// We process the getelementptr list in chunks of 16 (like we do for
	// stores) to minimize compile-time.
	for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
	auto Len = std::min<unsigned>(BE - BI, 16);
	auto GEPList = makeArrayRef(&Entry.second[BI], Len);

	// Initialize a set a candidate getelementptrs. Note that we use a
	// SetVector here to preserve program order. If the index computations
	// are vectorizable and begin with loads, we want to minimize the chance
	// of having to reorder them later.
	SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

	// Some of the candidates may have already been vectorized after we
	// initially collected them. If so, the WeakTrackingVHs will have
	// nullified the
	// values, so remove them from the set of candidates.
	Candidates.remove(nullptr);

	// Remove from the set of candidates all pairs of getelementptrs with
	// constant differences. Such getelementptrs are likely not good
	// candidates for vectorization in a bottom-up phase since one can be
	// computed from the other. We also ensure all candidate getelementptr
	// indices are unique.
	for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
	auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
	if (!Candidates.count(GEPI))
	continue;
	auto *SCEVI = SE->getSCEV(GEPList[I]);
	for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
	auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
	auto *SCEVJ = SE->getSCEV(GEPList[J]);
	if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
	Candidates.remove(GEPList[I]);
	Candidates.remove(GEPList[J]);
	} else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
	Candidates.remove(GEPList[J]);
	}
	}
	}

	// We break out of the above computation as soon as we know there are
	// fewer than two candidates remaining.
	if (Candidates.size() < 2)
	continue;

	// Add the single, non-constant index of each candidate to the bundle. We
	// ensured the indices met these constraints when we originally collected
	// the getelementptrs.
	SmallVector<Value *, 16> Bundle(Candidates.size());
	auto BundleIndex = 0u;
	for (auto *V : Candidates) {
	auto *GEP = cast<GetElementPtrInst>(V);
	auto *GEPIdx = GEP->idx_begin()->get();
	assert(GEP->getNumIndices() == 1 \|\| !isa<Constant>(GEPIdx));
	Bundle[BundleIndex++] = GEPIdx;
	}

	// Try and vectorize the indices. We are currently only interested in
	// gather-like cases of the form:
	//
	// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
	//
	// where the loads of "a", the loads of "b", and the subtractions can be
	// performed in parallel. It's likely that detecting this pattern in a
	// bottom-up phase will be simpler and less costly than building a
	// full-blown top-down phase beginning at the consecutive loads.
	Changed \|= tryToVectorizeList(Bundle, R);
	}
	}
	return Changed;
	}

	bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
	bool Changed = false;
	// Attempt to sort and vectorize each of the store-groups.
	for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
	++it) {
	if (it->second.size() < 2)
	continue;

	DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
	<< it->second.size() << ".\n");

	// Process the stores in chunks of 16.
	// TODO: The limit of 16 inhibits greater vectorization factors.
	// For example, AVX2 supports v32i8. Increasing this limit, however,
	// may cause a significant compile-time increase.
	for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
	unsigned Len = std::min<unsigned>(CE - CI, 16);
	Changed \|= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
	}
	}
	return Changed;
	}

	char SLPVectorizer::ID = 0;

	static const char lv_name[] = "SLP Vectorizer";

	INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
	INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
	INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

	Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
	Index: vendor/llvm/dist-release_60/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir (revision 328362)
	@@ -0,0 +1,61 @@
	+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
	+# RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -run-pass=instruction-select -verify-machineinstrs -O0 %s -o - \| FileCheck %s
	+--- \|
	+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
	+
	+ @foo1 = common global [1073741824 x i32] zeroinitializer, align 4
	+ @foo2 = common global [1073741824 x i32] zeroinitializer, align 4
	+
	+ define i32 @gv_large() {
	+ entry:
	+ %retval = alloca i32, align 4
	+ store i32 0, i32* %retval, align 4
	+ %0 = load i32, i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0), align 4
	+ %1 = load i32, i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0), align 4
	+ %add = add nsw i32 %0, %1
	+ ret i32 %add
	+ }
	+
	+...
	+---
	+name: gv_large
	+legalized: true
	+regBankSelected: true
	+stack:
	+ - { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4,
	+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
	+ di-variable: '', di-expression: '', di-location: '' }
	+constants:
	+body: \|
	+ bb.1:
	+ ; CHECK-LABEL: name: gv_large
	+ ; CHECK: [[MOVZXi:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @foo1, 0
	+ ; CHECK: [[MOVKXi:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi]], target-flags(aarch64-g1, aarch64-nc) @foo1, 16
	+ ; CHECK: [[MOVKXi1:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi]], target-flags(aarch64-g2, aarch64-nc) @foo1, 32
	+ ; CHECK: [[MOVKXi2:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi1]], target-flags(aarch64-g3) @foo1, 48
	+ ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY [[MOVKXi2]]
	+ ; CHECK: [[MOVZXi1:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @foo2, 0
	+ ; CHECK: [[MOVKXi3:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi1]], target-flags(aarch64-g1, aarch64-nc) @foo2, 16
	+ ; CHECK: [[MOVKXi4:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi3]], target-flags(aarch64-g2, aarch64-nc) @foo2, 32
	+ ; CHECK: [[MOVKXi5:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi4]], target-flags(aarch64-g3) @foo2, 48
	+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[MOVKXi5]]
	+ ; CHECK: STRWui %wzr, %stack.0.retval, 0 :: (store 4 into %ir.retval)
	+ ; CHECK: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0)`)
	+ ; CHECK: [[LDRWui1:%[0-9]+]]:gpr32 = LDRWui [[COPY1]], 0 :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0)`)
	+ ; CHECK: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRWui]], [[LDRWui1]]
	+ ; CHECK: %w0 = COPY [[ADDWrr]]
	+ ; CHECK: RET_ReallyLR implicit %w0
	+ %1:gpr(s32) = G_CONSTANT i32 0
	+ %4:gpr(p0) = G_GLOBAL_VALUE @foo1
	+ %3:gpr(p0) = COPY %4(p0)
	+ %7:gpr(p0) = G_GLOBAL_VALUE @foo2
	+ %6:gpr(p0) = COPY %7(p0)
	+ %0:gpr(p0) = G_FRAME_INDEX %stack.0.retval
	+ G_STORE %1(s32), %0(p0) :: (store 4 into %ir.retval)
	+ %2:gpr(s32) = G_LOAD %3(p0) :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0)`)
	+ %5:gpr(s32) = G_LOAD %6(p0) :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0)`)
	+ %8:gpr(s32) = G_ADD %2, %5
	+ %w0 = COPY %8(s32)
	+ RET_ReallyLR implicit %w0
	+
	+...
	Index: vendor/llvm/dist-release_60/test/CodeGen/AArch64/atomic-ops-lse.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/AArch64/atomic-ops-lse.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/AArch64/atomic-ops-lse.ll (revision 328362)
	@@ -1,4887 +1,4918 @@
	; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s \| FileCheck %s
	; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s \| FileCheck %s --check-prefix=CHECK-REG
	; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s \| FileCheck %s

	; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
	; (i.e. reusing a register for status & data in store exclusive).
	; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], w[[NEW]], [x{{[0-9]+}}]
	; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], x[[NEW]], [x{{[0-9]+}}]

	@var8 = global i8 0
	@var16 = global i16 0
	@var32 = global i32 0
	@var64 = global i64 0

	define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i8:
	%old = atomicrmw add i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i16:
	%old = atomicrmw add i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32:
	%old = atomicrmw add i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64:
	%old = atomicrmw add i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_add_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_noret:
	atomicrmw add i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_add_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_noret:
	atomicrmw add i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i8:
	%old = atomicrmw or i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsetalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i16:
	%old = atomicrmw or i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsetalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32:
	%old = atomicrmw or i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64:
	%old = atomicrmw or i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_or_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_noret:
	atomicrmw or i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_or_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_noret:
	atomicrmw or i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i8:
	%old = atomicrmw xor i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldeoralb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i16:
	%old = atomicrmw xor i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldeoralh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32:
	%old = atomicrmw xor i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeoral w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64:
	%old = atomicrmw xor i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeoral x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xor_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_noret:
	atomicrmw xor i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeoral w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_xor_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_noret:
	atomicrmw xor i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeoral x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i8:
	%old = atomicrmw min i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsminalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i16:
	%old = atomicrmw min i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsminalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32:
	%old = atomicrmw min i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64:
	%old = atomicrmw min i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_min_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_noret:
	atomicrmw min i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_min_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_noret:
	atomicrmw min i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i8:
	%old = atomicrmw umin i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: lduminalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i16:
	%old = atomicrmw umin i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: lduminalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32:
	%old = atomicrmw umin i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64:
	%old = atomicrmw umin i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_umin_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_noret:
	atomicrmw umin i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_umin_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_noret:
	atomicrmw umin i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i8:
	%old = atomicrmw max i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsmaxalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i16:
	%old = atomicrmw max i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsmaxalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32:
	%old = atomicrmw max i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64:
	%old = atomicrmw max i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_max_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_noret:
	atomicrmw max i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_max_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_noret:
	atomicrmw max i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i8:
	%old = atomicrmw umax i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldumaxalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i16:
	%old = atomicrmw umax i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldumaxalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32:
	%old = atomicrmw umax i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64:
	%old = atomicrmw umax i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_umax_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_noret:
	atomicrmw umax i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_umax_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_noret:
	atomicrmw umax i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i8:
	%old = atomicrmw xchg i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: swpalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i16:
	%old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: swpalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32:
	%old = atomicrmw xchg i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64:
	%old = atomicrmw xchg i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xchg_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_noret:
	atomicrmw xchg i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	define void @test_atomic_load_xchg_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_noret:
	atomicrmw xchg i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i8:
	%pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
	%old = extractvalue { i8, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
	+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
	+; CHECK-NEXT: casab w0, w1, [x[[ADDR]]]
	+; CHECK-NEXT: ret

	-; CHECK: casab w[[NEW:[0-9]+]], w[[OLD:[0-9]+]], [x[[ADDR]]]
	+ ret i8 %old
	+}
	+
	+define i1 @test_atomic_cmpxchg_i8_1(i8 %wanted, i8 %new) nounwind {
	+; CHECK-LABEL: test_atomic_cmpxchg_i8_1:
	+ %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
	+ %success = extractvalue { i8, i1 } %pair, 1
	+
	; CHECK-NOT: dmb
	+; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	- ret i8 %old
	+; CHECK: casab w[[NEW:[0-9]+]], w1, [x[[ADDR]]]
	+; CHECK-NEXT: cmp w[[NEW]], w0, uxtb
	+; CHECK-NEXT: cset w0, eq
	+; CHECK-NEXT: ret
	+ ret i1 %success
	}

	define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i16:
	%pair = cmpxchg i16* @var16, i16 %wanted, i16 %new acquire acquire
	%old = extractvalue { i16, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
	+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
	+; CHECK-NEXT: casah w0, w1, [x[[ADDR]]]
	+; CHECK-NEXT: ret

	-; CHECK: casah w0, w1, [x[[ADDR]]]
	+ ret i16 %old
	+}
	+
	+define i1 @test_atomic_cmpxchg_i16_1(i16 %wanted, i16 %new) nounwind {
	+; CHECK-LABEL: test_atomic_cmpxchg_i16_1:
	+ %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new acquire acquire
	+ %success = extractvalue { i16, i1 } %pair, 1
	+
	; CHECK-NOT: dmb
	+; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	- ret i16 %old
	+; CHECK: casah w[[NEW:[0-9]+]], w1, [x[[ADDR]]]
	+; CHECK-NEXT: cmp w[[NEW]], w0, uxth
	+; CHECK-NEXT: cset w0, eq
	+; CHECK-NEXT: ret
	+
	+ ret i1 %success
	}

	define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i32:
	%pair = cmpxchg i32* @var32, i32 %wanted, i32 %new acquire acquire
	%old = extractvalue { i32, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: casa w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i64:
	%pair = cmpxchg i64* @var64, i64 %wanted, i64 %new acquire acquire
	%old = extractvalue { i64, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: casa x0, x1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i8:
	%old = atomicrmw sub i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddalb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i16:
	%old = atomicrmw sub i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddalh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32:
	%old = atomicrmw sub i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64:
	%old = atomicrmw sub i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_sub_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_noret:
	atomicrmw sub i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	define void @test_atomic_load_sub_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_noret:
	atomicrmw sub i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i8:
	%old = atomicrmw and i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldclralb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i8 %old
	}

	define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i16:
	%old = atomicrmw and i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldclralh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i16 %old
	}

	define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32:
	%old = atomicrmw and i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i32 %old
	}

	define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64:
	%old = atomicrmw and i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i64 %old
	}

	define void @test_atomic_load_and_i32_noret(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_noret:
	atomicrmw and i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_and_i64_noret(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_noret:
	atomicrmw and i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_add_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i8_acq_rel:
	%old = atomicrmw add i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_add_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i16_acq_rel:
	%old = atomicrmw add i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_add_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_acq_rel:
	%old = atomicrmw add i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_add_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_acq_rel:
	%old = atomicrmw add i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_add_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_noret_acq_rel:
	atomicrmw add i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_add_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_noret_acq_rel:
	atomicrmw add i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_add_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i8_acquire:
	%old = atomicrmw add i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_add_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i16_acquire:
	%old = atomicrmw add i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_add_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_acquire:
	%old = atomicrmw add i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldadda w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_add_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_acquire:
	%old = atomicrmw add i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldadda x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_add_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_noret_acquire:
	atomicrmw add i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldadda w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_add_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_noret_acquire:
	atomicrmw add i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldadda x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_add_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i8_monotonic:
	%old = atomicrmw add i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_add_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i16_monotonic:
	%old = atomicrmw add i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_add_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_monotonic:
	%old = atomicrmw add i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldadd w[[OLD:[0-9]+]], w[[NEW:[0-9,a-z]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_add_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_monotonic:
	%old = atomicrmw add i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldadd x[[OLD:[0-9]+]], x[[NEW:[0-9,a-z]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_add_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_noret_monotonic:
	atomicrmw add i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stadd w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_add_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_noret_monotonic:
	atomicrmw add i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stadd x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_add_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i8_release:
	%old = atomicrmw add i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_add_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i16_release:
	%old = atomicrmw add i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_add_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_release:
	%old = atomicrmw add i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_add_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_release:
	%old = atomicrmw add i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_add_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_noret_release:
	atomicrmw add i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: staddl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_add_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_noret_release:
	atomicrmw add i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: staddl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_add_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i8_seq_cst:
	%old = atomicrmw add i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_add_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i16_seq_cst:
	%old = atomicrmw add i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_add_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_seq_cst:
	%old = atomicrmw add i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_add_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_seq_cst:
	%old = atomicrmw add i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_add_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i32_noret_seq_cst:
	atomicrmw add i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_add_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_add_i64_noret_seq_cst:
	atomicrmw add i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_and_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i8_acq_rel:
	%old = atomicrmw and i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldclralb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i8 %old
	}

	define i16 @test_atomic_load_and_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i16_acq_rel:
	%old = atomicrmw and i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldclralh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i16 %old
	}

	define i32 @test_atomic_load_and_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_acq_rel:
	%old = atomicrmw and i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i32 %old
	}

	define i64 @test_atomic_load_and_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_acq_rel:
	%old = atomicrmw and i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i64 %old
	}

	define void @test_atomic_load_and_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_noret_acq_rel:
	atomicrmw and i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_and_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_noret_acq_rel:
	atomicrmw and i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_and_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i8_acquire:
	%old = atomicrmw and i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldclrab w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i8 %old
	}

	define i16 @test_atomic_load_and_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i16_acquire:
	%old = atomicrmw and i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldclrah w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i16 %old
	}

	define i32 @test_atomic_load_and_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_acquire:
	%old = atomicrmw and i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclra w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i32 %old
	}

	define i64 @test_atomic_load_and_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_acquire:
	%old = atomicrmw and i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclra x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i64 %old
	}

	define void @test_atomic_load_and_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_noret_acquire:
	atomicrmw and i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclra w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_and_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_noret_acquire:
	atomicrmw and i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclra x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_and_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i8_monotonic:
	%old = atomicrmw and i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldclrb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i8 %old
	}

	; Value-returning `atomicrmw and` on @var16 (i16, monotonic): expects an mvn whose result
	; register is the first operand of ldclrh at @var16's address, with no dmb before or after.
	define i16 @test_atomic_load_and_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i16_monotonic:
	%old = atomicrmw and i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldclrh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i16 %old
	}

	; Value-returning `atomicrmw and` on @var32 (i32, monotonic): expects an mvn whose result
	; register is the first operand of ldclr at @var32's address, with no dmb before or after.
	define i32 @test_atomic_load_and_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_monotonic:
	%old = atomicrmw and i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclr w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i32 %old
	}

	; Value-returning `atomicrmw and` on @var64 (i64, monotonic): expects an mvn whose result
	; register is the first operand of ldclr at @var64's address, with no dmb before or after.
	define i64 @test_atomic_load_and_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_monotonic:
	%old = atomicrmw and i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclr x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i64 %old
	}

	; Result-discarding `atomicrmw and` on @var32 (i32, monotonic): expects an mvn whose result
	; register is the value operand of stclr at @var32's address, with no dmb before or after.
	; The stclr pattern reuses the NOT capture (as the ldclr tests do) so the test fails if a
	; register other than the mvn result is stored.
	define void @test_atomic_load_and_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_noret_monotonic:
	atomicrmw and i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stclr w[[NOT]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw and` on @var64 (i64, monotonic): expects an mvn whose result
	; register is the value operand of stclr at @var64's address, with no dmb before or after.
	; The stclr pattern reuses the NOT capture (as the ldclr tests do) so the test fails if a
	; register other than the mvn result is stored.
	define void @test_atomic_load_and_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_noret_monotonic:
	atomicrmw and i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stclr x[[NOT]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw and` on @var8 (i8, release): expects an mvn whose result
	; register is the first operand of ldclrlb at @var8's address, with no dmb before or after.
	define i8 @test_atomic_load_and_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i8_release:
	%old = atomicrmw and i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldclrlb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i8 %old
	}

	; Value-returning `atomicrmw and` on @var16 (i16, release): expects an mvn whose result
	; register is the first operand of ldclrlh at @var16's address, with no dmb before or after.
	define i16 @test_atomic_load_and_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i16_release:
	%old = atomicrmw and i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldclrlh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i16 %old
	}

	; Value-returning `atomicrmw and` on @var32 (i32, release): expects an mvn whose result
	; register is the first operand of ldclrl at @var32's address, with no dmb before or after.
	define i32 @test_atomic_load_and_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_release:
	%old = atomicrmw and i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclrl w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i32 %old
	}

	; Value-returning `atomicrmw and` on @var64 (i64, release): expects an mvn whose result
	; register is the first operand of ldclrl at @var64's address, with no dmb before or after.
	define i64 @test_atomic_load_and_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_release:
	%old = atomicrmw and i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclrl x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i64 %old
	}

	; Result-discarding `atomicrmw and` on @var32 (i32, release): expects an mvn whose result
	; register is the value operand of stclrl at @var32's address, with no dmb before or after.
	; The stclrl pattern reuses the NOT capture (as the ldclrl tests do) so the test fails if a
	; register other than the mvn result is stored.
	define void @test_atomic_load_and_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_noret_release:
	atomicrmw and i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stclrl w[[NOT]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw and` on @var64 (i64, release): expects an mvn whose result
	; register is the value operand of stclrl at @var64's address, with no dmb before or after.
	; The stclrl pattern reuses the NOT capture (as the ldclrl tests do) so the test fails if a
	; register other than the mvn result is stored.
	define void @test_atomic_load_and_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_noret_release:
	atomicrmw and i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stclrl x[[NOT]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw and` on @var8 (i8, seq_cst): expects an mvn whose result
	; register is the first operand of ldclralb at @var8's address, with no dmb before or after.
	define i8 @test_atomic_load_and_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i8_seq_cst:
	%old = atomicrmw and i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldclralb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i8 %old
	}

	; Value-returning `atomicrmw and` on @var16 (i16, seq_cst): expects an mvn whose result
	; register is the first operand of ldclralh at @var16's address, with no dmb before or after.
	define i16 @test_atomic_load_and_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i16_seq_cst:
	%old = atomicrmw and i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldclralh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i16 %old
	}

	; Value-returning `atomicrmw and` on @var32 (i32, seq_cst): expects an mvn whose result
	; register is the first operand of ldclral at @var32's address, with no dmb before or after.
	define i32 @test_atomic_load_and_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_seq_cst:
	%old = atomicrmw and i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i32 %old
	}

	; Value-returning `atomicrmw and` on @var64 (i64, seq_cst): expects an mvn whose result
	; register is the first operand of ldclral at @var64's address, with no dmb before or after.
	define i64 @test_atomic_load_and_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_seq_cst:
	%old = atomicrmw and i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret i64 %old
	}

	; Result-discarding `atomicrmw and` on @var32 (i32, seq_cst): expects an mvn whose result
	; register is the first operand of ldclral at @var32's address, with no dmb before or after.
	define void @test_atomic_load_and_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i32_noret_seq_cst:
	atomicrmw and i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw and` on @var64 (i64, seq_cst): expects an mvn whose result
	; register is the first operand of ldclral at @var64's address, with no dmb before or after.
	define void @test_atomic_load_and_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_and_i64_noret_seq_cst:
	atomicrmw and i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; cmpxchg on @var8 (i8, acquire/acquire), returning field 0 of the result pair: expects
	; casab (any w registers) at @var8's address, with no dmb before or after.
	define i8 @test_atomic_cmpxchg_i8_acquire(i8 %wanted, i8 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i8_acquire:
	%pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
	%old = extractvalue { i8, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: casab w[[NEW:[0-9]+]], w[[OLD:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; cmpxchg on @var16 (i16, acquire/acquire), returning field 0 of the result pair: expects
	; casah with operands pinned to w0, w1 at @var16's address, with no dmb before or after.
	define i16 @test_atomic_cmpxchg_i16_acquire(i16 %wanted, i16 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i16_acquire:
	%pair = cmpxchg i16* @var16, i16 %wanted, i16 %new acquire acquire
	%old = extractvalue { i16, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: casah w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; cmpxchg on @var32 (i32, acquire/acquire), returning field 0 of the result pair: expects
	; casa with operands pinned to w0, w1 at @var32's address, with no dmb before or after.
	define i32 @test_atomic_cmpxchg_i32_acquire(i32 %wanted, i32 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i32_acquire:
	%pair = cmpxchg i32* @var32, i32 %wanted, i32 %new acquire acquire
	%old = extractvalue { i32, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: casa w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; cmpxchg on @var64 (i64, acquire/acquire), returning field 0 of the result pair: expects
	; casa with operands pinned to x0, x1 at @var64's address, with no dmb before or after.
	define i64 @test_atomic_cmpxchg_i64_acquire(i64 %wanted, i64 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i64_acquire:
	%pair = cmpxchg i64* @var64, i64 %wanted, i64 %new acquire acquire
	%old = extractvalue { i64, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: casa x0, x1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; cmpxchg on @var8 (i8, monotonic/monotonic), returning field 0 of the result pair: expects
	; casb (any w registers) at @var8's address, with no dmb before or after.
	define i8 @test_atomic_cmpxchg_i8_monotonic(i8 %wanted, i8 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i8_monotonic:
	%pair = cmpxchg i8* @var8, i8 %wanted, i8 %new monotonic monotonic
	%old = extractvalue { i8, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: casb w[[NEW:[0-9]+]], w[[OLD:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; cmpxchg on @var16 (i16, monotonic/monotonic), returning field 0 of the result pair: expects
	; cash with operands pinned to w0, w1 at @var16's address, with no dmb before or after.
	define i16 @test_atomic_cmpxchg_i16_monotonic(i16 %wanted, i16 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i16_monotonic:
	%pair = cmpxchg i16* @var16, i16 %wanted, i16 %new monotonic monotonic
	%old = extractvalue { i16, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: cash w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; cmpxchg on @var32 (i32, monotonic/monotonic), returning field 0 of the result pair: expects
	; cas with operands pinned to w0, w1 at @var32's address, with no dmb before or after.
	define i32 @test_atomic_cmpxchg_i32_monotonic(i32 %wanted, i32 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i32_monotonic:
	%pair = cmpxchg i32* @var32, i32 %wanted, i32 %new monotonic monotonic
	%old = extractvalue { i32, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: cas w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; cmpxchg on @var64 (i64, monotonic/monotonic), returning field 0 of the result pair: expects
	; cas with operands pinned to x0, x1 at @var64's address, with no dmb before or after.
	define i64 @test_atomic_cmpxchg_i64_monotonic(i64 %wanted, i64 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i64_monotonic:
	%pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
	%old = extractvalue { i64, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: cas x0, x1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; cmpxchg on @var8 (i8, seq_cst/seq_cst), returning field 0 of the result pair: expects
	; casalb (any w registers) at @var8's address, with no dmb before or after.
	define i8 @test_atomic_cmpxchg_i8_seq_cst(i8 %wanted, i8 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i8_seq_cst:
	%pair = cmpxchg i8* @var8, i8 %wanted, i8 %new seq_cst seq_cst
	%old = extractvalue { i8, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: casalb w[[NEW:[0-9]+]], w[[OLD:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; cmpxchg on @var16 (i16, seq_cst/seq_cst), returning field 0 of the result pair: expects
	; casalh with operands pinned to w0, w1 at @var16's address, with no dmb before or after.
	define i16 @test_atomic_cmpxchg_i16_seq_cst(i16 %wanted, i16 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i16_seq_cst:
	%pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
	%old = extractvalue { i16, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: casalh w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; cmpxchg on @var32 (i32, seq_cst/seq_cst), returning field 0 of the result pair: expects
	; casal with operands pinned to w0, w1 at @var32's address, with no dmb before or after.
	define i32 @test_atomic_cmpxchg_i32_seq_cst(i32 %wanted, i32 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i32_seq_cst:
	%pair = cmpxchg i32* @var32, i32 %wanted, i32 %new seq_cst seq_cst
	%old = extractvalue { i32, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: casal w0, w1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; cmpxchg on @var64 (i64, seq_cst/seq_cst), returning field 0 of the result pair: expects
	; casal with operands pinned to x0, x1 at @var64's address, with no dmb before or after.
	define i64 @test_atomic_cmpxchg_i64_seq_cst(i64 %wanted, i64 %new) nounwind {
	; CHECK-LABEL: test_atomic_cmpxchg_i64_seq_cst:
	%pair = cmpxchg i64* @var64, i64 %wanted, i64 %new seq_cst seq_cst
	%old = extractvalue { i64, i1 } %pair, 0

	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: casal x0, x1, [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Value-returning `atomicrmw max` on @var8 (i8, acq_rel): expects ldsmaxalb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_max_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i8_acq_rel:
	%old = atomicrmw max i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsmaxalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw max` on @var16 (i16, acq_rel): expects ldsmaxalh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_max_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i16_acq_rel:
	%old = atomicrmw max i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsmaxalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw max` on @var32 (i32, acq_rel): expects ldsmaxal at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_max_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_acq_rel:
	%old = atomicrmw max i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw max` on @var64 (i64, acq_rel): expects ldsmaxal at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_max_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_acq_rel:
	%old = atomicrmw max i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw max` on @var32 (i32, acq_rel): expects ldsmaxal with w0 as
	; its first operand at @var32's address, with no dmb before or after.
	define void @test_atomic_load_max_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_noret_acq_rel:
	atomicrmw max i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw max` on @var64 (i64, acq_rel): expects ldsmaxal with x0 as
	; its first operand at @var64's address, with no dmb before or after.
	define void @test_atomic_load_max_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_noret_acq_rel:
	atomicrmw max i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw max` on @var8 (i8, acquire): expects ldsmaxab at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_max_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i8_acquire:
	%old = atomicrmw max i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsmaxab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw max` on @var16 (i16, acquire): expects ldsmaxah at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_max_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i16_acquire:
	%old = atomicrmw max i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsmaxah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw max` on @var32 (i32, acquire): expects ldsmaxa at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_max_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_acquire:
	%old = atomicrmw max i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxa w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw max` on @var64 (i64, acquire): expects ldsmaxa at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_max_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_acquire:
	%old = atomicrmw max i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxa x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw max` on @var32 (i32, acquire): expects ldsmaxa with w0 as
	; its first operand at @var32's address, with no dmb before or after.
	define void @test_atomic_load_max_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_noret_acquire:
	atomicrmw max i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxa w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw max` on @var64 (i64, acquire): expects ldsmaxa with x0 as
	; its first operand at @var64's address, with no dmb before or after.
	define void @test_atomic_load_max_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_noret_acquire:
	atomicrmw max i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxa x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw max` on @var8 (i8, monotonic): expects ldsmaxb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_max_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i8_monotonic:
	%old = atomicrmw max i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsmaxb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw max` on @var16 (i16, monotonic): expects ldsmaxh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_max_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i16_monotonic:
	%old = atomicrmw max i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsmaxh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw max` on @var32 (i32, monotonic): expects ldsmax at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_max_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_monotonic:
	%old = atomicrmw max i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmax w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw max` on @var64 (i64, monotonic): expects ldsmax at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_max_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_monotonic:
	%old = atomicrmw max i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmax x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw max` on @var32 (i32, monotonic): expects stsmax w0 at
	; @var32's address, with no dmb before or after.
	define void @test_atomic_load_max_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_noret_monotonic:
	atomicrmw max i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stsmax w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw max` on @var64 (i64, monotonic): expects stsmax x0 at
	; @var64's address, with no dmb before or after.
	define void @test_atomic_load_max_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_noret_monotonic:
	atomicrmw max i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stsmax x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw max` on @var8 (i8, release): expects ldsmaxlb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_max_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i8_release:
	%old = atomicrmw max i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsmaxlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw max` on @var16 (i16, release): expects ldsmaxlh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_max_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i16_release:
	%old = atomicrmw max i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsmaxlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw max` on @var32 (i32, release): expects ldsmaxl at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_max_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_release:
	%old = atomicrmw max i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw max` on @var64 (i64, release): expects ldsmaxl at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_max_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_release:
	%old = atomicrmw max i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw max` on @var32 (i32, release): expects stsmaxl w0 at
	; @var32's address, with no dmb before or after.
	define void @test_atomic_load_max_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_noret_release:
	atomicrmw max i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stsmaxl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw max` on @var64 (i64, release): expects stsmaxl x0 at
	; @var64's address, with no dmb before or after.
	define void @test_atomic_load_max_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_noret_release:
	atomicrmw max i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stsmaxl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw max` on @var8 (i8, seq_cst): expects ldsmaxalb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_max_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i8_seq_cst:
	%old = atomicrmw max i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsmaxalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw max` on @var16 (i16, seq_cst): expects ldsmaxalh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_max_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i16_seq_cst:
	%old = atomicrmw max i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsmaxalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw max` on @var32 (i32, seq_cst): expects ldsmaxal at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_max_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_seq_cst:
	%old = atomicrmw max i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw max` on @var64 (i64, seq_cst): expects ldsmaxal at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_max_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_seq_cst:
	%old = atomicrmw max i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw max` on @var32 (i32, seq_cst): expects ldsmaxal with w0 as
	; its first operand at @var32's address, with no dmb before or after.
	define void @test_atomic_load_max_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i32_noret_seq_cst:
	atomicrmw max i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmaxal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw max` on @var64 (i64, seq_cst): expects ldsmaxal with x0 as
	; its first operand at @var64's address, with no dmb before or after.
	define void @test_atomic_load_max_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_max_i64_noret_seq_cst:
	atomicrmw max i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmaxal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw min` on @var8 (i8, acq_rel): expects ldsminalb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_min_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i8_acq_rel:
	%old = atomicrmw min i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsminalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw min` on @var16 (i16, acq_rel): expects ldsminalh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_min_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i16_acq_rel:
	%old = atomicrmw min i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsminalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw min` on @var32 (i32, acq_rel): expects ldsminal at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_min_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_acq_rel:
	%old = atomicrmw min i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw min` on @var64 (i64, acq_rel): expects ldsminal at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_min_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_acq_rel:
	%old = atomicrmw min i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw min` on @var32 (i32, acq_rel): expects ldsminal with w0 as
	; its first operand at @var32's address, with no dmb before or after.
	define void @test_atomic_load_min_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_noret_acq_rel:
	atomicrmw min i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw min` on @var64 (i64, acq_rel): expects ldsminal with x0 as
	; its first operand at @var64's address, with no dmb before or after.
	define void @test_atomic_load_min_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_noret_acq_rel:
	atomicrmw min i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw min` on @var8 (i8, acquire): expects ldsminab at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_min_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i8_acquire:
	%old = atomicrmw min i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsminab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw min` on @var16 (i16, acquire): expects ldsminah at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_min_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i16_acquire:
	%old = atomicrmw min i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsminah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw min` on @var32 (i32, acquire): expects ldsmina at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_min_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_acquire:
	%old = atomicrmw min i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmina w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw min` on @var64 (i64, acquire): expects ldsmina at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_min_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_acquire:
	%old = atomicrmw min i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmina x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw min` on @var32 (i32, acquire): expects ldsmina with w0 as
	; its first operand at @var32's address, with no dmb before or after.
	define void @test_atomic_load_min_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_noret_acquire:
	atomicrmw min i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmina w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw min` on @var64 (i64, acquire): expects ldsmina with x0 as
	; its first operand at @var64's address, with no dmb before or after.
	define void @test_atomic_load_min_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_noret_acquire:
	atomicrmw min i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmina x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw min` on @var8 (i8, monotonic): expects ldsminb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_min_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i8_monotonic:
	%old = atomicrmw min i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsminb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw min` on @var16 (i16, monotonic): expects ldsminh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_min_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i16_monotonic:
	%old = atomicrmw min i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsminh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Value-returning `atomicrmw min` on @var32 (i32, monotonic): expects ldsmin at @var32's
	; address, with no dmb before or after.
	define i32 @test_atomic_load_min_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_monotonic:
	%old = atomicrmw min i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsmin w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Value-returning `atomicrmw min` on @var64 (i64, monotonic): expects ldsmin at @var64's
	; address, with no dmb before or after.
	define i64 @test_atomic_load_min_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_monotonic:
	%old = atomicrmw min i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsmin x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Result-discarding `atomicrmw min` on @var32 (i32, monotonic): expects stsmin w0 at
	; @var32's address, with no dmb before or after.
	define void @test_atomic_load_min_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_noret_monotonic:
	atomicrmw min i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stsmin w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Result-discarding `atomicrmw min` on @var64 (i64, monotonic): expects stsmin x0 at
	; @var64's address, with no dmb before or after.
	define void @test_atomic_load_min_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_noret_monotonic:
	atomicrmw min i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stsmin x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Value-returning `atomicrmw min` on @var8 (i8, release): expects ldsminlb at @var8's
	; address, with no dmb before or after.
	define i8 @test_atomic_load_min_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i8_release:
	%old = atomicrmw min i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsminlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Value-returning `atomicrmw min` on @var16 (i16, release): expects ldsminlh at @var16's
	; address, with no dmb before or after.
	define i16 @test_atomic_load_min_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i16_release:
	%old = atomicrmw min i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsminlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic signed-min on @var32 (release); the previous value is returned.
	; Expects a single ldsminl on var32's address and no dmb barrier.
	define i32 @test_atomic_load_min_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_release:
	%old = atomicrmw min i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic signed-min on @var64 (release); the previous value is returned.
	; Expects a single ldsminl on var64's address and no dmb barrier.
	define i64 @test_atomic_load_min_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_release:
	%old = atomicrmw min i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic signed-min on @var32 (release) whose result is discarded.
	; Expects stsminl (no result register) with w0 as operand, and no dmb barrier.
	define void @test_atomic_load_min_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_noret_release:
	atomicrmw min i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stsminl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic signed-min on @var64 (release) whose result is discarded.
	; Expects stsminl (no result register) with x0 as operand, and no dmb barrier.
	define void @test_atomic_load_min_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_noret_release:
	atomicrmw min i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stsminl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic signed-min on @var8 (seq_cst); the previous value is returned.
	; Expects a single ldsminalb on var8's address and no dmb barrier.
	define i8 @test_atomic_load_min_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i8_seq_cst:
	%old = atomicrmw min i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsminalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic signed-min on @var16 (seq_cst); the previous value is returned.
	; Expects a single ldsminalh on var16's address and no dmb barrier.
	define i16 @test_atomic_load_min_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i16_seq_cst:
	%old = atomicrmw min i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsminalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic signed-min on @var32 (seq_cst); the previous value is returned.
	; Expects a single ldsminal on var32's address and no dmb barrier.
	define i32 @test_atomic_load_min_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_seq_cst:
	%old = atomicrmw min i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic signed-min on @var64 (seq_cst); the previous value is returned.
	; Expects a single ldsminal on var64's address and no dmb barrier.
	define i64 @test_atomic_load_min_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_seq_cst:
	%old = atomicrmw min i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic signed-min on @var32 (seq_cst) whose result is discarded.
	; Expects ldsminal with w0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_min_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i32_noret_seq_cst:
	atomicrmw min i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsminal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic signed-min on @var64 (seq_cst) whose result is discarded.
	; Expects ldsminal with x0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_min_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_min_i64_noret_seq_cst:
	atomicrmw min i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsminal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var8 (acq_rel); the previous value is returned.
	; Expects a single ldsetalb on var8's address and no dmb barrier.
	define i8 @test_atomic_load_or_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i8_acq_rel:
	%old = atomicrmw or i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsetalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic bitwise-or on @var16 (acq_rel); the previous value is returned.
	; Expects a single ldsetalh on var16's address and no dmb barrier.
	define i16 @test_atomic_load_or_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i16_acq_rel:
	%old = atomicrmw or i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsetalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic bitwise-or on @var32 (acq_rel); the previous value is returned.
	; Expects a single ldsetal on var32's address and no dmb barrier.
	define i32 @test_atomic_load_or_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_acq_rel:
	%old = atomicrmw or i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic bitwise-or on @var64 (acq_rel); the previous value is returned.
	; Expects a single ldsetal on var64's address and no dmb barrier.
	define i64 @test_atomic_load_or_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_acq_rel:
	%old = atomicrmw or i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic bitwise-or on @var32 (acq_rel) whose result is discarded.
	; Expects ldsetal with w0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_or_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_noret_acq_rel:
	atomicrmw or i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var64 (acq_rel) whose result is discarded.
	; Expects ldsetal with x0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_or_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_noret_acq_rel:
	atomicrmw or i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var8 (acquire); the previous value is returned.
	; Expects a single ldsetab on var8's address and no dmb barrier.
	define i8 @test_atomic_load_or_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i8_acquire:
	%old = atomicrmw or i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsetab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic bitwise-or on @var16 (acquire); the previous value is returned.
	; Expects a single ldsetah on var16's address and no dmb barrier.
	define i16 @test_atomic_load_or_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i16_acquire:
	%old = atomicrmw or i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsetah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic bitwise-or on @var32 (acquire); the previous value is returned.
	; Expects a single ldseta on var32's address and no dmb barrier.
	define i32 @test_atomic_load_or_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_acquire:
	%old = atomicrmw or i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldseta w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic bitwise-or on @var64 (acquire); the previous value is returned.
	; Expects a single ldseta on var64's address and no dmb barrier.
	define i64 @test_atomic_load_or_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_acquire:
	%old = atomicrmw or i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldseta x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic bitwise-or on @var32 (acquire) whose result is discarded.
	; Expects ldseta with w0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_or_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_noret_acquire:
	atomicrmw or i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldseta w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var64 (acquire) whose result is discarded.
	; Expects ldseta with x0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_or_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_noret_acquire:
	atomicrmw or i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldseta x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var8 (monotonic); the previous value is returned.
	; Expects a single ldsetb on var8's address and no dmb barrier.
	define i8 @test_atomic_load_or_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i8_monotonic:
	%old = atomicrmw or i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsetb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic bitwise-or on @var16 (monotonic); the previous value is returned.
	; Expects a single ldseth on var16's address and no dmb barrier.
	define i16 @test_atomic_load_or_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i16_monotonic:
	%old = atomicrmw or i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldseth w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic bitwise-or on @var32 (monotonic); the previous value is returned.
	; Expects a single ldset on var32's address and no dmb barrier.
	define i32 @test_atomic_load_or_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_monotonic:
	%old = atomicrmw or i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldset w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic bitwise-or on @var64 (monotonic); the previous value is returned.
	; Expects a single ldset on var64's address and no dmb barrier.
	define i64 @test_atomic_load_or_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_monotonic:
	%old = atomicrmw or i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldset x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic bitwise-or on @var32 (monotonic) whose result is discarded.
	; Expects stset (no result register) with w0 as operand, and no dmb barrier.
	define void @test_atomic_load_or_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_noret_monotonic:
	atomicrmw or i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stset w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var64 (monotonic) whose result is discarded.
	; Expects stset (no result register) with x0 as operand, and no dmb barrier.
	define void @test_atomic_load_or_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_noret_monotonic:
	atomicrmw or i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stset x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var8 (release); the previous value is returned.
	; Expects a single ldsetlb on var8's address and no dmb barrier.
	define i8 @test_atomic_load_or_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i8_release:
	%old = atomicrmw or i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsetlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic bitwise-or on @var16 (release); the previous value is returned.
	; Expects a single ldsetlh on var16's address and no dmb barrier.
	define i16 @test_atomic_load_or_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i16_release:
	%old = atomicrmw or i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsetlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic bitwise-or on @var32 (release); the previous value is returned.
	; Expects a single ldsetl on var32's address and no dmb barrier.
	define i32 @test_atomic_load_or_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_release:
	%old = atomicrmw or i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic bitwise-or on @var64 (release); the previous value is returned.
	; Expects a single ldsetl on var64's address and no dmb barrier.
	define i64 @test_atomic_load_or_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_release:
	%old = atomicrmw or i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic bitwise-or on @var32 (release) whose result is discarded.
	; Expects stsetl (no result register) with w0 as operand, and no dmb barrier.
	define void @test_atomic_load_or_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_noret_release:
	atomicrmw or i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stsetl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var64 (release) whose result is discarded.
	; Expects stsetl (no result register) with x0 as operand, and no dmb barrier.
	define void @test_atomic_load_or_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_noret_release:
	atomicrmw or i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stsetl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var8 (seq_cst); the previous value is returned.
	; Expects a single ldsetalb on var8's address and no dmb barrier.
	define i8 @test_atomic_load_or_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i8_seq_cst:
	%old = atomicrmw or i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldsetalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic bitwise-or on @var16 (seq_cst); the previous value is returned.
	; Expects a single ldsetalh on var16's address and no dmb barrier.
	define i16 @test_atomic_load_or_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i16_seq_cst:
	%old = atomicrmw or i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldsetalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic bitwise-or on @var32 (seq_cst); the previous value is returned.
	; Expects a single ldsetal on var32's address and no dmb barrier.
	define i32 @test_atomic_load_or_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_seq_cst:
	%old = atomicrmw or i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic bitwise-or on @var64 (seq_cst); the previous value is returned.
	; Expects a single ldsetal on var64's address and no dmb barrier.
	define i64 @test_atomic_load_or_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_seq_cst:
	%old = atomicrmw or i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic bitwise-or on @var32 (seq_cst) whose result is discarded.
	; Expects ldsetal with w0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_or_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i32_noret_seq_cst:
	atomicrmw or i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldsetal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic bitwise-or on @var64 (seq_cst) whose result is discarded.
	; Expects ldsetal with x0 as its first operand, and no dmb barrier.
	define void @test_atomic_load_or_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_or_i64_noret_seq_cst:
	atomicrmw or i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldsetal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic subtract on @var8 (acq_rel); the previous value is returned.
	; Expects a neg whose result register feeds ldaddalb, with no dmb barrier.
	define i8 @test_atomic_load_sub_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i8_acq_rel:
	%old = atomicrmw sub i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddalb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic subtract on @var16 (acq_rel); the previous value is returned.
	; Expects a neg whose result register feeds ldaddalh, with no dmb barrier.
	define i16 @test_atomic_load_sub_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i16_acq_rel:
	%old = atomicrmw sub i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddalh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic subtract on @var32 (acq_rel); the previous value is returned.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define i32 @test_atomic_load_sub_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_acq_rel:
	%old = atomicrmw sub i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic subtract on @var64 (acq_rel); the previous value is returned.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define i64 @test_atomic_load_sub_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_acq_rel:
	%old = atomicrmw sub i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic subtract on @var32 (acq_rel) whose result is discarded.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define void @test_atomic_load_sub_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_noret_acq_rel:
	atomicrmw sub i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var64 (acq_rel) whose result is discarded.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define void @test_atomic_load_sub_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_noret_acq_rel:
	atomicrmw sub i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var8 (acquire); the previous value is returned.
	; Expects a neg whose result register feeds ldaddab, with no dmb barrier.
	define i8 @test_atomic_load_sub_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i8_acquire:
	%old = atomicrmw sub i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddab w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic subtract on @var16 (acquire); the previous value is returned.
	; Expects a neg whose result register feeds ldaddah, with no dmb barrier.
	define i16 @test_atomic_load_sub_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i16_acquire:
	%old = atomicrmw sub i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddah w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic subtract on @var32 (acquire); the previous value is returned.
	; Expects a neg whose result register feeds ldadda, with no dmb barrier.
	define i32 @test_atomic_load_sub_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_acquire:
	%old = atomicrmw sub i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldadda w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic subtract on @var64 (acquire); the previous value is returned.
	; Expects a neg whose result register feeds ldadda, with no dmb barrier.
	define i64 @test_atomic_load_sub_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_acquire:
	%old = atomicrmw sub i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldadda x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic subtract on @var32 (acquire) whose result is discarded.
	; Expects a neg whose result register feeds ldadda, with no dmb barrier.
	define void @test_atomic_load_sub_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_noret_acquire:
	atomicrmw sub i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldadda w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var64 (acquire) whose result is discarded.
	; Expects a neg whose result register feeds ldadda, with no dmb barrier.
	define void @test_atomic_load_sub_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_noret_acquire:
	atomicrmw sub i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldadda x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var8 (monotonic); the previous value is returned.
	; Expects a neg whose result register feeds ldaddb, with no dmb barrier.
	define i8 @test_atomic_load_sub_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i8_monotonic:
	%old = atomicrmw sub i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic subtract on @var16 (monotonic); the previous value is returned.
	; Expects a neg whose result register feeds ldaddh, with no dmb barrier.
	define i16 @test_atomic_load_sub_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i16_monotonic:
	%old = atomicrmw sub i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic subtract on @var32 (monotonic); the previous value is returned.
	; Expects a neg whose result register feeds ldadd, with no dmb barrier.
	define i32 @test_atomic_load_sub_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_monotonic:
	%old = atomicrmw sub i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldadd w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic subtract on @var64 (monotonic); the previous value is returned.
	; Expects a neg whose result register feeds ldadd, with no dmb barrier.
	define i64 @test_atomic_load_sub_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_monotonic:
	%old = atomicrmw sub i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldadd x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic subtract on @var32 (monotonic) whose result is discarded.
	; Expects the operand to be negated and that same negated register to be the
	; one handed to stadd, with no dmb barrier.
	define void @test_atomic_load_sub_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_noret_monotonic:
	atomicrmw sub i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; Reuse NEG so that storing the un-negated operand (an add) cannot pass.
	; CHECK: stadd w[[NEG]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var64 (monotonic) whose result is discarded.
	; Expects the operand to be negated and that same negated register to be the
	; one handed to stadd, with no dmb barrier.
	define void @test_atomic_load_sub_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_noret_monotonic:
	atomicrmw sub i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; Reuse NEG so that storing the un-negated operand (an add) cannot pass.
	; CHECK: stadd x[[NEG]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var8 (release); the previous value is returned.
	; Expects a neg whose result register feeds ldaddlb, with no dmb barrier.
	define i8 @test_atomic_load_sub_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i8_release:
	%old = atomicrmw sub i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddlb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic subtract on @var16 (release); the previous value is returned.
	; Expects a neg whose result register feeds ldaddlh, with no dmb barrier.
	define i16 @test_atomic_load_sub_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i16_release:
	%old = atomicrmw sub i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddlh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic subtract on @var32 (release); the previous value is returned.
	; Expects a neg whose result register feeds ldaddl, with no dmb barrier.
	define i32 @test_atomic_load_sub_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_release:
	%old = atomicrmw sub i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddl w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic subtract on @var64 (release); the previous value is returned.
	; Expects a neg whose result register feeds ldaddl, with no dmb barrier.
	define i64 @test_atomic_load_sub_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_release:
	%old = atomicrmw sub i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddl x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic subtract on @var32 (release) whose result is discarded.
	; Expects the operand to be negated and that same negated register to be the
	; one handed to staddl, with no dmb barrier.
	define void @test_atomic_load_sub_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_noret_release:
	atomicrmw sub i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; Reuse NEG so that storing the un-negated operand (an add) cannot pass.
	; CHECK: staddl w[[NEG]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var64 (release) whose result is discarded.
	; Expects the operand to be negated and that same negated register to be the
	; one handed to staddl, with no dmb barrier.
	define void @test_atomic_load_sub_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_noret_release:
	atomicrmw sub i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; Reuse NEG so that storing the un-negated operand (an add) cannot pass.
	; CHECK: staddl x[[NEG]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var8 (seq_cst); the previous value is returned.
	; Expects a neg whose result register feeds ldaddalb, with no dmb barrier.
	define i8 @test_atomic_load_sub_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i8_seq_cst:
	%old = atomicrmw sub i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldaddalb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic subtract on @var16 (seq_cst); the previous value is returned.
	; Expects a neg whose result register feeds ldaddalh, with no dmb barrier.
	define i16 @test_atomic_load_sub_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i16_seq_cst:
	%old = atomicrmw sub i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldaddalh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic subtract on @var32 (seq_cst); the previous value is returned.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define i32 @test_atomic_load_sub_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_seq_cst:
	%old = atomicrmw sub i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic subtract on @var64 (seq_cst); the previous value is returned.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define i64 @test_atomic_load_sub_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_seq_cst:
	%old = atomicrmw sub i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic subtract on @var32 (seq_cst) whose result is discarded.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define void @test_atomic_load_sub_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i32_noret_seq_cst:
	atomicrmw sub i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic subtract on @var64 (seq_cst) whose result is discarded.
	; Expects a neg whose result register feeds ldaddal, with no dmb barrier.
	define void @test_atomic_load_sub_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_sub_i64_noret_seq_cst:
	atomicrmw sub i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]]
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic exchange on @var8 (acq_rel); the previous value is returned.
	; Expects a single swpalb on var8's address and no dmb barrier.
	define i8 @test_atomic_load_xchg_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i8_acq_rel:
	%old = atomicrmw xchg i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: swpalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic exchange on @var16 (acq_rel); the previous value is returned.
	; Expects a single swpalh on var16's address and no dmb barrier.
	define i16 @test_atomic_load_xchg_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i16_acq_rel:
	%old = atomicrmw xchg i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: swpalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic exchange on @var32 (acq_rel); the previous value is returned.
	; Expects a single swpal on var32's address and no dmb barrier.
	define i32 @test_atomic_load_xchg_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_acq_rel:
	%old = atomicrmw xchg i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic exchange on @var64 (acq_rel); the previous value is returned.
	; Expects a single swpal on var64's address and no dmb barrier.
	define i64 @test_atomic_load_xchg_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_acq_rel:
	%old = atomicrmw xchg i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic exchange on @var32 (acq_rel) whose result is discarded.
	; Expects swpal (both data registers left unconstrained) and no dmb barrier.
	define void @test_atomic_load_xchg_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_noret_acq_rel:
	atomicrmw xchg i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic exchange on @var64 (acq_rel) whose result is discarded.
	; Expects swpal (both data registers left unconstrained) and no dmb barrier.
	define void @test_atomic_load_xchg_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_noret_acq_rel:
	atomicrmw xchg i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic exchange on @var8 (acquire); the previous value is returned.
	; Expects a single swpab on var8's address and no dmb barrier.
	define i8 @test_atomic_load_xchg_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i8_acquire:
	%old = atomicrmw xchg i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: swpab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic exchange on @var16 (acquire); the previous value is returned.
	; Expects a single swpah on var16's address and no dmb barrier.
	define i16 @test_atomic_load_xchg_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i16_acquire:
	%old = atomicrmw xchg i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: swpah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic xchg of %offset into @var32 (i32, acquire); returns the atomicrmw result %old.
	; Expected lowering: one swpa on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_xchg_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_acquire:
	%old = atomicrmw xchg i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpa w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic xchg of %offset into @var64 (i64, acquire); returns the atomicrmw result %old.
	; Expected lowering: one swpa on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_xchg_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_acquire:
	%old = atomicrmw xchg i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpa x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic xchg of %offset into @var32 (i32, acquire); the atomicrmw result is unused.
	; Expected lowering: one swpa with two numbered w registers, and no dmb around it.
	define void @test_atomic_load_xchg_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_noret_acquire:
	atomicrmw xchg i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpa w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var64 (i64, acquire); the atomicrmw result is unused.
	; Expected lowering: one swpa with two numbered x registers, and no dmb around it.
	define void @test_atomic_load_xchg_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_noret_acquire:
	atomicrmw xchg i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpa x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var8 (i8, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one swpb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_xchg_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i8_monotonic:
	%old = atomicrmw xchg i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: swpb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic xchg of %offset into @var16 (i16, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one swph on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_xchg_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i16_monotonic:
	%old = atomicrmw xchg i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: swph w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic xchg of %offset into @var32 (i32, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one swp on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_xchg_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_monotonic:
	%old = atomicrmw xchg i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swp w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic xchg of %offset into @var64 (i64, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one swp on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_xchg_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_monotonic:
	%old = atomicrmw xchg i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swp x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic xchg of %offset into @var32 (i32, monotonic); the atomicrmw result is unused.
	; Expected lowering: one swp, with no dmb around it.
	; NOTE(review): the second register pattern also admits lowercase letters (and a
	; comma), presumably so that wzr can match when the result is dead -- confirm.
	define void @test_atomic_load_xchg_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_noret_monotonic:
	atomicrmw xchg i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swp w[[OLD:[0-9]+]], w[[NEW:[0-9,a-z]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var64 (i64, monotonic); the atomicrmw result is unused.
	; Expected lowering: one swp, with no dmb around it.
	; NOTE(review): the second register pattern also admits lowercase letters (and a
	; comma), presumably so that xzr can match when the result is dead -- confirm.
	define void @test_atomic_load_xchg_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_noret_monotonic:
	atomicrmw xchg i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swp x[[OLD:[0-9]+]], x[[NEW:[0-9,a-z]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var8 (i8, release); returns the atomicrmw result %old.
	; Expected lowering: one swplb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_xchg_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i8_release:
	%old = atomicrmw xchg i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: swplb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic xchg of %offset into @var16 (i16, release); returns the atomicrmw result %old.
	; Expected lowering: one swplh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_xchg_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i16_release:
	%old = atomicrmw xchg i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: swplh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic xchg of %offset into @var32 (i32, release); returns the atomicrmw result %old.
	; Expected lowering: one swpl on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_xchg_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_release:
	%old = atomicrmw xchg i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic xchg of %offset into @var64 (i64, release); returns the atomicrmw result %old.
	; Expected lowering: one swpl on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_xchg_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_release:
	%old = atomicrmw xchg i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic xchg of %offset into @var32 (i32, release); the atomicrmw result is unused.
	; Expected lowering: one swpl, with no dmb around it.
	; NOTE(review): the second register pattern also admits lowercase letters (and a
	; comma), presumably so that wzr can match when the result is dead -- confirm.
	define void @test_atomic_load_xchg_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_noret_release:
	atomicrmw xchg i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpl w[[OLD:[0-9]+]], w[[NEW:[0-9,a-z]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var64 (i64, release); the atomicrmw result is unused.
	; Expected lowering: one swpl, with no dmb around it.
	; NOTE(review): the second register pattern also admits lowercase letters (and a
	; comma), presumably so that xzr can match when the result is dead -- confirm.
	define void @test_atomic_load_xchg_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_noret_release:
	atomicrmw xchg i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpl x[[OLD:[0-9]+]], x[[NEW:[0-9,a-z]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var8 (i8, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one swpalb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_xchg_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i8_seq_cst:
	%old = atomicrmw xchg i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: swpalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic xchg of %offset into @var16 (i16, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one swpalh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_xchg_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i16_seq_cst:
	%old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: swpalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic xchg of %offset into @var32 (i32, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one swpal on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_xchg_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_seq_cst:
	%old = atomicrmw xchg i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic xchg of %offset into @var64 (i64, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one swpal on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_xchg_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_seq_cst:
	%old = atomicrmw xchg i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic xchg of %offset into @var32 (i32, seq_cst); the atomicrmw result is unused.
	; Expected lowering: one swpal with two numbered w registers, and no dmb around it.
	define void @test_atomic_load_xchg_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i32_noret_seq_cst:
	atomicrmw xchg i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: swpal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic xchg of %offset into @var64 (i64, seq_cst); the atomicrmw result is unused.
	; Expected lowering: one swpal with two numbered x registers, and no dmb around it.
	define void @test_atomic_load_xchg_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xchg_i64_noret_seq_cst:
	atomicrmw xchg i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: swpal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret void
	}

	; Atomic umax of %offset into @var8 (i8, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxalb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umax_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i8_acq_rel:
	%old = atomicrmw umax i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldumaxalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umax of %offset into @var16 (i16, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxalh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umax_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i16_acq_rel:
	%old = atomicrmw umax i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldumaxalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umax of %offset into @var32 (i32, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxal on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umax_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_acq_rel:
	%old = atomicrmw umax i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umax of %offset into @var64 (i64, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxal on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umax_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_acq_rel:
	%old = atomicrmw umax i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umax of %offset into @var32 (i32, acq_rel); the atomicrmw result is unused.
	; Expected lowering: one ldumaxal with w0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umax_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_noret_acq_rel:
	atomicrmw umax i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var64 (i64, acq_rel); the atomicrmw result is unused.
	; Expected lowering: one ldumaxal with x0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umax_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_noret_acq_rel:
	atomicrmw umax i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var8 (i8, acquire); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxab on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umax_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i8_acquire:
	%old = atomicrmw umax i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldumaxab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umax of %offset into @var16 (i16, acquire); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxah on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umax_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i16_acquire:
	%old = atomicrmw umax i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldumaxah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umax of %offset into @var32 (i32, acquire); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxa on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umax_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_acquire:
	%old = atomicrmw umax i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxa w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umax of %offset into @var64 (i64, acquire); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxa on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umax_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_acquire:
	%old = atomicrmw umax i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxa x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umax of %offset into @var32 (i32, acquire); the atomicrmw result is unused.
	; Expected lowering: one ldumaxa with w0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umax_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_noret_acquire:
	atomicrmw umax i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxa w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var64 (i64, acquire); the atomicrmw result is unused.
	; Expected lowering: one ldumaxa with x0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umax_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_noret_acquire:
	atomicrmw umax i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxa x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var8 (i8, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umax_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i8_monotonic:
	%old = atomicrmw umax i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldumaxb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umax of %offset into @var16 (i16, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umax_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i16_monotonic:
	%old = atomicrmw umax i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldumaxh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umax of %offset into @var32 (i32, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one ldumax on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umax_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_monotonic:
	%old = atomicrmw umax i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumax w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umax of %offset into @var64 (i64, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one ldumax on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umax_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_monotonic:
	%old = atomicrmw umax i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumax x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umax of %offset into @var32 (i32, monotonic); the atomicrmw result is unused.
	; Expected lowering: stumax (no result register) with w0 as its operand, and no dmb around it.
	define void @test_atomic_load_umax_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_noret_monotonic:
	atomicrmw umax i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stumax w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var64 (i64, monotonic); the atomicrmw result is unused.
	; Expected lowering: stumax (no result register) with x0 as its operand, and no dmb around it.
	define void @test_atomic_load_umax_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_noret_monotonic:
	atomicrmw umax i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stumax x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var8 (i8, release); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxlb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umax_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i8_release:
	%old = atomicrmw umax i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldumaxlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umax of %offset into @var16 (i16, release); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxlh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umax_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i16_release:
	%old = atomicrmw umax i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldumaxlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umax of %offset into @var32 (i32, release); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxl on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umax_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_release:
	%old = atomicrmw umax i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umax of %offset into @var64 (i64, release); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxl on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umax_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_release:
	%old = atomicrmw umax i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umax of %offset into @var32 (i32, release); the atomicrmw result is unused.
	; Expected lowering: stumaxl (no result register) with w0 as its operand, and no dmb around it.
	define void @test_atomic_load_umax_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_noret_release:
	atomicrmw umax i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stumaxl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var64 (i64, release); the atomicrmw result is unused.
	; Expected lowering: stumaxl (no result register) with x0 as its operand, and no dmb around it.
	define void @test_atomic_load_umax_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_noret_release:
	atomicrmw umax i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stumaxl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var8 (i8, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxalb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umax_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i8_seq_cst:
	%old = atomicrmw umax i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldumaxalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umax of %offset into @var16 (i16, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxalh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umax_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i16_seq_cst:
	%old = atomicrmw umax i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldumaxalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umax of %offset into @var32 (i32, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxal on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umax_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_seq_cst:
	%old = atomicrmw umax i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umax of %offset into @var64 (i64, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one ldumaxal on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umax_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_seq_cst:
	%old = atomicrmw umax i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umax of %offset into @var32 (i32, seq_cst); the atomicrmw result is unused.
	; Expected lowering: one ldumaxal with w0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umax_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i32_noret_seq_cst:
	atomicrmw umax i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumaxal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umax of %offset into @var64 (i64, seq_cst); the atomicrmw result is unused.
	; Expected lowering: one ldumaxal with x0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umax_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umax_i64_noret_seq_cst:
	atomicrmw umax i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumaxal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var8 (i8, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one lduminalb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umin_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i8_acq_rel:
	%old = atomicrmw umin i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: lduminalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umin of %offset into @var16 (i16, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one lduminalh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umin_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i16_acq_rel:
	%old = atomicrmw umin i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: lduminalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umin of %offset into @var32 (i32, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one lduminal on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umin_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_acq_rel:
	%old = atomicrmw umin i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umin of %offset into @var64 (i64, acq_rel); returns the atomicrmw result %old.
	; Expected lowering: one lduminal on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umin_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_acq_rel:
	%old = atomicrmw umin i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umin of %offset into @var32 (i32, acq_rel); the atomicrmw result is unused.
	; Expected lowering: one lduminal with w0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umin_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_noret_acq_rel:
	atomicrmw umin i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var64 (i64, acq_rel); the atomicrmw result is unused.
	; Expected lowering: one lduminal with x0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umin_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_noret_acq_rel:
	atomicrmw umin i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var8 (i8, acquire); returns the atomicrmw result %old.
	; Expected lowering: one lduminab on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umin_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i8_acquire:
	%old = atomicrmw umin i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: lduminab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umin of %offset into @var16 (i16, acquire); returns the atomicrmw result %old.
	; Expected lowering: one lduminah on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umin_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i16_acquire:
	%old = atomicrmw umin i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: lduminah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umin of %offset into @var32 (i32, acquire); returns the atomicrmw result %old.
	; Expected lowering: one ldumina on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umin_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_acquire:
	%old = atomicrmw umin i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumina w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umin of %offset into @var64 (i64, acquire); returns the atomicrmw result %old.
	; Expected lowering: one ldumina on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umin_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_acquire:
	%old = atomicrmw umin i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumina x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umin of %offset into @var32 (i32, acquire); the atomicrmw result is unused.
	; Expected lowering: one ldumina with w0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umin_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_noret_acquire:
	atomicrmw umin i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumina w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var64 (i64, acquire); the atomicrmw result is unused.
	; Expected lowering: one ldumina with x0 as its first operand, and no dmb around it.
	define void @test_atomic_load_umin_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_noret_acquire:
	atomicrmw umin i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumina x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var8 (i8, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one lduminb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umin_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i8_monotonic:
	%old = atomicrmw umin i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: lduminb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umin of %offset into @var16 (i16, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one lduminh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umin_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i16_monotonic:
	%old = atomicrmw umin i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: lduminh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umin of %offset into @var32 (i32, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one ldumin on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umin_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_monotonic:
	%old = atomicrmw umin i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldumin w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umin of %offset into @var64 (i64, monotonic); returns the atomicrmw result %old.
	; Expected lowering: one ldumin on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umin_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_monotonic:
	%old = atomicrmw umin i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldumin x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umin of %offset into @var32 (i32, monotonic); the atomicrmw result is unused.
	; Expected lowering: stumin (no result register) with w0 as its operand, and no dmb around it.
	define void @test_atomic_load_umin_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_noret_monotonic:
	atomicrmw umin i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stumin w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var64 (i64, monotonic); the atomicrmw result is unused.
	; Expected lowering: stumin (no result register) with x0 as its operand, and no dmb around it.
	define void @test_atomic_load_umin_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_noret_monotonic:
	atomicrmw umin i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stumin x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var8 (i8, release); returns the atomicrmw result %old.
	; Expected lowering: one lduminlb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umin_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i8_release:
	%old = atomicrmw umin i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: lduminlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umin of %offset into @var16 (i16, release); returns the atomicrmw result %old.
	; Expected lowering: one lduminlh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umin_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i16_release:
	%old = atomicrmw umin i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: lduminlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umin of %offset into @var32 (i32, release); returns the atomicrmw result %old.
	; Expected lowering: one lduminl on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umin_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_release:
	%old = atomicrmw umin i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umin of %offset into @var64 (i64, release); returns the atomicrmw result %old.
	; Expected lowering: one lduminl on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umin_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_release:
	%old = atomicrmw umin i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	; Atomic umin of %offset into @var32 (i32, release); the atomicrmw result is unused.
	; Expected lowering: stuminl (no result register) with w0 as its operand, and no dmb around it.
	define void @test_atomic_load_umin_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_noret_release:
	atomicrmw umin i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: stuminl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var64 (i64, release); the atomicrmw result is unused.
	; Expected lowering: stuminl (no result register) with x0 as its operand, and no dmb around it.
	define void @test_atomic_load_umin_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_noret_release:
	atomicrmw umin i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: stuminl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	; Atomic umin of %offset into @var8 (i8, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one lduminalb on the address of var8, with no dmb around it.
	define i8 @test_atomic_load_umin_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i8_seq_cst:
	%old = atomicrmw umin i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: lduminalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	; Atomic umin of %offset into @var16 (i16, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one lduminalh on the address of var16, with no dmb around it.
	define i16 @test_atomic_load_umin_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i16_seq_cst:
	%old = atomicrmw umin i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: lduminalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	; Atomic umin of %offset into @var32 (i32, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one lduminal on the address of var32, with no dmb around it.
	define i32 @test_atomic_load_umin_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_seq_cst:
	%old = atomicrmw umin i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	; Atomic umin of %offset into @var64 (i64, seq_cst); returns the atomicrmw result %old.
	; Expected lowering: one lduminal on the address of var64, with no dmb around it.
	define i64 @test_atomic_load_umin_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_seq_cst:
	%old = atomicrmw umin i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_umin_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i32_noret_seq_cst:
	atomicrmw umin i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: lduminal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_umin_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_umin_i64_noret_seq_cst:
	atomicrmw umin i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: lduminal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xor_i8_acq_rel(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i8_acq_rel:
	%old = atomicrmw xor i8* @var8, i8 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldeoralb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xor_i16_acq_rel(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i16_acq_rel:
	%old = atomicrmw xor i16* @var16, i16 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldeoralh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xor_i32_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_acq_rel:
	%old = atomicrmw xor i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeoral w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xor_i64_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_acq_rel:
	%old = atomicrmw xor i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeoral x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xor_i32_noret_acq_rel(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_noret_acq_rel:
	atomicrmw xor i32* @var32, i32 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeoral w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_xor_i64_noret_acq_rel(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_noret_acq_rel:
	atomicrmw xor i64* @var64, i64 %offset acq_rel
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeoral x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xor_i8_acquire(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i8_acquire:
	%old = atomicrmw xor i8* @var8, i8 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldeorab w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xor_i16_acquire(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i16_acquire:
	%old = atomicrmw xor i16* @var16, i16 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldeorah w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xor_i32_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_acquire:
	%old = atomicrmw xor i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeora w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xor_i64_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_acquire:
	%old = atomicrmw xor i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeora x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xor_i32_noret_acquire(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_noret_acquire:
	atomicrmw xor i32* @var32, i32 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeora w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_xor_i64_noret_acquire(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_noret_acquire:
	atomicrmw xor i64* @var64, i64 %offset acquire
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeora x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xor_i8_monotonic(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i8_monotonic:
	%old = atomicrmw xor i8* @var8, i8 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldeorb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xor_i16_monotonic(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i16_monotonic:
	%old = atomicrmw xor i16* @var16, i16 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldeorh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xor_i32_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_monotonic:
	%old = atomicrmw xor i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeor w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xor_i64_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_monotonic:
	%old = atomicrmw xor i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeor x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xor_i32_noret_monotonic(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_noret_monotonic:
	atomicrmw xor i32* @var32, i32 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: steor w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_xor_i64_noret_monotonic(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_noret_monotonic:
	atomicrmw xor i64* @var64, i64 %offset monotonic
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: steor x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xor_i8_release(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i8_release:
	%old = atomicrmw xor i8* @var8, i8 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldeorlb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xor_i16_release(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i16_release:
	%old = atomicrmw xor i16* @var16, i16 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldeorlh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xor_i32_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_release:
	%old = atomicrmw xor i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeorl w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xor_i64_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_release:
	%old = atomicrmw xor i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeorl x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xor_i32_noret_release(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_noret_release:
	atomicrmw xor i32* @var32, i32 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: steorl w0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_xor_i64_noret_release(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_noret_release:
	atomicrmw xor i64* @var64, i64 %offset release
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: steorl x0, [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define i8 @test_atomic_load_xor_i8_seq_cst(i8 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i8_seq_cst:
	%old = atomicrmw xor i8* @var8, i8 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8

	; CHECK: ldeoralb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i8 %old
	}

	define i16 @test_atomic_load_xor_i16_seq_cst(i16 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i16_seq_cst:
	%old = atomicrmw xor i16* @var16, i16 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16

	; CHECK: ldeoralh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i16 %old
	}

	define i32 @test_atomic_load_xor_i32_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_seq_cst:
	%old = atomicrmw xor i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeoral w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i32 %old
	}

	define i64 @test_atomic_load_xor_i64_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_seq_cst:
	%old = atomicrmw xor i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeoral x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb

	ret i64 %old
	}

	define void @test_atomic_load_xor_i32_noret_seq_cst(i32 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i32_noret_seq_cst:
	atomicrmw xor i32* @var32, i32 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32

	; CHECK: ldeoral w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}

	define void @test_atomic_load_xor_i64_noret_seq_cst(i64 %offset) nounwind {
	; CHECK-LABEL: test_atomic_load_xor_i64_noret_seq_cst:
	atomicrmw xor i64* @var64, i64 %offset seq_cst
	; CHECK-NOT: dmb
	; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
	; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64

	; CHECK: ldeoral x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
	; CHECK-NOT: dmb
	ret void
	}


	Index: vendor/llvm/dist-release_60/test/CodeGen/AMDGPU/multilevel-break.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/AMDGPU/multilevel-break.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/AMDGPU/multilevel-break.ll (revision 328362)
	@@ -1,115 +1,116 @@
	; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s \| FileCheck -check-prefix=OPT %s
	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

	; OPT-LABEL: {{^}}define amdgpu_vs void @multi_else_break(
	; OPT: main_body:
	; OPT: LOOP.outer:
	; OPT: LOOP:
	; OPT: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
	; OPT: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
	;
	; OPT: Flow:
	;
	; Ensure two else.break calls, for both the inner and outer loops

	; OPT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
	; OPT-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
	; OPT-NEXT: call void @llvm.amdgcn.end.cf
	;
	; OPT: Flow1:

	; GCN-LABEL: {{^}}multi_else_break:

	; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}

	; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
	; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc

	; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
	; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2

	; Ensure extra or eliminated
	; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
	; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
	; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
	; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]

	; GCN: ; %bb.{{[0-9]+}}: ; %Flow1{{$}}
	; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1

	; Ensure copy is eliminated
	; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
	; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
	; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
	; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
	define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
	main_body:
	br label %LOOP.outer

	LOOP.outer: ; preds = %ENDIF, %main_body
	%tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]
	br label %LOOP

	LOOP: ; preds = %ENDIF, %LOOP.outer
	%tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
	%tmp47 = add i32 %tmp45, 1
	%tmp48 = icmp slt i32 %tmp45, %ub
	br i1 %tmp48, label %ENDIF, label %IF

	IF: ; preds = %LOOP
	ret void

	ENDIF: ; preds = %LOOP
	%tmp51 = icmp eq i32 %tmp47, %cont
	br i1 %tmp51, label %LOOP, label %LOOP.outer
	}

	; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
	; OPT: llvm.amdgcn.break
	-; OPT: llvm.amdgcn.loop
	+; OPT: llvm.amdgcn.break
	; OPT: llvm.amdgcn.if.break
	; OPT: llvm.amdgcn.if.break
	+; OPT: llvm.amdgcn.loop
	; OPT: llvm.amdgcn.end.cf

	; GCN-LABEL: {{^}}multi_if_break_loop:
	; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}

	; Uses a copy intsead of an or
	; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
	; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]]
	define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
	bb:
	%id = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp = sub i32 %id, %arg
	br label %bb1

	bb1:
	%lsr.iv = phi i32 [ undef, %bb ], [ %lsr.iv.next, %case0 ], [ %lsr.iv.next, %case1 ]
	%lsr.iv.next = add i32 %lsr.iv, 1
	%cmp0 = icmp slt i32 %lsr.iv.next, 0
	%load0 = load volatile i32, i32 addrspace(1)* undef, align 4
	switch i32 %load0, label %bb9 [
	i32 0, label %case0
	i32 1, label %case1
	]

	case0:
	%load1 = load volatile i32, i32 addrspace(1)* undef, align 4
	%cmp1 = icmp slt i32 %tmp, %load1
	br i1 %cmp1, label %bb1, label %bb9

	case1:
	%load2 = load volatile i32, i32 addrspace(1)* undef, align 4
	%cmp2 = icmp slt i32 %tmp, %load2
	br i1 %cmp2, label %bb1, label %bb9

	bb9:
	ret void
	}

	declare i32 @llvm.amdgcn.workitem.id.x() #1

	attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }
	Index: vendor/llvm/dist-release_60/test/CodeGen/AMDGPU/nested-loop-conditions.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/AMDGPU/nested-loop-conditions.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/AMDGPU/nested-loop-conditions.ll (revision 328362)
	@@ -1,269 +1,314 @@
	; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s \| FileCheck -check-prefix=IR %s
	; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

	; After structurizing, there are 3 levels of loops. The i1 phi
	; conditions mutually depend on each other, so it isn't safe to delete
	; the condition that appears to have no uses until the loop is
	; completely processed.


	; IR-LABEL: @reduced_nested_loop_conditions(

	; IR: bb5:
	; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ]
	; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
	; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1
	; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7)
	; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
	; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
	; IR-NEXT: br i1 %1, label %bb8, label %Flow

	; IR: bb8:
	; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
	; IR-NEXT: br label %bb13

	; IR: bb10:
	; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ]
	; IR-NEXT: %tmp11 = phi i32 [ %5, %Flow ]
	; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
	; IR-NEXT: br i1 %4, label %bb23, label %bb5

	; IR: Flow:
	; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ]
	; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
	; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1)
	; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
	; IR-NEXT: br label %bb10

	; IR: bb13:
	; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ]
	; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ]
	; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32>
	; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20

	; IR: bb16:
	; IR-NEXT: %tmp17 = extractelement <2 x i32> %tmp15, i64 1
	; IR-NEXT: %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17
	; IR-NEXT: %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18
	; IR-NEXT: br label %bb20

	; IR: bb20:
	; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ]
	; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ]
	; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
	; IR-NEXT: br label %bb9

	; IR: bb23:
	; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
	; IR-NEXT: ret void

	; GCN-LABEL: {{^}}reduced_nested_loop_conditions:

	; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 1
	; GCN-NEXT: s_cbranch_scc1

	; FIXME: Should fold to unconditional branch?
	; GCN: s_mov_b64 vcc, -1
	; GCN-NEXT: ; implicit-def
	; GCN: s_cbranch_vccz

	; GCN: ds_read_b32

	; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9
	; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
	; GCN-NEXT: s_branch [[BB9]]
	define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 {
	bb:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %tmp
	%tmp2 = load volatile i64, i64 addrspace(3)* %tmp1
	br label %bb5

	bb3: ; preds = %bb9
	br i1 true, label %bb4, label %bb13

	bb4: ; preds = %bb3
	br label %bb10

	bb5: ; preds = %bb10, %bb
	%tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
	%tmp7 = icmp eq i32 %tmp6, 1
	br i1 %tmp7, label %bb8, label %bb10

	bb8: ; preds = %bb5
	br label %bb13

	bb9: ; preds = %bb20, %bb9
	br i1 false, label %bb3, label %bb9

	bb10: ; preds = %bb5, %bb4
	%tmp11 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
	%tmp12 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ]
	br i1 %tmp12, label %bb23, label %bb5

	bb13: ; preds = %bb8, %bb3
	%tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ]
	%tmp15 = bitcast i64 %tmp2 to <2 x i32>
	br i1 %tmp14, label %bb16, label %bb20

	bb16: ; preds = %bb13
	%tmp17 = extractelement <2 x i32> %tmp15, i64 1
	%tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17
	%tmp19 = load volatile i32, i32 addrspace(3)* %tmp18
	br label %bb20

	bb20: ; preds = %bb16, %bb13
	%tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
	%tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ]
	br label %bb9

	bb23: ; preds = %bb10
	ret void
	}

	; Earlier version of above, before a run of the structurizer.
	; IR-LABEL: @nested_loop_conditions(

	-; IR: Flow7:
	-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
	-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
	-; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
	-; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
	-; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
	+; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
	+; IR: br i1 %tmp1235, label %bb14.lr.ph, label %Flow

	+; IR: bb14.lr.ph:
	+; IR: br label %bb14
	+
	+; IR: Flow3:
	+; IR: call void @llvm.amdgcn.end.cf(i64 %18)
	+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
	+; IR: %1 = extractvalue { i1, i64 } %0, 0
	+; IR: %2 = extractvalue { i1, i64 } %0, 1
	+; IR: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
	+
	+; IR: bb4.bb13_crit_edge:
	+; IR: br label %Flow4
	+
	+; IR: Flow4:
	+; IR: %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
	+; IR: call void @llvm.amdgcn.end.cf(i64 %2)
	+; IR: br label %Flow
	+
	+; IR: bb13:
	+; IR: br label %bb31
	+
	+; IR: Flow:
	+; IR: %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
	+; IR: %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
	+; IR: %6 = extractvalue { i1, i64 } %5, 0
	+; IR: %7 = extractvalue { i1, i64 } %5, 1
	+; IR: br i1 %6, label %bb13, label %bb31
	+
	+; IR: bb14:
	+; IR: %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
	+; IR: %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
	+; IR: %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
	+; IR: %tmp15 = icmp eq i32 %tmp1037, 1
	+; IR: %8 = xor i1 %tmp15, true
	+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
	+; IR: %10 = extractvalue { i1, i64 } %9, 0
	+; IR: %11 = extractvalue { i1, i64 } %9, 1
	+; IR: br i1 %10, label %bb31.loopexit, label %Flow1
	+
	; IR: Flow1:
	-; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
	-; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
	-; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
	-; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
	-; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
	-; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
	-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
	-; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
	-; IR-NEXT: br i1 %18, label %Flow7, label %bb14
	+; IR: %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
	+; IR: %13 = extractvalue { i1, i64 } %12, 0
	+; IR: %14 = extractvalue { i1, i64 } %12, 1
	+; IR: br i1 %13, label %bb16, label %Flow2

	+; IR: bb16:
	+; IR: %tmp17 = bitcast i64 %tmp3 to <2 x i32>
	+; IR: br label %bb18
	+
	; IR: Flow2:
	-; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
	-; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
	-; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
	-; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
	-; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
	-; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
	-; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
	-; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
	-; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
	-; IR-NEXT: br i1 %25, label %bb21, label %Flow3
	+; IR: %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
	+; IR: %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
	+; IR: %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
	+; IR: %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
	+; IR: %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
	+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
	+; IR: %19 = call i1 @llvm.amdgcn.loop(i64 %18)
	+; IR: br i1 %19, label %Flow3, label %bb14

	+; IR: bb18:
	+; IR: %tmp19 = load volatile i32, i32 addrspace(1)* undef
	+; IR: %tmp20 = icmp slt i32 %tmp19, 9
	+; IR: br i1 %tmp20, label %bb21, label %bb18
	+
	; IR: bb21:
	-; IR: %tmp12 = icmp slt i32 %tmp11, 9
	-; IR-NEXT: %27 = xor i1 %tmp12, true
	-; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
	-; IR-NEXT: br label %Flow3
	+; IR: %tmp22 = extractelement <2 x i32> %tmp17, i64 1
	+; IR: %tmp23 = lshr i32 %tmp22, 16
	+; IR: %tmp24 = select i1 undef, i32 undef, i32 %tmp23
	+; IR: %tmp25 = uitofp i32 %tmp24 to float
	+; IR: %tmp26 = fmul float %tmp25, 0x3EF0001000000000
	+; IR: %tmp27 = fsub float %tmp26, undef
	+; IR: %tmp28 = fcmp olt float %tmp27, 5.000000e-01
	+; IR: %tmp29 = select i1 %tmp28, i64 1, i64 2
	+; IR: %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
	+; IR: %tmp7 = zext i32 %tmp30 to i64
	+; IR: %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
	+; IR: %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
	+; IR: %tmp10 = extractelement <4 x i32> %tmp9, i64 0
	+; IR: %tmp11 = load volatile i32, i32 addrspace(1)* undef
	+; IR: %tmp12 = icmp slt i32 %tmp11, 9
	+; IR: %20 = xor i1 %tmp12, true
	+; IR: %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
	+; IR: br label %Flow2

	-; IR: Flow3:
	-; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
	-; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
	-; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
	-; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
	-; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
	-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
	-; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
	+; IR: bb31.loopexit:
	+; IR: br label %Flow1

	; IR: bb31:
	-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
	-; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
	-; IR-NEXT: ret void
	+; IR: call void @llvm.amdgcn.end.cf(i64 %7)
	+; IR: store volatile i32 0, i32 addrspace(1)* undef
	+; IR: ret void


	; GCN-LABEL: {{^}}nested_loop_conditions:

	; GCN: v_cmp_lt_i32_e32 vcc, 8, v
	; GCN: s_and_b64 vcc, exec, vcc
	; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]]

	; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14
	; GCN: v_cmp_ne_u32_e32 vcc, 1, v
	; GCN-NEXT: s_and_b64 vcc, exec, vcc
	; GCN-NEXT: s_cbranch_vccnz [[BB31]]

	; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18
	; GCN: buffer_load_dword
	; GCN: v_cmp_lt_i32_e32 vcc, 8, v
	; GCN-NEXT: s_and_b64 vcc, exec, vcc
	; GCN-NEXT: s_cbranch_vccnz [[BB18]]

	; GCN: buffer_load_dword
	; GCN: buffer_load_dword
	; GCN: v_cmp_gt_i32_e32 vcc, 9
	; GCN-NEXT: s_and_b64 vcc, exec, vcc
	; GCN-NEXT: s_cbranch_vccnz [[BB14]]

	; GCN: [[BB31]]:
	; GCN: buffer_store_dword
	; GCN: s_endpgm
	define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %arg) #0 {
	bb:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%tmp1 = zext i32 %tmp to i64
	%tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp1
	%tmp3 = load i64, i64 addrspace(1)* %tmp2, align 16
	%tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
	%tmp1033 = extractelement <4 x i32> %tmp932, i64 0
	%tmp1134 = load volatile i32, i32 addrspace(1)* undef
	%tmp1235 = icmp slt i32 %tmp1134, 9
	br i1 %tmp1235, label %bb14.lr.ph, label %bb13

	bb14.lr.ph: ; preds = %bb
	br label %bb14

	bb4.bb13_crit_edge: ; preds = %bb21
	br label %bb13

	bb13: ; preds = %bb4.bb13_crit_edge, %bb
	br label %bb31

	bb14: ; preds = %bb21, %bb14.lr.ph
	%tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %tmp10, %bb21 ]
	%tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %tmp9, %bb21 ]
	%tmp15 = icmp eq i32 %tmp1037, 1
	br i1 %tmp15, label %bb16, label %bb31.loopexit

	bb16: ; preds = %bb14
	%tmp17 = bitcast i64 %tmp3 to <2 x i32>
	br label %bb18

	bb18: ; preds = %bb18, %bb16
	%tmp19 = load volatile i32, i32 addrspace(1)* undef
	%tmp20 = icmp slt i32 %tmp19, 9
	br i1 %tmp20, label %bb21, label %bb18

	bb21: ; preds = %bb18
	%tmp22 = extractelement <2 x i32> %tmp17, i64 1
	%tmp23 = lshr i32 %tmp22, 16
	%tmp24 = select i1 undef, i32 undef, i32 %tmp23
	%tmp25 = uitofp i32 %tmp24 to float
	%tmp26 = fmul float %tmp25, 0x3EF0001000000000
	%tmp27 = fsub float %tmp26, undef
	%tmp28 = fcmp olt float %tmp27, 5.000000e-01
	%tmp29 = select i1 %tmp28, i64 1, i64 2
	%tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
	%tmp7 = zext i32 %tmp30 to i64
	%tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
	%tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
	%tmp10 = extractelement <4 x i32> %tmp9, i64 0
	%tmp11 = load volatile i32, i32 addrspace(1)* undef
	%tmp12 = icmp slt i32 %tmp11, 9
	br i1 %tmp12, label %bb14, label %bb4.bb13_crit_edge

	bb31.loopexit: ; preds = %bb14
	br label %bb31

	bb31: ; preds = %bb31.loopexit, %bb13
	store volatile i32 0, i32 addrspace(1)* undef
	ret void
	}

	declare i32 @llvm.amdgcn.workitem.id.x() #1

	attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }
	Index: vendor/llvm/dist-release_60/test/CodeGen/ARM/and-load-combine.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/ARM/and-load-combine.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/ARM/and-load-combine.ll (revision 328362)
	@@ -1,1069 +1,1065 @@
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=armv7 %s -o - \| FileCheck %s --check-prefix=ARM
	; RUN: llc -mtriple=armv7eb %s -o - \| FileCheck %s --check-prefix=ARMEB
	; RUN: llc -mtriple=armv6m %s -o - \| FileCheck %s --check-prefix=THUMB1
	; RUN: llc -mtriple=thumbv8m.main %s -o - \| FileCheck %s --check-prefix=THUMB2

	define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a,
	i16* nocapture readonly %b) {
	; ARM-LABEL: cmp_xor8_short_short:
	; ARM: ldrb r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: teq r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_xor8_short_short:
	; ARMEB: ldrb r2, [r0, #1]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrb r1, [r1, #1]
	; ARMEB-NEXT: teq r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_xor8_short_short:
	; THUMB1: ldrb r0, [r0]
	; THUMB1-NEXT: ldrb r2, [r1]
	; THUMB1-NEXT: eors r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB0_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB0_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_xor8_short_short:
	; THUMB2: ldrb r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: teq.w r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %a, align 2
	%1 = load i16, i16* %b, align 2
	%xor2 = xor i16 %1, %0
	%2 = and i16 %xor2, 255
	%cmp = icmp eq i16 %2, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_xor8_short_int:
	; ARM: ldrb r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: teq r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_xor8_short_int:
	; ARMEB: ldrb r2, [r0, #1]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrb r1, [r1, #3]
	; ARMEB-NEXT: teq r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_xor8_short_int:
	; THUMB1: ldrb r0, [r0]
	; THUMB1-NEXT: ldrb r2, [r1]
	; THUMB1-NEXT: eors r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB1_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB1_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_xor8_short_int:
	; THUMB2: ldrb r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: teq.w r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %a, align 2
	%conv = zext i16 %0 to i32
	%1 = load i32, i32* %b, align 4
	%xor = xor i32 %1, %conv
	%and = and i32 %xor, 255
	%cmp = icmp eq i32 %and, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_xor8_int_int:
	; ARM: ldrb r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: teq r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_xor8_int_int:
	; ARMEB: ldrb r2, [r0, #3]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrb r1, [r1, #3]
	; ARMEB-NEXT: teq r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_xor8_int_int:
	; THUMB1: ldrb r0, [r0]
	; THUMB1-NEXT: ldrb r2, [r1]
	; THUMB1-NEXT: eors r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB2_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB2_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_xor8_int_int:
	; THUMB2: ldrb r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: teq.w r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%xor = xor i32 %1, %0
	%and = and i32 %xor, 255
	%cmp = icmp eq i32 %and, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_xor16:
	; ARM: ldrh r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrh r1, [r1]
	; ARM-NEXT: teq r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_xor16:
	; ARMEB: ldrh r2, [r0, #2]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrh r1, [r1, #2]
	; ARMEB-NEXT: teq r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_xor16:
	; THUMB1: ldrh r0, [r0]
	; THUMB1-NEXT: ldrh r2, [r1]
	; THUMB1-NEXT: eors r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB3_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB3_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_xor16:
	; THUMB2: ldrh r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrh r1, [r1]
	; THUMB2-NEXT: teq.w r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%xor = xor i32 %1, %0
	%and = and i32 %xor, 65535
	%cmp = icmp eq i32 %and, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a,
	i16* nocapture readonly %b) {
	; ARM-LABEL: cmp_or8_short_short:
	; ARM: ldrb r0, [r0]
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: orrs r0, r1, r0
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_or8_short_short:
	; ARMEB: ldrb r0, [r0, #1]
	; ARMEB-NEXT: ldrb r1, [r1, #1]
	; ARMEB-NEXT: orrs r0, r1, r0
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_or8_short_short:
	; THUMB1: ldrb r0, [r0]
	; THUMB1-NEXT: ldrb r2, [r1]
	; THUMB1-NEXT: orrs r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB4_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB4_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_or8_short_short:
	; THUMB2: ldrb r0, [r0]
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: mov.w r0, #0
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %a, align 2
	%1 = load i16, i16* %b, align 2
	%or2 = or i16 %1, %0
	%2 = and i16 %or2, 255
	%cmp = icmp eq i16 %2, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_or8_short_int:
	; ARM: ldrb r0, [r0]
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: orrs r0, r1, r0
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_or8_short_int:
	; ARMEB: ldrb r0, [r0, #1]
	; ARMEB-NEXT: ldrb r1, [r1, #3]
	; ARMEB-NEXT: orrs r0, r1, r0
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_or8_short_int:
	; THUMB1: ldrb r0, [r0]
	; THUMB1-NEXT: ldrb r2, [r1]
	; THUMB1-NEXT: orrs r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB5_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB5_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_or8_short_int:
	; THUMB2: ldrb r0, [r0]
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: mov.w r0, #0
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %a, align 2
	%conv = zext i16 %0 to i32
	%1 = load i32, i32* %b, align 4
	%or = or i32 %1, %conv
	%and = and i32 %or, 255
	%cmp = icmp eq i32 %and, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_or8_int_int:
	; ARM: ldrb r0, [r0]
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: orrs r0, r1, r0
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_or8_int_int:
	; ARMEB: ldrb r0, [r0, #3]
	; ARMEB-NEXT: ldrb r1, [r1, #3]
	; ARMEB-NEXT: orrs r0, r1, r0
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_or8_int_int:
	; THUMB1: ldrb r0, [r0]
	; THUMB1-NEXT: ldrb r2, [r1]
	; THUMB1-NEXT: orrs r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB6_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB6_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_or8_int_int:
	; THUMB2: ldrb r0, [r0]
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: mov.w r0, #0
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%or = or i32 %1, %0
	%and = and i32 %or, 255
	%cmp = icmp eq i32 %and, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_or16:
	; ARM: ldrh r0, [r0]
	; ARM-NEXT: ldrh r1, [r1]
	; ARM-NEXT: orrs r0, r1, r0
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_or16:
	; ARMEB: ldrh r0, [r0, #2]
	; ARMEB-NEXT: ldrh r1, [r1, #2]
	; ARMEB-NEXT: orrs r0, r1, r0
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_or16:
	; THUMB1: ldrh r0, [r0]
	; THUMB1-NEXT: ldrh r2, [r1]
	; THUMB1-NEXT: orrs r2, r0
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r2, #0
	; THUMB1-NEXT: beq .LBB7_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB7_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_or16:
	; THUMB2: ldrh r0, [r0]
	; THUMB2-NEXT: ldrh r1, [r1]
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: mov.w r0, #0
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%or = or i32 %1, %0
	%and = and i32 %or, 65535
	%cmp = icmp eq i32 %and, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a,
	i16* nocapture readonly %b) {
	; ARM-LABEL: cmp_and8_short_short:
	; ARM: ldrb r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: tst r2, r1
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_and8_short_short:
	; ARMEB: ldrb r2, [r0, #1]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrb r1, [r1, #1]
	; ARMEB-NEXT: tst r2, r1
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_and8_short_short:
	; THUMB1: ldrb r2, [r1]
	; THUMB1-NEXT: ldrb r3, [r0]
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: tst r3, r2
	; THUMB1-NEXT: beq .LBB8_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB8_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_and8_short_short:
	; THUMB2: ldrb r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: tst r2, r1
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %a, align 2
	%1 = load i16, i16* %b, align 2
	%and3 = and i16 %0, 255
	%2 = and i16 %and3, %1
	%cmp = icmp eq i16 %2, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_and8_short_int:
	; ARM: ldrb r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: tst r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_and8_short_int:
	; ARMEB: ldrb r2, [r0, #1]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrb r1, [r1, #3]
	; ARMEB-NEXT: tst r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_and8_short_int:
	; THUMB1: ldrb r2, [r0]
	; THUMB1-NEXT: ldrb r3, [r1]
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: tst r3, r2
	; THUMB1-NEXT: beq .LBB9_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB9_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_and8_short_int:
	; THUMB2: ldrb r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: tst r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %a, align 2
	%1 = load i32, i32* %b, align 4
	%2 = and i16 %0, 255
	%and = zext i16 %2 to i32
	%and1 = and i32 %1, %and
	%cmp = icmp eq i32 %and1, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_and8_int_int:
	; ARM: ldrb r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrb r1, [r1]
	; ARM-NEXT: tst r2, r1
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_and8_int_int:
	; ARMEB: ldrb r2, [r0, #3]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrb r1, [r1, #3]
	; ARMEB-NEXT: tst r2, r1
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_and8_int_int:
	; THUMB1: ldrb r2, [r1]
	; THUMB1-NEXT: ldrb r3, [r0]
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: tst r3, r2
	; THUMB1-NEXT: beq .LBB10_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB10_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_and8_int_int:
	; THUMB2: ldrb r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrb r1, [r1]
	; THUMB2-NEXT: tst r2, r1
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%and = and i32 %0, 255
	%and1 = and i32 %and, %1
	%cmp = icmp eq i32 %and1, 0
	ret i1 %cmp
	}

	define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a,
	i32* nocapture readonly %b) {
	; ARM-LABEL: cmp_and16:
	; ARM: ldrh r2, [r0]
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: ldrh r1, [r1]
	; ARM-NEXT: tst r2, r1
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: cmp_and16:
	; ARMEB: ldrh r2, [r0, #2]
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: ldrh r1, [r1, #2]
	; ARMEB-NEXT: tst r2, r1
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: cmp_and16:
	; THUMB1: ldrh r2, [r1]
	; THUMB1-NEXT: ldrh r3, [r0]
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: tst r3, r2
	; THUMB1-NEXT: beq .LBB11_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB11_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: cmp_and16:
	; THUMB2: ldrh r2, [r0]
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: ldrh r1, [r1]
	; THUMB2-NEXT: tst r2, r1
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%and = and i32 %0, 65535
	%and1 = and i32 %and, %1
	%cmp = icmp eq i32 %and1, 0
	ret i1 %cmp
	}

	define arm_aapcscc i32 @add_and16(i32* nocapture readonly %a, i32 %y, i32 %z) {
	; ARM-LABEL: add_and16:
	; ARM: add r1, r1, r2
	; ARM-NEXT: ldrh r0, [r0]
	; ARM-NEXT: uxth r1, r1
	; ARM-NEXT: orr r0, r0, r1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: add_and16:
	; ARMEB: add r1, r1, r2
	; ARMEB-NEXT: ldrh r0, [r0, #2]
	; ARMEB-NEXT: uxth r1, r1
	; ARMEB-NEXT: orr r0, r0, r1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: add_and16:
	; THUMB1: adds r1, r1, r2
	; THUMB1-NEXT: uxth r1, r1
	; THUMB1-NEXT: ldrh r0, [r0]
	; THUMB1-NEXT: orrs r0, r1
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: add_and16:
	; THUMB2: add r1, r2
	; THUMB2-NEXT: ldrh r0, [r0]
	; THUMB2-NEXT: uxth r1, r1
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: bx lr
	entry:
	%x = load i32, i32* %a, align 4
	%add = add i32 %y, %z
	%or = or i32 %x, %add
	%and = and i32 %or, 65535
	ret i32 %and
	}

	define arm_aapcscc i32 @test1(i32* %a, i32* %b, i32 %x, i32 %y) {
	; ARM-LABEL: test1:
	; ARM: mul r2, r2, r3
	; ARM-NEXT: ldrh r1, [r1]
	; ARM-NEXT: ldrh r0, [r0]
	; ARM-NEXT: eor r0, r0, r1
	; ARM-NEXT: uxth r1, r2
	; ARM-NEXT: orr r0, r0, r1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test1:
	; ARMEB: mul r2, r2, r3
	; ARMEB-NEXT: ldrh r1, [r1, #2]
	; ARMEB-NEXT: ldrh r0, [r0, #2]
	; ARMEB-NEXT: eor r0, r0, r1
	; ARMEB-NEXT: uxth r1, r2
	; ARMEB-NEXT: orr r0, r0, r1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test1:
	; THUMB1: ldrh r1, [r1]
	; THUMB1-NEXT: ldrh r4, [r0]
	; THUMB1-NEXT: eors r4, r1
	; THUMB1-NEXT: muls r2, r3, r2
	; THUMB1-NEXT: uxth r0, r2
	; THUMB1-NEXT: orrs r0, r4
	; THUMB1-NEXT: pop
	;
	; THUMB2-LABEL: test1:
	; THUMB2: ldrh r1, [r1]
	; THUMB2-NEXT: ldrh r0, [r0]
	; THUMB2-NEXT: eors r0, r1
	; THUMB2-NEXT: mul r1, r2, r3
	; THUMB2-NEXT: uxth r1, r1
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%mul = mul i32 %x, %y
	%xor = xor i32 %0, %1
	%or = or i32 %xor, %mul
	%and = and i32 %or, 65535
	ret i32 %and
	}

	define arm_aapcscc i32 @test2(i32* %a, i32* %b, i32 %x, i32 %y) {
	; ARM-LABEL: test2:
	; ARM: ldr r1, [r1]
	; ARM-NEXT: ldr r0, [r0]
	; ARM-NEXT: mul r1, r2, r1
	; ARM-NEXT: eor r0, r0, r3
	; ARM-NEXT: orr r0, r0, r1
	; ARM-NEXT: uxth r0, r0
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test2:
	; ARMEB: ldr r1, [r1]
	; ARMEB-NEXT: ldr r0, [r0]
	; ARMEB-NEXT: mul r1, r2, r1
	; ARMEB-NEXT: eor r0, r0, r3
	; ARMEB-NEXT: orr r0, r0, r1
	; ARMEB-NEXT: uxth r0, r0
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test2:
	; THUMB1: ldr r1, [r1]
	; THUMB1-NEXT: muls r1, r2, r1
	; THUMB1-NEXT: ldr r0, [r0]
	; THUMB1-NEXT: eors r0, r3
	; THUMB1-NEXT: orrs r0, r1
	; THUMB1-NEXT: uxth r0, r0
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test2:
	; THUMB2: ldr r1, [r1]
	; THUMB2-NEXT: ldr r0, [r0]
	; THUMB2-NEXT: muls r1, r2, r1
	; THUMB2-NEXT: eors r0, r3
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: uxth r0, r0
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%mul = mul i32 %x, %1
	%xor = xor i32 %0, %y
	%or = or i32 %xor, %mul
	%and = and i32 %or, 65535
	ret i32 %and
	}

	define arm_aapcscc i32 @test3(i32* %a, i32* %b, i32 %x, i16* %y) {
	; ARM-LABEL: test3:
	; ARM: ldr r0, [r0]
	; ARM-NEXT: mul r1, r2, r0
	; ARM-NEXT: ldrh r2, [r3]
	; ARM-NEXT: eor r0, r0, r2
	; ARM-NEXT: orr r0, r0, r1
	; ARM-NEXT: uxth r0, r0
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test3:
	; ARMEB: ldr r0, [r0]
	; ARMEB-NEXT: mul r1, r2, r0
	; ARMEB-NEXT: ldrh r2, [r3]
	; ARMEB-NEXT: eor r0, r0, r2
	; ARMEB-NEXT: orr r0, r0, r1
	; ARMEB-NEXT: uxth r0, r0
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test3:
	; THUMB1: ldr r0, [r0]
	; THUMB1-NEXT: muls r2, r0, r2
	; THUMB1-NEXT: ldrh r1, [r3]
	; THUMB1-NEXT: eors r1, r0
	; THUMB1-NEXT: orrs r1, r2
	; THUMB1-NEXT: uxth r0, r1
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test3:
	; THUMB2: ldr r0, [r0]
	; THUMB2-NEXT: mul r1, r2, r0
	; THUMB2-NEXT: ldrh r2, [r3]
	; THUMB2-NEXT: eors r0, r2
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: uxth r0, r0
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i16, i16* %y, align 4
	%2 = zext i16 %1 to i32
	%mul = mul i32 %x, %0
	%xor = xor i32 %0, %2
	%or = or i32 %xor, %mul
	%and = and i32 %or, 65535
	ret i32 %and
	}

	define arm_aapcscc i32 @test4(i32* %a, i32* %b, i32 %x, i32 %y) {
	; ARM-LABEL: test4:
	; ARM: mul r2, r2, r3
	; ARM-NEXT: ldrh r1, [r1]
	; ARM-NEXT: ldrh r0, [r0]
	; ARM-NEXT: eor r0, r0, r1
	; ARM-NEXT: uxth r1, r2
	; ARM-NEXT: orr r0, r0, r1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test4:
	; ARMEB: mul r2, r2, r3
	; ARMEB-NEXT: ldrh r1, [r1, #2]
	; ARMEB-NEXT: ldrh r0, [r0, #2]
	; ARMEB-NEXT: eor r0, r0, r1
	; ARMEB-NEXT: uxth r1, r2
	; ARMEB-NEXT: orr r0, r0, r1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test4:
	; THUMB1: ldrh r1, [r1]
	; THUMB1-NEXT: ldrh r4, [r0]
	; THUMB1-NEXT: eors r4, r1
	; THUMB1-NEXT: muls r2, r3, r2
	; THUMB1-NEXT: uxth r0, r2
	; THUMB1-NEXT: orrs r0, r4
	; THUMB1-NEXT: pop
	;
	; THUMB2-LABEL: test4:
	; THUMB2: ldrh r1, [r1]
	; THUMB2-NEXT: ldrh r0, [r0]
	; THUMB2-NEXT: eors r0, r1
	; THUMB2-NEXT: mul r1, r2, r3
	; THUMB2-NEXT: uxth r1, r1
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%mul = mul i32 %x, %y
	%xor = xor i32 %0, %1
	%or = or i32 %xor, %mul
	%and = and i32 %or, 65535
	ret i32 %and
	}

	define arm_aapcscc i32 @test5(i32* %a, i32* %b, i32 %x, i16 zeroext %y) {
	; ARM-LABEL: test5:
	; ARM: ldr r1, [r1]
	; ARM-NEXT: ldrh r0, [r0]
	; ARM-NEXT: mul r1, r2, r1
	; ARM-NEXT: eor r0, r0, r3
	; ARM-NEXT: uxth r1, r1
	; ARM-NEXT: orr r0, r0, r1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test5:
	; ARMEB: ldr r1, [r1]
	; ARMEB-NEXT: ldrh r0, [r0, #2]
	; ARMEB-NEXT: mul r1, r2, r1
	; ARMEB-NEXT: eor r0, r0, r3
	; ARMEB-NEXT: uxth r1, r1
	; ARMEB-NEXT: orr r0, r0, r1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test5:
	; THUMB1: ldrh r4, [r0]
	; THUMB1-NEXT: eors r4, r3
	; THUMB1-NEXT: ldr r0, [r1]
	; THUMB1-NEXT: muls r0, r2, r0
	; THUMB1-NEXT: uxth r0, r0
	; THUMB1-NEXT: orrs r0, r4
	; THUMB1-NEXT: pop
	;
	; THUMB2-LABEL: test5:
	; THUMB2: ldr r1, [r1]
	; THUMB2-NEXT: ldrh r0, [r0]
	; THUMB2-NEXT: muls r1, r2, r1
	; THUMB2-NEXT: eors r0, r3
	; THUMB2-NEXT: uxth r1, r1
	; THUMB2-NEXT: orrs r0, r1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %a, align 4
	%1 = load i32, i32* %b, align 4
	%mul = mul i32 %x, %1
	%ext = zext i16 %y to i32
	%xor = xor i32 %0, %ext
	%or = or i32 %xor, %mul
	%and = and i32 %or, 65535
	ret i32 %and
	}

	define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
	; ARM-LABEL: test6:
	; ARM: @ %bb.0: @ %entry
	; ARM-NEXT: ldrb r0, [r0]
	; ARM-NEXT: uxtb r2, r2
	-; ARM-NEXT: and r0, r0, r1
	-; ARM-NEXT: uxtb r1, r0
	+; ARM-NEXT: and r1, r0, r1
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: cmp r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test6:
	; ARMEB: @ %bb.0: @ %entry
	; ARMEB-NEXT: ldrb r0, [r0]
	; ARMEB-NEXT: uxtb r2, r2
	-; ARMEB-NEXT: and r0, r0, r1
	-; ARMEB-NEXT: uxtb r1, r0
	+; ARMEB-NEXT: and r1, r0, r1
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: cmp r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test6:
	; THUMB1: @ %bb.0: @ %entry
	-; THUMB1-NEXT: ldrb r0, [r0]
	-; THUMB1-NEXT: ands r0, r1
	-; THUMB1-NEXT: uxtb r3, r0
	+; THUMB1-NEXT: ldrb r3, [r0]
	+; THUMB1-NEXT: ands r3, r1
	; THUMB1-NEXT: uxtb r2, r2
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r3, r2
	; THUMB1-NEXT: beq .LBB18_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB18_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test6:
	; THUMB2: @ %bb.0: @ %entry
	; THUMB2-NEXT: ldrb r0, [r0]
	; THUMB2-NEXT: uxtb r2, r2
	-; THUMB2-NEXT: ands r0, r1
	-; THUMB2-NEXT: uxtb r1, r0
	+; THUMB2-NEXT: ands r1, r0
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: cmp r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i8, i8* %x, align 4
	%1 = and i8 %0, %y
	%2 = icmp eq i8 %1, %z
	ret i1 %2
	}

	define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
	; ARM-LABEL: test7:
	; ARM: @ %bb.0: @ %entry
	; ARM-NEXT: ldrb r0, [r0]
	; ARM-NEXT: uxtb r2, r2
	; ARM-NEXT: and r1, r0, r1
	; ARM-NEXT: mov r0, #0
	; ARM-NEXT: cmp r1, r2
	; ARM-NEXT: movweq r0, #1
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test7:
	; ARMEB: @ %bb.0: @ %entry
	; ARMEB-NEXT: ldrb r0, [r0, #1]
	; ARMEB-NEXT: uxtb r2, r2
	; ARMEB-NEXT: and r1, r0, r1
	; ARMEB-NEXT: mov r0, #0
	; ARMEB-NEXT: cmp r1, r2
	; ARMEB-NEXT: movweq r0, #1
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test7:
	; THUMB1: @ %bb.0: @ %entry
	; THUMB1-NEXT: ldrb r3, [r0]
	; THUMB1-NEXT: ands r3, r1
	; THUMB1-NEXT: uxtb r2, r2
	; THUMB1-NEXT: movs r0, #1
	; THUMB1-NEXT: movs r1, #0
	; THUMB1-NEXT: cmp r3, r2
	; THUMB1-NEXT: beq .LBB19_2
	; THUMB1-NEXT: @ %bb.1: @ %entry
	; THUMB1-NEXT: mov r0, r1
	; THUMB1-NEXT: .LBB19_2: @ %entry
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test7:
	; THUMB2: @ %bb.0: @ %entry
	; THUMB2-NEXT: ldrb r0, [r0]
	; THUMB2-NEXT: uxtb r2, r2
	; THUMB2-NEXT: ands r1, r0
	; THUMB2-NEXT: movs r0, #0
	; THUMB2-NEXT: cmp r1, r2
	; THUMB2-NEXT: it eq
	; THUMB2-NEXT: moveq r0, #1
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i16, i16* %x, align 4
	%1 = and i16 %0, %y
	%2 = trunc i16 %1 to i8
	%3 = icmp eq i8 %2, %z
	ret i1 %3
	}

	define arm_aapcscc void @test8(i32* nocapture %p) {
	; ARM-LABEL: test8:
	; ARM: @ %bb.0: @ %entry
	; ARM-NEXT: ldrb r1, [r0]
	; ARM-NEXT: eor r1, r1, #255
	; ARM-NEXT: str r1, [r0]
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test8:
	; ARMEB: @ %bb.0: @ %entry
	; ARMEB-NEXT: ldrb r1, [r0, #3]
	; ARMEB-NEXT: eor r1, r1, #255
	; ARMEB-NEXT: str r1, [r0]
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test8:
	; THUMB1: @ %bb.0: @ %entry
	; THUMB1-NEXT: ldrb r1, [r0]
	; THUMB1-NEXT: movs r2, #255
	; THUMB1-NEXT: eors r2, r1
	; THUMB1-NEXT: str r2, [r0]
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test8:
	; THUMB2: @ %bb.0: @ %entry
	; THUMB2-NEXT: ldrb r1, [r0]
	; THUMB2-NEXT: eor r1, r1, #255
	; THUMB2-NEXT: str r1, [r0]
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %p, align 4
	%neg = and i32 %0, 255
	%and = xor i32 %neg, 255
	store i32 %and, i32* %p, align 4
	ret void
	}

	define arm_aapcscc void @test9(i32* nocapture %p) {
	; ARM-LABEL: test9:
	; ARM: @ %bb.0: @ %entry
	; ARM-NEXT: ldrb r1, [r0]
	; ARM-NEXT: eor r1, r1, #255
	; ARM-NEXT: str r1, [r0]
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test9:
	; ARMEB: @ %bb.0: @ %entry
	; ARMEB-NEXT: ldrb r1, [r0, #3]
	; ARMEB-NEXT: eor r1, r1, #255
	; ARMEB-NEXT: str r1, [r0]
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test9:
	; THUMB1: @ %bb.0: @ %entry
	; THUMB1-NEXT: ldrb r1, [r0]
	; THUMB1-NEXT: movs r2, #255
	; THUMB1-NEXT: eors r2, r1
	; THUMB1-NEXT: str r2, [r0]
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test9:
	; THUMB2: @ %bb.0: @ %entry
	; THUMB2-NEXT: ldrb r1, [r0]
	; THUMB2-NEXT: eor r1, r1, #255
	; THUMB2-NEXT: str r1, [r0]
	; THUMB2-NEXT: bx lr
	entry:
	%0 = load i32, i32* %p, align 4
	%neg = xor i32 %0, -1
	%and = and i32 %neg, 255
	store i32 %and, i32* %p, align 4
	ret void
	}

	; ARM-LABEL: test10:
	; ARM: @ %bb.0: @ %entry
	; ARM-NEXT: ldrb r1, [r0]
	; ARM-NEXT: eor r1, r1, #255
	; ARM-NEXT: str r1, [r0]
	; ARM-NEXT: bx lr
	;
	; ARMEB-LABEL: test10:
	; ARMEB: @ %bb.0: @ %entry
	; ARMEB-NEXT: ldrb r1, [r0, #3]
	; ARMEB-NEXT: eor r1, r1, #255
	; ARMEB-NEXT: str r1, [r0]
	; ARMEB-NEXT: bx lr
	;
	; THUMB1-LABEL: test10:
	; THUMB1: @ %bb.0: @ %entry
	; THUMB1-NEXT: ldrb r1, [r0]
	; THUMB1-NEXT: movs r2, #255
	; THUMB1-NEXT: eors r2, r1
	; THUMB1-NEXT: str r2, [r0]
	; THUMB1-NEXT: bx lr
	;
	; THUMB2-LABEL: test10:
	; THUMB2: @ %bb.0: @ %entry
	; THUMB2-NEXT: ldrb r1, [r0]
	; THUMB2-NEXT: eor r1, r1, #255
	; THUMB2-NEXT: str r1, [r0]
	; THUMB2-NEXT: bx lr
	define arm_aapcscc void @test10(i32* nocapture %p) {
	entry:
	%0 = load i32, i32* %p, align 4
	%neg = and i32 %0, 255
	%and = xor i32 %neg, 255
	store i32 %and, i32* %p, align 4
	ret void
	}

	Index: vendor/llvm/dist-release_60/test/CodeGen/ARM/atomic-cmpxchg.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/ARM/atomic-cmpxchg.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/ARM/atomic-cmpxchg.ll (revision 328362)
	@@ -1,93 +1,94 @@
	; RUN: llc < %s -mtriple=arm-linux-gnueabi -asm-verbose=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK-ARM
	; RUN: llc < %s -mtriple=thumb-linux-gnueabi -asm-verbose=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK-THUMB

	; RUN: llc < %s -mtriple=armv6-linux-gnueabi -asm-verbose=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK-ARMV6
	; RUN: llc < %s -mtriple=thumbv6-linux-gnueabi -asm-verbose=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK-THUMBV6

	; RUN: llc < %s -mtriple=armv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK-ARMV7
	; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK-THUMBV7

	define zeroext i1 @test_cmpxchg_res_i8(i8* %addr, i8 %desired, i8 zeroext %new) {
	entry:
	%0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic
	%1 = extractvalue { i8, i1 } %0, 1
	ret i1 %1
	}

	; CHECK-ARM-LABEL: test_cmpxchg_res_i8
	; CHECK-ARM: bl __sync_val_compare_and_swap_1
	; CHECK-ARM: mov [[REG:r[0-9]+]], #0
	; CHECK-ARM: cmp r0, {{r[0-9]+}}
	; CHECK-ARM: moveq [[REG]], #1
	; CHECK-ARM: mov r0, [[REG]]

	; CHECK-THUMB-LABEL: test_cmpxchg_res_i8
	; CHECK-THUMB: bl __sync_val_compare_and_swap_1
	; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
	; CHECK-THUMB: movs [[R1:r[0-7]]], r0
	; CHECK-THUMB: movs r0, #1
	; CHECK-THUMB: movs [[R2:r[0-9]+]], #0
	; CHECK-THUMB: cmp [[R1]], {{r[0-9]+}}
	; CHECK-THUMB: beq
	; CHECK-THUMB: movs r0, [[R2]]

	; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8:
	; CHECK-ARMV6-NEXT: .fnstart
	; CHECK-ARMV6-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-ARMV6-NEXT: [[TRY:.LBB[0-9_]+]]:
	; CHECK-ARMV6-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
	; CHECK-ARMV6-NEXT: cmp [[LD]], [[DESIRED]]
	; CHECK-ARMV6-NEXT: movne [[RES:r[0-9]+]], #0
	; CHECK-ARMV6-NEXT: bxne lr
	; CHECK-ARMV6-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	; CHECK-ARMV6-NEXT: cmp [[SUCCESS]], #0
	; CHECK-ARMV6-NEXT: moveq [[RES]], #1
	; CHECK-ARMV6-NEXT: bxeq lr
	; CHECK-ARMV6-NEXT: b [[TRY]]

	; CHECK-THUMBV6-LABEL: test_cmpxchg_res_i8:
	; CHECK-THUMBV6: mov [[EXPECTED:r[0-9]+]], r1
	; CHECK-THUMBV6-NEXT: bl __sync_val_compare_and_swap_1
	; CHECK-THUMBV6-NEXT: mov [[RES:r[0-9]+]], r0
	+; CHECK-THUMBV6-NEXT: uxtb [[EXPECTED_ZEXT:r[0-9]+]], [[EXPECTED]]
	; CHECK-THUMBV6-NEXT: movs r0, #1
	; CHECK-THUMBV6-NEXT: movs [[ZERO:r[0-9]+]], #0
	-; CHECK-THUMBV6-NEXT: cmp [[RES]], [[EXPECTED]]
	+; CHECK-THUMBV6-NEXT: cmp [[RES]], [[EXPECTED_ZEXT]]
	; CHECK-THUMBV6-NEXT: beq [[END:.LBB[0-9_]+]]
	; CHECK-THUMBV6-NEXT: mov r0, [[ZERO]]
	; CHECK-THUMBV6-NEXT: [[END]]:
	; CHECK-THUMBV6-NEXT: pop {{.*}}pc}

	; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
	; CHECK-ARMV7-NEXT: .fnstart
	; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-ARMV7-NEXT: b [[TRY:.LBB[0-9_]+]]
	; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
	; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
	; CHECK-ARMV7-NEXT: moveq r0, #1
	; CHECK-ARMV7-NEXT: bxeq lr
	; CHECK-ARMV7-NEXT: [[TRY]]:
	; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
	; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
	; CHECK-ARMV7-NEXT: beq [[HEAD]]
	; CHECK-ARMV7-NEXT: mov r0, #0
	; CHECK-ARMV7-NEXT: clrex
	; CHECK-ARMV7-NEXT: bx lr

	; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
	; CHECK-THUMBV7-NEXT: .fnstart
	; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]
	; CHECK-THUMBV7-NEXT: [[TRYST:.LBB[0-9_]+]]:
	; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	; CHECK-THUMBV7-NEXT: cmp [[SUCCESS]], #0
	; CHECK-THUMBV7-NEXT: itt eq
	; CHECK-THUMBV7-NEXT: moveq r0, #1
	; CHECK-THUMBV7-NEXT: bxeq lr
	; CHECK-THUMBV7-NEXT: [[TRYLD]]:
	; CHECK-THUMBV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
	; CHECK-THUMBV7-NEXT: cmp [[LD]], [[DESIRED]]
	; CHECK-THUMBV7-NEXT: beq [[TRYST:.LBB[0-9_]+]]
	; CHECK-THUMBV7-NEXT: movs r0, #0
	; CHECK-THUMBV7-NEXT: clrex
	; CHECK-THUMBV7-NEXT: bx lr
	Index: vendor/llvm/dist-release_60/test/CodeGen/ARM/cmpxchg-O0.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/ARM/cmpxchg-O0.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/ARM/cmpxchg-O0.ll (revision 328362)
	@@ -1,113 +1,115 @@
	; RUN: llc -verify-machineinstrs -mtriple=armv7-linux-gnu -O0 %s -o - \| FileCheck %s
	; RUN: llc -verify-machineinstrs -mtriple=thumbv8-linux-gnu -O0 %s -o - \| FileCheck %s
	; RUN: llc -verify-machineinstrs -mtriple=thumbv6m-none-eabi -O0 %s -o - \| FileCheck %s --check-prefix=CHECK-T1

	; CHECK-T1-NOT: ldrex
	; CHECK-T1-NOT: strex

	define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
	; CHECK-LABEL: test_cmpxchg_8:
	; CHECK: dmb ish
	; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]]
	; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
	; CHECK: ldrexb [[OLD:r[0-9]+]], [r0]
	; CHECK: cmp [[OLD]], [[DESIRED]]
	; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
	; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0]
	; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
	; CHECK: bne [[RETRY]]
	; CHECK: [[DONE]]:
	-; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
	+; CHECK: uxtb [[DESIRED_ZEXT:r[0-9]+]], [[DESIRED]]
	+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED_ZEXT]]
	; CHECK: {{moveq\|movweq}} {{r[0-9]+}}, #1
	; CHECK: dmb ish
	%res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic
	ret { i8, i1 } %res
	}

	define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind {
	; CHECK-LABEL: test_cmpxchg_16:
	; CHECK: dmb ish
	; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]]
	; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
	; CHECK: ldrexh [[OLD:r[0-9]+]], [r0]
	; CHECK: cmp [[OLD]], [[DESIRED]]
	; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
	; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0]
	; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
	; CHECK: bne [[RETRY]]
	; CHECK: [[DONE]]:
	-; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
	+; CHECK: uxth [[DESIRED_ZEXT:r[0-9]+]], [[DESIRED]]
	+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED_ZEXT]]
	; CHECK: {{moveq\|movweq}} {{r[0-9]+}}, #1
	; CHECK: dmb ish
	%res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic
	ret { i16, i1 } %res
	}

	define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind {
	; CHECK-LABEL: test_cmpxchg_32:
	; CHECK: dmb ish
	; CHECK-NOT: uxt
	; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
	; CHECK: ldrex [[OLD:r[0-9]+]], [r0]
	; CHECK: cmp [[OLD]], [[DESIRED]]
	; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
	; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
	; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
	; CHECK: bne [[RETRY]]
	; CHECK: [[DONE]]:
	; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
	; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
	; CHECK: dmb ish
	%res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
	ret { i32, i1 } %res
	}

	define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind {
	; CHECK-LABEL: test_cmpxchg_64:
	; CHECK: dmb ish
	; CHECK-NOT: uxt
	; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
	; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0]
	; CHECK: cmp [[OLDLO]], r6
	; CHECK: cmpeq [[OLDHI]], r7
	; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
	; CHECK: strexd [[STATUS:r[0-9]+]], r4, r5, [r0]
	; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
	; CHECK: bne [[RETRY]]
	; CHECK: [[DONE]]:
	; CHECK: dmb ish
	%res = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst monotonic
	ret { i64, i1 } %res
	}

	define { i64, i1 } @test_nontrivial_args(i64* %addr, i64 %desired, i64 %new) {
	; CHECK-LABEL: test_nontrivial_args:
	; CHECK: dmb ish
	; CHECK-NOT: uxt
	; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
	; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0]
	; CHECK: cmp [[OLDLO]], {{r[0-9]+}}
	; CHECK: cmpeq [[OLDHI]], {{r[0-9]+}}
	; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
	; CHECK: strexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r0]
	; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
	; CHECK: bne [[RETRY]]
	; CHECK: [[DONE]]:
	; CHECK: dmb ish

	%desired1 = add i64 %desired, 1
	%new1 = add i64 %new, 1
	%res = cmpxchg i64* %addr, i64 %desired1, i64 %new1 seq_cst seq_cst
	ret { i64, i1 } %res
	}

	; The following used to trigger an assertion when creating a spill on thumb2
	; for a physreg with RC==GPRPairRegClass.
	; CHECK-LABEL: test_cmpxchg_spillbug:
	; CHECK: ldrexd
	; CHECK: strexd
	; CHECK: bne
	define void @test_cmpxchg_spillbug() {
	%v = cmpxchg i64* undef, i64 undef, i64 undef seq_cst seq_cst
	ret void
	}
	Index: vendor/llvm/dist-release_60/test/CodeGen/ARM/global-merge-dllexport.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/ARM/global-merge-dllexport.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/ARM/global-merge-dllexport.ll (revision 328362)
	@@ -0,0 +1,15 @@
	+; RUN: llc < %s -mtriple=thumbv7-win32 -arm-global-merge | FileCheck %s
	+
	+@x = global i32 0, align 4
	+@y = dllexport global i32 0, align 4
	+
	+define void @f1(i32 %a1, i32 %a2) {
	+; CHECK: f1:
	+; CHECK: movw [[REG1:r[0-9]+]], :lower16:x
	+; CHECK: movt [[REG1]], :upper16:x
	+ store i32 %a1, i32* @x, align 4
	+ store i32 %a2, i32* @y, align 4
	+ ret void
	+}
	+
	+; CHECK-NOT: .L_MergedGlobals
	Index: vendor/llvm/dist-release_60/test/CodeGen/ARM/global-merge-external.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/ARM/global-merge-external.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/ARM/global-merge-external.ll (revision 328362)
	@@ -1,47 +1,62 @@
	-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge | FileCheck %s --check-prefix=CHECK-MERGE
	-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=true | FileCheck %s --check-prefix=CHECK-MERGE
	-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefix=CHECK-NO-MERGE
	-; RUN: llc < %s -mtriple=arm-macho -arm-global-merge | FileCheck %s --check-prefix=CHECK-NO-MERGE
	-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -relocation-model=pic | FileCheck %s --check-prefix=CHECK-NO-MERGE
	+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge | FileCheck %s --check-prefixes=CHECK,CHECK-MERGE
	+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=true | FileCheck %s --check-prefixes=CHECK,CHECK-MERGE
	+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
	+; RUN: llc < %s -mtriple=arm-macho -arm-global-merge | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
	+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -relocation-model=pic | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
	+; RUN: llc < %s -mtriple=thumbv7-win32 -arm-global-merge | FileCheck %s --check-prefixes=CHECK-WIN32

	@x = global i32 0, align 4
	@y = global i32 0, align 4
	@z = global i32 0, align 4

	define void @f1(i32 %a1, i32 %a2) {
	;CHECK: f1:
	-;CHECK: ldr {{r[0-9]+}}, [[LABEL1:\.LCPI[0-9]+_[0-9]]]
	+;CHECK: ldr {{r[0-9]+}}, [[LABEL1:\.?LCPI[0-9]+_[0-9]]]
	;CHECK: [[LABEL1]]:
	;CHECK-MERGE: .long .L_MergedGlobals
	;CHECK-NO-MERGE: .long {{_?x}}
	+;CHECK-WIN32: f1:
	+;CHECK-WIN32: movw [[REG1:r[0-9]+]], :lower16:.L_MergedGlobals
	+;CHECK-WIN32: movt [[REG1]], :upper16:.L_MergedGlobals
	store i32 %a1, i32* @x, align 4
	store i32 %a2, i32* @y, align 4
	ret void
	}

	define void @g1(i32 %a1, i32 %a2) {
	;CHECK: g1:
	-;CHECK: ldr {{r[0-9]+}}, [[LABEL2:\.LCPI[0-9]+_[0-9]]]
	+;CHECK: ldr {{r[0-9]+}}, [[LABEL2:\.?LCPI[0-9]+_[0-9]]]
	;CHECK: [[LABEL2]]:
	;CHECK-MERGE: .long .L_MergedGlobals
	;CHECK-NO-MERGE: .long {{_?y}}
	+;CHECK-WIN32: g1:
	+;CHECK-WIN32: movw [[REG2:r[0-9]+]], :lower16:.L_MergedGlobals
	+;CHECK-WIN32: movt [[REG2]], :upper16:.L_MergedGlobals
	store i32 %a1, i32* @y, align 4
	store i32 %a2, i32* @z, align 4
	ret void
	}

	;CHECK-NO-MERGE-NOT: .globl .L_MergedGlobals

	;CHECK-MERGE: .type .L_MergedGlobals,%object
	;CHECK-MERGE: .local .L_MergedGlobals
	;CHECK-MERGE: .comm .L_MergedGlobals,12,4
	+;CHECK-WIN32: .lcomm .L_MergedGlobals,12,4

	;CHECK-MERGE: .globl x
	;CHECK-MERGE: x = .L_MergedGlobals
	;CHECK-MERGE: .size x, 4
	;CHECK-MERGE: .globl y
	;CHECK-MERGE: y = .L_MergedGlobals+4
	;CHECK-MERGE: .size y, 4
	;CHECK-MERGE: .globl z
	;CHECK-MERGE: z = .L_MergedGlobals+8
	;CHECK-MERGE: .size z, 4
	+
	+;CHECK-WIN32: .globl x
	+;CHECK-WIN32: x = .L_MergedGlobals
	+;CHECK-WIN32: .globl y
	+;CHECK-WIN32: y = .L_MergedGlobals+4
	+;CHECK-WIN32: .globl z
	+;CHECK-WIN32: z = .L_MergedGlobals+8
	Index: vendor/llvm/dist-release_60/test/CodeGen/ARM/peephole-phi.mir
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/ARM/peephole-phi.mir (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/ARM/peephole-phi.mir (revision 328362)
	@@ -0,0 +1,67 @@
	+# RUN: llc -o - %s -mtriple=armv7-- -verify-machineinstrs -run-pass=peephole-opt | FileCheck %s
	+#
	+# Make sure we do not crash on this input.
	+# Note that this input could in principle be optimized, but right now we don't
	+# have this case implemented so the output should simply be unchanged.
	+#
	+# CHECK-LABEL: name: func0
	+# CHECK: body: |
	+# CHECK: bb.0:
	+# CHECK: Bcc %bb.2, 1, undef %cpsr
	+#
	+# CHECK: bb.1:
	+# CHECK: %0:dpr = IMPLICIT_DEF
	+# CHECK: %1:gpr, %2:gpr = VMOVRRD %0, 14, %noreg
	+# CHECK: B %bb.3
	+#
	+# CHECK: bb.2:
	+# CHECK: %3:spr = IMPLICIT_DEF
	+# CHECK: %4:gpr = VMOVRS %3, 14, %noreg
	+#
	+# CHECK: bb.3:
	+# CHECK: %5:gpr = PHI %1, %bb.1, %4, %bb.2
	+# CHECK: %6:spr = VMOVSR %5, 14, %noreg
	+---
	+name: func0
	+tracksRegLiveness: true
	+body: |
	+ bb.0:
	+ Bcc %bb.2, 1, undef %cpsr
	+
	+ bb.1:
	+ %0:dpr = IMPLICIT_DEF
	+ %1:gpr, %2:gpr = VMOVRRD %0:dpr, 14, %noreg
	+ B %bb.3
	+
	+ bb.2:
	+ %3:spr = IMPLICIT_DEF
	+ %4:gpr = VMOVRS %3:spr, 14, %noreg
	+
	+ bb.3:
	+ %5:gpr = PHI %1, %bb.1, %4, %bb.2
	+ %6:spr = VMOVSR %5, 14, %noreg
	+...
	+
	+# CHECK-LABEL: name: func1
	+# CHECK: %6:spr = PHI %0, %bb.1, %2, %bb.2
	+# CHECK: %7:spr = COPY %6
	+---
	+name: func1
	+tracksRegLiveness: true
	+body: |
	+ bb.0:
	+ Bcc %bb.2, 1, undef %cpsr
	+
	+ bb.1:
	+ %1:spr = IMPLICIT_DEF
	+ %0:gpr = VMOVRS %1, 14, %noreg
	+ B %bb.3
	+
	+ bb.2:
	+ %3:spr = IMPLICIT_DEF
	+ %2:gpr = VMOVRS %3:spr, 14, %noreg
	+
	+ bb.3:
	+ %4:gpr = PHI %0, %bb.1, %2, %bb.2
	+ %5:spr = VMOVSR %4, 14, %noreg
	+...
	Index: vendor/llvm/dist-release_60/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll (revision 328362)
	@@ -0,0 +1,94 @@
	+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	+; Make sure that a negative value for the compare-and-swap is zero extended
	+; from i8/i16 to i32 since it will be compared for equality.
	+; RUN: llc -mtriple=powerpc64le-linux-gnu -verify-machineinstrs < %s | FileCheck %s
	+; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=CHECK-P7
	+
	+@str = private unnamed_addr constant [46 x i8] c"FAILED: __atomic_compare_exchange_n() failed.\00"
	+@str.1 = private unnamed_addr constant [59 x i8] c"FAILED: __atomic_compare_exchange_n() set the wrong value.\00"
	+@str.2 = private unnamed_addr constant [7 x i8] c"PASSED\00"
	+
	+define signext i32 @main() {
	+; CHECK-LABEL: main:
	+; CHECK: li 3, -32477
	+; CHECK: lis 12, 0
	+; CHECK: li 6, 234
	+; CHECK: sth 3, 46(1)
	+; CHECK: ori 4, 12, 33059
	+; CHECK: sync
	+; CHECK: .LBB0_1: # %L.entry
	+; CHECK: lharx 3, 0, 5
	+; CHECK: cmpw 4, 3
	+; CHECK: bne 0, .LBB0_3
	+; CHECK: sthcx. 6, 0, 5
	+; CHECK: bne 0, .LBB0_1
	+; CHECK: b .LBB0_4
	+; CHECK: .LBB0_3: # %L.entry
	+; CHECK: sthcx. 3, 0, 5
	+; CHECK: .LBB0_4: # %L.entry
	+; CHECK: cmplwi 3, 33059
	+; CHECK: lwsync
	+; CHECK: lhz 3, 46(1)
	+; CHECK: cmplwi 3, 234
	+;
	+; CHECK-P7-LABEL: main:
	+; CHECK-P7: lis 4, 0
	+; CHECK-P7: li 7, 0
	+; CHECK-P7: li 3, -32477
	+; CHECK-P7: sth 3, 46(1)
	+; CHECK-P7: li 5, 234
	+; CHECK-P7: ori 4, 4, 33059
	+; CHECK-P7: rlwinm 3, 6, 3, 27, 27
	+; CHECK-P7: ori 7, 7, 65535
	+; CHECK-P7: sync
	+; CHECK-P7: slw 8, 5, 3
	+; CHECK-P7: slw 5, 7, 3
	+; CHECK-P7: slw 9, 4, 3
	+; CHECK-P7: and 7, 8, 5
	+; CHECK-P7: rldicr 4, 6, 0, 61
	+; CHECK-P7: and 8, 9, 5
	+; CHECK-P7: .LBB0_1: # %L.entry
	+; CHECK-P7: lwarx 9, 0, 4
	+; CHECK-P7: and 6, 9, 5
	+; CHECK-P7: cmpw 0, 6, 8
	+; CHECK-P7: bne 0, .LBB0_3
	+; CHECK-P7: andc 9, 9, 5
	+; CHECK-P7: or 9, 9, 7
	+; CHECK-P7: stwcx. 9, 0, 4
	+; CHECK-P7: bne 0, .LBB0_1
	+; CHECK-P7: b .LBB0_4
	+; CHECK-P7: .LBB0_3: # %L.entry
	+; CHECK-P7: stwcx. 9, 0, 4
	+; CHECK-P7: .LBB0_4: # %L.entry
	+; CHECK-P7: srw 3, 6, 3
	+; CHECK-P7: lwsync
	+; CHECK-P7: cmplwi 3, 33059
	+; CHECK-P7: lhz 3, 46(1)
	+; CHECK-P7: cmplwi 3, 234
	+L.entry:
	+ %value.addr = alloca i16, align 2
	+ store i16 -32477, i16* %value.addr, align 2
	+ %0 = cmpxchg i16* %value.addr, i16 -32477, i16 234 seq_cst seq_cst
	+ %1 = extractvalue { i16, i1 } %0, 1
	+ br i1 %1, label %L.B0000, label %L.B0003
	+
	+L.B0003: ; preds = %L.entry
	+ %puts = call i32 @puts(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @str, i64 0, i64 0))
	+ ret i32 1
	+
	+L.B0000: ; preds = %L.entry
	+ %2 = load i16, i16* %value.addr, align 2
	+ %3 = icmp eq i16 %2, 234
	+ br i1 %3, label %L.B0001, label %L.B0005
	+
	+L.B0005: ; preds = %L.B0000
	+ %puts1 = call i32 @puts(i8* getelementptr inbounds ([59 x i8], [59 x i8]* @str.1, i64 0, i64 0))
	+ ret i32 1
	+
	+L.B0001: ; preds = %L.B0000
	+ %puts2 = call i32 @puts(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @str.2, i64 0, i64 0))
	+ ret i32 0
	+}
	+
	+; Function Attrs: nounwind
	+declare i32 @puts(i8* nocapture readonly) #0
	Index: vendor/llvm/dist-release_60/test/CodeGen/PowerPC/atomics-regression.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/PowerPC/atomics-regression.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/PowerPC/atomics-regression.ll (revision 328362)
	@@ -1,9570 +1,9610 @@
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=powerpc64le-linux-gnu < %s | FileCheck %s -check-prefix=PPC64LE

	define i8 @test0(i8* %ptr) {
	; PPC64LE-LABEL: test0:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lbz 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i8, i8* %ptr unordered, align 1
	ret i8 %val
	}

	define i8 @test1(i8* %ptr) {
	; PPC64LE-LABEL: test1:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lbz 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i8, i8* %ptr monotonic, align 1
	ret i8 %val
	}

	define i8 @test2(i8* %ptr) {
	; PPC64LE-LABEL: test2:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lbz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i8, i8* %ptr acquire, align 1
	ret i8 %val
	}

	define i8 @test3(i8* %ptr) {
	; PPC64LE-LABEL: test3:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: ori 2, 2, 0
	; PPC64LE-NEXT: lbz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i8, i8* %ptr seq_cst, align 1
	ret i8 %val
	}

	define i16 @test4(i16* %ptr) {
	; PPC64LE-LABEL: test4:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lhz 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i16, i16* %ptr unordered, align 2
	ret i16 %val
	}

	define i16 @test5(i16* %ptr) {
	; PPC64LE-LABEL: test5:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lhz 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i16, i16* %ptr monotonic, align 2
	ret i16 %val
	}

	define i16 @test6(i16* %ptr) {
	; PPC64LE-LABEL: test6:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lhz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i16, i16* %ptr acquire, align 2
	ret i16 %val
	}

	define i16 @test7(i16* %ptr) {
	; PPC64LE-LABEL: test7:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: ori 2, 2, 0
	; PPC64LE-NEXT: lhz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i16, i16* %ptr seq_cst, align 2
	ret i16 %val
	}

	define i32 @test8(i32* %ptr) {
	; PPC64LE-LABEL: test8:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwz 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i32, i32* %ptr unordered, align 4
	ret i32 %val
	}

	define i32 @test9(i32* %ptr) {
	; PPC64LE-LABEL: test9:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwz 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i32, i32* %ptr monotonic, align 4
	ret i32 %val
	}

	define i32 @test10(i32* %ptr) {
	; PPC64LE-LABEL: test10:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i32, i32* %ptr acquire, align 4
	ret i32 %val
	}

	define i32 @test11(i32* %ptr) {
	; PPC64LE-LABEL: test11:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: ori 2, 2, 0
	; PPC64LE-NEXT: lwz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i32, i32* %ptr seq_cst, align 4
	ret i32 %val
	}

	define i64 @test12(i64* %ptr) {
	; PPC64LE-LABEL: test12:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: ld 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i64, i64* %ptr unordered, align 8
	ret i64 %val
	}

	define i64 @test13(i64* %ptr) {
	; PPC64LE-LABEL: test13:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: ld 3, 0(3)
	; PPC64LE-NEXT: blr
	%val = load atomic i64, i64* %ptr monotonic, align 8
	ret i64 %val
	}

	define i64 @test14(i64* %ptr) {
	; PPC64LE-LABEL: test14:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: ld 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i64, i64* %ptr acquire, align 8
	ret i64 %val
	}

	define i64 @test15(i64* %ptr) {
	; PPC64LE-LABEL: test15:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: ori 2, 2, 0
	; PPC64LE-NEXT: ld 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: blr
	%val = load atomic i64, i64* %ptr seq_cst, align 8
	ret i64 %val
	}

	define void @test16(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test16:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: stb 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i8 %val, i8* %ptr unordered, align 1
	ret void
	}

	define void @test17(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test17:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: stb 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i8 %val, i8* %ptr monotonic, align 1
	ret void
	}

	define void @test18(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test18:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: stb 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i8 %val, i8* %ptr release, align 1
	ret void
	}

	define void @test19(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test19:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: stb 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i8 %val, i8* %ptr seq_cst, align 1
	ret void
	}

	define void @test20(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test20:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sth 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i16 %val, i16* %ptr unordered, align 2
	ret void
	}

	define void @test21(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test21:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sth 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i16 %val, i16* %ptr monotonic, align 2
	ret void
	}

	define void @test22(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test22:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: sth 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i16 %val, i16* %ptr release, align 2
	ret void
	}

	define void @test23(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test23:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: sth 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i16 %val, i16* %ptr seq_cst, align 2
	ret void
	}

	define void @test24(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test24:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: stw 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i32 %val, i32* %ptr unordered, align 4
	ret void
	}

	define void @test25(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test25:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: stw 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i32 %val, i32* %ptr monotonic, align 4
	ret void
	}

	define void @test26(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test26:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: stw 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i32 %val, i32* %ptr release, align 4
	ret void
	}

	define void @test27(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test27:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: stw 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i32 %val, i32* %ptr seq_cst, align 4
	ret void
	}

	define void @test28(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test28:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: std 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i64 %val, i64* %ptr unordered, align 8
	ret void
	}

	define void @test29(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test29:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: std 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i64 %val, i64* %ptr monotonic, align 8
	ret void
	}

	define void @test30(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test30:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: std 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i64 %val, i64* %ptr release, align 8
	ret void
	}

	define void @test31(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test31:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: std 4, 0(3)
	; PPC64LE-NEXT: blr
	store atomic i64 %val, i64* %ptr seq_cst, align 8
	ret void
	}

	define void @test32() {
	; PPC64LE-LABEL: test32:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	fence acquire
	ret void
	}

	define void @test33() {
	; PPC64LE-LABEL: test33:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	fence release
	ret void
	}

	define void @test34() {
	; PPC64LE-LABEL: test34:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	fence acq_rel
	ret void
	}

	define void @test35() {
	; PPC64LE-LABEL: test35:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: blr
	fence seq_cst
	ret void
	}

	define void @test36() {
	; PPC64LE-LABEL: test36:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	fence syncscope("singlethread") acquire
	ret void
	}

	define void @test37() {
	; PPC64LE-LABEL: test37:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	fence syncscope("singlethread") release
	ret void
	}

	define void @test38() {
	; PPC64LE-LABEL: test38:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	fence syncscope("singlethread") acq_rel
	ret void
	}

	define void @test39() {
	; PPC64LE-LABEL: test39:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: blr
	fence syncscope("singlethread") seq_cst
	ret void
	}

	define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test40:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: b .LBB40_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB40_1:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB40_2:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB40_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val monotonic monotonic
	ret void
	}

	define void @test41(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test41:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: .LBB41_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB41_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB41_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB41_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acquire monotonic
	ret void
	}

	define void @test42(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test42:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: .LBB42_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB42_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB42_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB42_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acquire acquire
	ret void
	}

	define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test43:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB43_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB43_1:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB43_2:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB43_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release monotonic
	ret void
	}

	define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test44:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB44_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB44_1:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB44_2:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB44_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release acquire
	ret void
	}

	define void @test45(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test45:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB45_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB45_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB45_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB45_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acq_rel monotonic
	ret void
	}

	define void @test46(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test46:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB46_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB46_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB46_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB46_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acq_rel acquire
	ret void
	}

	define void @test47(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test47:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB47_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB47_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB47_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB47_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst monotonic
	ret void
	}

	define void @test48(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test48:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB48_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB48_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB48_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB48_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst acquire
	ret void
	}

	define void @test49(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test49:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB49_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB49_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB49_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB49_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst seq_cst
	ret void
	}

	define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test50:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: b .LBB50_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB50_1:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB50_2:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB50_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val monotonic monotonic
	ret void
	}

	define void @test51(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test51:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: .LBB51_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB51_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB51_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB51_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acquire monotonic
	ret void
	}

	define void @test52(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test52:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: .LBB52_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB52_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB52_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB52_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acquire acquire
	ret void
	}

	define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test53:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB53_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB53_1:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB53_2:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB53_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release monotonic
	ret void
	}

	define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test54:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB54_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB54_1:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB54_2:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB54_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release acquire
	ret void
	}

	define void @test55(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test55:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB55_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB55_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB55_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB55_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acq_rel monotonic
	ret void
	}

	define void @test56(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test56:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB56_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB56_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB56_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB56_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acq_rel acquire
	ret void
	}

	define void @test57(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test57:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB57_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB57_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB57_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB57_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst monotonic
	ret void
	}

	define void @test58(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test58:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB58_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB58_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB58_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB58_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst acquire
	ret void
	}

	define void @test59(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test59:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB59_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB59_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB59_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB59_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst seq_cst
	ret void
	}

	define void @test60(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test60:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: b .LBB60_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB60_1:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB60_2:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB60_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val monotonic monotonic
	ret void
	}

	define void @test61(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test61:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB61_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB61_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB61_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB61_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acquire monotonic
	ret void
	}

	define void @test62(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test62:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB62_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB62_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB62_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB62_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acquire acquire
	ret void
	}

	define void @test63(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test63:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB63_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB63_1:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB63_2:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB63_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release monotonic
	ret void
	}

	define void @test64(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test64:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB64_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB64_1:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB64_2:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB64_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release acquire
	ret void
	}

	define void @test65(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test65:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB65_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB65_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB65_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB65_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acq_rel monotonic
	ret void
	}

	define void @test66(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test66:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB66_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB66_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB66_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB66_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acq_rel acquire
	ret void
	}

	define void @test67(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test67:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB67_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB67_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB67_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB67_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst monotonic
	ret void
	}

	define void @test68(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test68:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB68_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB68_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB68_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB68_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst acquire
	ret void
	}

	define void @test69(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test69:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB69_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB69_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB69_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB69_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst seq_cst
	ret void
	}

	define void @test70(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test70:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: b .LBB70_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB70_1:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB70_2:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: beq 0, .LBB70_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val monotonic monotonic
	ret void
	}

	define void @test71(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test71:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB71_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB71_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB71_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB71_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acquire monotonic
	ret void
	}

	define void @test72(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test72:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB72_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB72_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB72_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB72_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acquire acquire
	ret void
	}

	define void @test73(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test73:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB73_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB73_1:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB73_2:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: beq 0, .LBB73_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release monotonic
	ret void
	}

	define void @test74(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test74:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB74_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB74_1:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB74_2:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: beq 0, .LBB74_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release acquire
	ret void
	}

	define void @test75(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test75:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB75_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB75_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB75_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB75_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acq_rel monotonic
	ret void
	}

	define void @test76(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test76:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB76_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB76_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB76_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB76_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acq_rel acquire
	ret void
	}

	define void @test77(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test77:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB77_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB77_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB77_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB77_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst monotonic
	ret void
	}

	define void @test78(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test78:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB78_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB78_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB78_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB78_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst acquire
	ret void
	}

	define void @test79(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test79:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB79_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB79_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB79_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB79_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst seq_cst
	ret void
	}

	define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test80:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: b .LBB80_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB80_1:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB80_2:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB80_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
	ret void
	}

	define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test81:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: .LBB81_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB81_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB81_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB81_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
	ret void
	}

	define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test82:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: .LBB82_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB82_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB82_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB82_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire acquire
	ret void
	}

	define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test83:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB83_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB83_1:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB83_2:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB83_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
	ret void
	}

	define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test84:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB84_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB84_1:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB84_2:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB84_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
	ret void
	}

	define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test85:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB85_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB85_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB85_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB85_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel monotonic
	ret void
	}

	define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test86:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB86_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB86_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB86_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB86_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel acquire
	ret void
	}

	define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test87:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB87_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB87_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB87_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB87_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst monotonic
	ret void
	}

	define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test88:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB88_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB88_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB88_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB88_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst acquire
	ret void
	}

	define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
	; PPC64LE-LABEL: test89:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB89_1:
	; PPC64LE-NEXT: lbarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB89_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB89_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB89_4:
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst seq_cst
	ret void
	}

	define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test90:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: b .LBB90_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB90_1:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB90_2:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB90_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic
	ret void
	}

	define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test91:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: .LBB91_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB91_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB91_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB91_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire monotonic
	ret void
	}

	define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test92:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: .LBB92_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB92_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB92_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB92_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire acquire
	ret void
	}

	define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test93:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB93_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB93_1:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB93_2:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB93_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic
	ret void
	}

	define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test94:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB94_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB94_1:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB94_2:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB94_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
	ret void
	}

	define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test95:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB95_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB95_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB95_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB95_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel monotonic
	ret void
	}

	define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test96:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB96_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB96_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB96_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB96_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel acquire
	ret void
	}

	define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test97:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB97_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB97_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB97_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB97_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst monotonic
	ret void
	}

	define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test98:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB98_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB98_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB98_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB98_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst acquire
	ret void
	}

	define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
	; PPC64LE-LABEL: test99:
	; PPC64LE: # %bb.0:
	+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB99_1:
	; PPC64LE-NEXT: lharx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB99_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB99_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB99_4:
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst seq_cst
	ret void
	}

	define void @test100(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test100:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: b .LBB100_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB100_1:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB100_2:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB100_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic
	ret void
	}

	define void @test101(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test101:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB101_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB101_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB101_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB101_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire monotonic
	ret void
	}

	define void @test102(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test102:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB102_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB102_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB102_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB102_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire acquire
	ret void
	}

	define void @test103(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test103:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB103_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB103_1:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB103_2:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB103_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic
	ret void
	}

	define void @test104(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test104:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB104_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB104_1:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB104_2:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: beq 0, .LBB104_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
	ret void
	}

	define void @test105(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test105:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB105_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB105_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB105_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB105_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel monotonic
	ret void
	}

	define void @test106(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test106:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB106_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB106_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB106_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB106_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel acquire
	ret void
	}

	define void @test107(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test107:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB107_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB107_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB107_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB107_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst monotonic
	ret void
	}

	define void @test108(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test108:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB108_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB108_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB108_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB108_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst acquire
	ret void
	}

	define void @test109(i32* %ptr, i32 %cmp, i32 %val) {
	; PPC64LE-LABEL: test109:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB109_1:
	; PPC64LE-NEXT: lwarx 6, 0, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bne 0, .LBB109_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB109_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB109_4:
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst seq_cst
	ret void
	}

	define void @test110(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test110:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: b .LBB110_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB110_1:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB110_2:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: beq 0, .LBB110_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic
	ret void
	}

	define void @test111(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test111:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB111_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB111_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB111_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB111_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire monotonic
	ret void
	}

	; test112: i64 cmpxchg with syncscope("singlethread"); success ordering acquire,
	; failure ordering acquire. %res is unused. The checked sequence has no barrier
	; before the ldarx/stdcx. loop and an lwsync before each blr.
	define void @test112(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test112:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB112_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB112_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB112_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB112_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire acquire
	ret void
	}

	; test113: i64 cmpxchg with syncscope("singlethread"); success ordering release,
	; failure ordering monotonic. %res is unused. The checked sequence has an lwsync
	; before the ldarx/stdcx. loop and no barrier after it.
	define void @test113(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test113:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB113_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB113_1:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB113_2:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: beq 0, .LBB113_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic
	ret void
	}

	; test114: i64 cmpxchg with syncscope("singlethread"); success ordering release,
	; failure ordering acquire. %res is unused. The checked sequence has an lwsync
	; before the ldarx/stdcx. loop and no barrier after it.
	define void @test114(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test114:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: b .LBB114_2
	; PPC64LE-NEXT: .p2align 5
	; PPC64LE-NEXT: .LBB114_1:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: beqlr 0
	; PPC64LE-NEXT: .LBB114_2:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: beq 0, .LBB114_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
	ret void
	}

	; test115: i64 cmpxchg with syncscope("singlethread"); success ordering acq_rel,
	; failure ordering monotonic. %res is unused. The checked sequence has an lwsync
	; before the ldarx/stdcx. loop and an lwsync before each blr.
	define void @test115(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test115:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB115_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB115_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB115_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB115_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel monotonic
	ret void
	}

	; test116: i64 cmpxchg with syncscope("singlethread"); success ordering acq_rel,
	; failure ordering acquire. %res is unused. The checked sequence has an lwsync
	; before the ldarx/stdcx. loop and an lwsync before each blr.
	define void @test116(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test116:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB116_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB116_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB116_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB116_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel acquire
	ret void
	}

	; test117: i64 cmpxchg with syncscope("singlethread"); success ordering seq_cst,
	; failure ordering monotonic. %res is unused. The checked sequence has a sync
	; before the ldarx/stdcx. loop and an lwsync before each blr.
	define void @test117(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test117:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB117_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB117_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB117_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB117_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst monotonic
	ret void
	}

	; test118: i64 cmpxchg with syncscope("singlethread"); success ordering seq_cst,
	; failure ordering acquire. %res is unused. The checked sequence has a sync
	; before the ldarx/stdcx. loop and an lwsync before each blr.
	define void @test118(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test118:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB118_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB118_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB118_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB118_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst acquire
	ret void
	}

	; test119: i64 cmpxchg with syncscope("singlethread"); success ordering seq_cst,
	; failure ordering seq_cst. %res is unused. The checked sequence has a sync
	; before the ldarx/stdcx. loop and an lwsync before each blr.
	define void @test119(i64* %ptr, i64 %cmp, i64 %val) {
	; PPC64LE-LABEL: test119:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB119_1:
	; PPC64LE-NEXT: ldarx 6, 0, 3
	; PPC64LE-NEXT: cmpd 4, 6
	; PPC64LE-NEXT: bne 0, .LBB119_4
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 5, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB119_1
	; PPC64LE-NEXT: # %bb.3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	; PPC64LE-NEXT: .LBB119_4:
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst seq_cst
	ret void
	}

	; test120: atomicrmw xchg on an i8 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lbarx/stbcx. retry loop with no sync or lwsync.
	define i8 @test120(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test120:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB120_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB120_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; test121: atomicrmw xchg on an i8 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lbarx/stbcx. retry loop and an lwsync after it.
	define i8 @test121(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test121:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB121_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB121_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; test122: atomicrmw xchg on an i8 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lbarx/stbcx. retry loop and no barrier after it.
	define i8 @test122(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test122:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB122_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB122_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; test123: atomicrmw xchg on an i8 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lbarx/stbcx. retry loop.
	define i8 @test123(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test123:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB123_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB123_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; test124: atomicrmw xchg on an i8 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lbarx/stbcx. retry loop and an lwsync after it.
	define i8 @test124(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test124:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB124_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB124_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; test125: atomicrmw xchg on an i16 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lharx/sthcx. retry loop with no sync or lwsync.
	define i16 @test125(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test125:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB125_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB125_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; test126: atomicrmw xchg on an i16 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lharx/sthcx. retry loop and an lwsync after it.
	define i16 @test126(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test126:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB126_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB126_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; test127: atomicrmw xchg on an i16 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lharx/sthcx. retry loop and no barrier after it.
	define i16 @test127(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test127:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB127_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB127_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; test128: atomicrmw xchg on an i16 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lharx/sthcx. retry loop.
	define i16 @test128(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test128:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB128_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB128_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; test129: atomicrmw xchg on an i16 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lharx/sthcx. retry loop and an lwsync after it.
	define i16 @test129(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test129:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB129_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB129_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; test130: atomicrmw xchg on an i32 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lwarx/stwcx. retry loop with no sync or lwsync.
	define i32 @test130(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test130:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB130_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB130_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; test131: atomicrmw xchg on an i32 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lwarx/stwcx. retry loop and an lwsync after it.
	define i32 @test131(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test131:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB131_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB131_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; test132: atomicrmw xchg on an i32 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lwarx/stwcx. retry loop and no barrier after it.
	define i32 @test132(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test132:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB132_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB132_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; test133: atomicrmw xchg on an i32 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lwarx/stwcx. retry loop.
	define i32 @test133(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test133:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB133_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB133_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; test134: atomicrmw xchg on an i32 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lwarx/stwcx. retry loop and an lwsync after it.
	define i32 @test134(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test134:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB134_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB134_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; test135: atomicrmw xchg on an i64 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a ldarx/stdcx. retry loop with no sync or lwsync.
	define i64 @test135(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test135:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB135_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB135_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; test136: atomicrmw xchg on an i64 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the ldarx/stdcx. retry loop and an lwsync after it.
	define i64 @test136(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test136:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB136_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB136_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; test137: atomicrmw xchg on an i64 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the ldarx/stdcx. retry loop and no barrier after it.
	define i64 @test137(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test137:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB137_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB137_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; test138: atomicrmw xchg on an i64 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the ldarx/stdcx. retry loop.
	define i64 @test138(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test138:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB138_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB138_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; test139: atomicrmw xchg on an i64 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the ldarx/stdcx. retry loop and an lwsync after it.
	define i64 @test139(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test139:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB139_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB139_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; test140: atomicrmw add on an i8 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lbarx/add/stbcx. retry loop with no sync or lwsync.
	define i8 @test140(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test140:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB140_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB140_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; test141: atomicrmw add on an i8 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lbarx/add/stbcx. retry loop and an lwsync after it.
	define i8 @test141(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test141:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB141_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB141_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; test142: atomicrmw add on an i8 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lbarx/add/stbcx. retry loop and no barrier after it.
	define i8 @test142(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test142:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB142_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB142_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; test143: atomicrmw add on an i8 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lbarx/add/stbcx. retry loop.
	define i8 @test143(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test143:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB143_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB143_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; test144: atomicrmw add on an i8 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lbarx/add/stbcx. retry loop and an lwsync after it.
	define i8 @test144(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test144:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB144_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB144_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; test145: atomicrmw add on an i16 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lharx/add/sthcx. retry loop with no sync or lwsync.
	define i16 @test145(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test145:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB145_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB145_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; test146: atomicrmw add on an i16 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lharx/add/sthcx. retry loop and an lwsync after it.
	define i16 @test146(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test146:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB146_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB146_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; test147: atomicrmw add on an i16 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lharx/add/sthcx. retry loop and no barrier after it.
	define i16 @test147(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test147:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB147_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB147_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; test148: atomicrmw add on an i16 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lharx/add/sthcx. retry loop.
	define i16 @test148(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test148:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB148_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB148_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; test149: atomicrmw add on an i16 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lharx/add/sthcx. retry loop and an lwsync after it.
	define i16 @test149(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test149:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB149_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB149_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; test150: atomicrmw add on an i32 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lwarx/add/stwcx. retry loop with no sync or lwsync.
	define i32 @test150(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test150:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB150_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB150_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; test151: atomicrmw add on an i32 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lwarx/add/stwcx. retry loop and an lwsync after it.
	define i32 @test151(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test151:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB151_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB151_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; test152: atomicrmw add on an i32 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lwarx/add/stwcx. retry loop and no barrier after it.
	define i32 @test152(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test152:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB152_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB152_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; test153: atomicrmw add on an i32 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lwarx/add/stwcx. retry loop.
	define i32 @test153(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test153:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB153_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB153_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; test154: atomicrmw add on an i32 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lwarx/add/stwcx. retry loop and an lwsync after it.
	define i32 @test154(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test154:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB154_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB154_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; test155: atomicrmw add on an i64 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a ldarx/add/stdcx. retry loop with no sync or lwsync.
	define i64 @test155(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test155:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB155_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB155_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; test156: atomicrmw add on an i64 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the ldarx/add/stdcx. retry loop and an lwsync after it.
	define i64 @test156(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test156:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB156_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB156_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; test157: atomicrmw add on an i64 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the ldarx/add/stdcx. retry loop and no barrier after it.
	define i64 @test157(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test157:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB157_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB157_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; test158: atomicrmw add on an i64 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the ldarx/add/stdcx. retry loop.
	define i64 @test158(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test158:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB158_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB158_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; test159: atomicrmw add on an i64 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the ldarx/add/stdcx. retry loop and an lwsync after it.
	define i64 @test159(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test159:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB159_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB159_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; test160: atomicrmw sub on an i8 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lbarx/subf/stbcx. retry loop with no sync or lwsync.
	define i8 @test160(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test160:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB160_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB160_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; test161: atomicrmw sub on an i8 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lbarx/subf/stbcx. retry loop and an lwsync after it.
	define i8 @test161(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test161:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB161_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: subf 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB161_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; test162: atomicrmw sub on an i8 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lbarx/subf/stbcx. retry loop and no barrier after it.
	define i8 @test162(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test162:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB162_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB162_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; test163: atomicrmw sub on an i8 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lbarx/subf/stbcx. retry loop.
	define i8 @test163(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test163:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB163_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB163_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; test164: atomicrmw sub on an i8 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lbarx/subf/stbcx. retry loop and an lwsync after it.
	define i8 @test164(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test164:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB164_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB164_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; test165: atomicrmw sub on an i16 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lharx/subf/sthcx. retry loop with no sync or lwsync.
	define i16 @test165(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test165:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB165_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB165_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; test166: atomicrmw sub on an i16 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lharx/subf/sthcx. retry loop and an lwsync after it.
	define i16 @test166(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test166:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB166_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: subf 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB166_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; test167: atomicrmw sub on an i16 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lharx/subf/sthcx. retry loop and no barrier after it.
	define i16 @test167(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test167:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB167_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB167_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; test168: atomicrmw sub on an i16 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lharx/subf/sthcx. retry loop.
	define i16 @test168(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test168:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB168_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB168_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; test169: atomicrmw sub on an i16 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lharx/subf/sthcx. retry loop and an lwsync after it.
	define i16 @test169(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test169:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB169_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB169_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; test170: atomicrmw sub on an i32 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lwarx/subf/stwcx. retry loop with no sync or lwsync.
	define i32 @test170(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test170:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB170_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB170_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; test171: atomicrmw sub on an i32 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lwarx/subf/stwcx. retry loop and an lwsync after it.
	define i32 @test171(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test171:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB171_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: subf 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB171_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; test172: atomicrmw sub on an i32 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lwarx/subf/stwcx. retry loop and no barrier after it.
	define i32 @test172(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test172:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB172_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB172_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; test173: atomicrmw sub on an i32 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lwarx/subf/stwcx. retry loop.
	define i32 @test173(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test173:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB173_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB173_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; test174: atomicrmw sub on an i32 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lwarx/subf/stwcx. retry loop and an lwsync after it.
	define i32 @test174(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test174:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB174_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB174_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; test175: atomicrmw sub on an i64 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a ldarx/sub/stdcx. retry loop with no sync or lwsync.
	; NOTE(review): the i64 sub tests check `sub 6, 5, 4` where the i8/i16/i32 sub tests
	; check `subf 6, 4, 5`; this looks like the extended-mnemonic spelling of the same
	; subtraction with the source operands swapped - confirm against the Power ISA.
	define i64 @test175(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test175:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB175_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB175_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; test176: atomicrmw sub on an i64 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the ldarx/sub/stdcx. retry loop and an lwsync after it.
	define i64 @test176(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test176:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB176_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: sub 6, 3, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB176_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; test177: atomicrmw sub on an i64 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the ldarx/sub/stdcx. retry loop and no barrier after it.
	define i64 @test177(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test177:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB177_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB177_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; test178: atomicrmw sub on an i64 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the ldarx/sub/stdcx. retry loop.
	define i64 @test178(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test178:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB178_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB178_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; test179: atomicrmw sub on an i64 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the ldarx/sub/stdcx. retry loop and an lwsync after it.
	define i64 @test179(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test179:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB179_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB179_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; test180: atomicrmw and on an i8 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lbarx/and/stbcx. retry loop with no sync or lwsync.
	define i8 @test180(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test180:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB180_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB180_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; test181: atomicrmw and on an i8 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lbarx/and/stbcx. retry loop and an lwsync after it.
	define i8 @test181(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test181:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB181_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB181_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; test182: atomicrmw and on an i8 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lbarx/and/stbcx. retry loop and no barrier after it.
	define i8 @test182(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test182:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB182_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB182_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; test183: atomicrmw and on an i8 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lbarx/and/stbcx. retry loop.
	define i8 @test183(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test183:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB183_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB183_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; test184: atomicrmw and on an i8 with seq_cst ordering; returns the atomicrmw result.
	; The checked sequence has a sync before the lbarx/and/stbcx. retry loop and an lwsync after it.
	define i8 @test184(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test184:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB184_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB184_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; test185: atomicrmw and on an i16 with monotonic ordering; returns the atomicrmw result.
	; The checked sequence is a lharx/and/sthcx. retry loop with no sync or lwsync.
	define i16 @test185(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test185:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB185_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB185_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; test186: atomicrmw and on an i16 with acquire ordering; returns the atomicrmw result.
	; The checked sequence has no barrier before the lharx/and/sthcx. retry loop and an lwsync after it.
	define i16 @test186(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test186:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB186_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB186_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; test187: atomicrmw and on an i16 with release ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync before the lharx/and/sthcx. retry loop and no barrier after it.
	define i16 @test187(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test187:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB187_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB187_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; test188: atomicrmw and on an i16 with acq_rel ordering; returns the atomicrmw result.
	; The checked sequence has an lwsync both before and after the lharx/and/sthcx. retry loop.
	define i16 @test188(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test188:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB188_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB188_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; atomicrmw and on i16, seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
	define i16 @test189(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test189:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB189_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB189_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; atomicrmw and on i32, monotonic - expects a bare lwarx/stwcx. retry loop with no barrier instructions.
	define i32 @test190(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test190:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB190_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB190_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; atomicrmw and on i32, acquire - expects the lwarx/stwcx. retry loop followed by a trailing lwsync.
	define i32 @test191(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test191:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB191_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB191_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; atomicrmw and on i32, release - expects a leading lwsync ahead of the lwarx/stwcx. retry loop.
	define i32 @test192(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test192:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB192_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB192_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; atomicrmw and on i32, acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
	define i32 @test193(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test193:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB193_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB193_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; atomicrmw and on i32, seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
	define i32 @test194(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test194:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB194_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB194_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; atomicrmw and on i64, monotonic - expects a bare ldarx/stdcx. retry loop with no barrier instructions.
	define i64 @test195(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test195:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB195_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB195_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; atomicrmw and on i64, acquire - expects the ldarx/stdcx. retry loop followed by a trailing lwsync.
	define i64 @test196(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test196:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB196_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB196_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; atomicrmw and on i64, release - expects a leading lwsync ahead of the ldarx/stdcx. retry loop.
	define i64 @test197(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test197:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB197_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB197_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; atomicrmw and on i64, acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
	define i64 @test198(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test198:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB198_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB198_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; atomicrmw and on i64, seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
	define i64 @test199(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test199:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB199_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB199_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; atomicrmw nand on i8, monotonic - expects a bare lbarx/stbcx. retry loop with no barrier instructions.
	define i8 @test200(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test200:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB200_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB200_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; atomicrmw nand on i8, acquire - expects the lbarx/stbcx. retry loop followed by a trailing lwsync.
	define i8 @test201(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test201:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB201_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB201_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; atomicrmw nand on i8, release - expects a leading lwsync ahead of the lbarx/stbcx. retry loop.
	define i8 @test202(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test202:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB202_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB202_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; atomicrmw nand on i8, acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
	define i8 @test203(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test203:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB203_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB203_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; atomicrmw nand on i8, seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
	define i8 @test204(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test204:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB204_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB204_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; atomicrmw nand on i16, monotonic - expects a bare lharx/sthcx. retry loop with no barrier instructions.
	define i16 @test205(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test205:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB205_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB205_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; atomicrmw nand on i16, acquire - expects the lharx/sthcx. retry loop followed by a trailing lwsync.
	define i16 @test206(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test206:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB206_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB206_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; atomicrmw nand on i16, release - expects a leading lwsync ahead of the lharx/sthcx. retry loop.
	define i16 @test207(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test207:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB207_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB207_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; atomicrmw nand on i16, acq_rel - expects lwsync both before and after the lharx/sthcx. retry loop.
	define i16 @test208(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test208:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB208_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB208_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; atomicrmw nand on i16, seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
	define i16 @test209(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test209:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB209_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB209_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; atomicrmw nand on i32, monotonic - expects a bare lwarx/stwcx. retry loop with no barrier instructions.
	define i32 @test210(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test210:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB210_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB210_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; atomicrmw nand on i32, acquire - expects the lwarx/stwcx. retry loop followed by a trailing lwsync.
	define i32 @test211(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test211:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB211_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB211_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; atomicrmw nand on i32, release - expects a leading lwsync ahead of the lwarx/stwcx. retry loop.
	define i32 @test212(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test212:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB212_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB212_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; atomicrmw nand on i32, acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
	define i32 @test213(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test213:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB213_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB213_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; atomicrmw nand on i32, seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
	define i32 @test214(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test214:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB214_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB214_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; atomicrmw nand on i64, monotonic - expects a bare ldarx/stdcx. retry loop with no barrier instructions.
	define i64 @test215(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test215:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB215_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB215_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; atomicrmw nand on i64, acquire - expects the ldarx/stdcx. retry loop followed by a trailing lwsync.
	define i64 @test216(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test216:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB216_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB216_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; atomicrmw nand on i64, release - expects a leading lwsync ahead of the ldarx/stdcx. retry loop.
	define i64 @test217(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test217:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB217_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB217_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; atomicrmw nand on i64, acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
	define i64 @test218(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test218:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB218_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB218_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; atomicrmw nand on i64, seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
	define i64 @test219(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test219:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB219_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB219_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; atomicrmw or on i8, monotonic - expects a bare lbarx/stbcx. retry loop with no barrier instructions.
	define i8 @test220(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test220:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB220_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB220_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; atomicrmw or on i8, acquire - expects the lbarx/stbcx. retry loop followed by a trailing lwsync.
	define i8 @test221(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test221:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB221_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB221_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; atomicrmw or on i8, release - expects a leading lwsync ahead of the lbarx/stbcx. retry loop.
	define i8 @test222(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test222:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB222_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB222_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; atomicrmw or on i8, acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
	define i8 @test223(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test223:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB223_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB223_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; atomicrmw or on i8, seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
	define i8 @test224(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test224:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB224_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB224_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; atomicrmw or on i16, monotonic - expects a bare lharx/sthcx. retry loop with no barrier instructions.
	define i16 @test225(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test225:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB225_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB225_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; atomicrmw or on i16, acquire - expects the lharx/sthcx. retry loop followed by a trailing lwsync.
	define i16 @test226(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test226:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB226_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB226_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; atomicrmw or on i16, release - expects a leading lwsync ahead of the lharx/sthcx. retry loop.
	define i16 @test227(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test227:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB227_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB227_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; atomicrmw or on i16, acq_rel - expects lwsync both before and after the lharx/sthcx. retry loop.
	define i16 @test228(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test228:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB228_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB228_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; atomicrmw or on i16, seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
	define i16 @test229(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test229:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB229_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB229_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; atomicrmw or on i32, monotonic - expects a bare lwarx/stwcx. retry loop with no barrier instructions.
	define i32 @test230(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test230:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB230_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB230_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; atomicrmw or on i32, acquire - expects the lwarx/stwcx. retry loop followed by a trailing lwsync.
	define i32 @test231(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test231:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB231_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB231_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; atomicrmw or on i32, release - expects a leading lwsync ahead of the lwarx/stwcx. retry loop.
	define i32 @test232(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test232:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB232_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB232_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; atomicrmw or on i32, acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
	define i32 @test233(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test233:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB233_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB233_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; atomicrmw or on i32, seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
	define i32 @test234(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test234:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB234_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB234_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; atomicrmw or on i64, monotonic - expects a bare ldarx/stdcx. retry loop with no barrier instructions.
	define i64 @test235(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test235:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB235_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB235_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; atomicrmw or on i64, acquire - expects the ldarx/stdcx. retry loop followed by a trailing lwsync.
	define i64 @test236(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test236:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB236_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB236_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; atomicrmw or on i64, release - expects a leading lwsync ahead of the ldarx/stdcx. retry loop.
	define i64 @test237(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test237:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB237_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB237_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; atomicrmw or on i64, acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
	define i64 @test238(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test238:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB238_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB238_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; atomicrmw or on i64, seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
	define i64 @test239(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test239:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB239_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB239_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; atomicrmw xor on i8, monotonic - expects a bare lbarx/stbcx. retry loop with no barrier instructions.
	define i8 @test240(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test240:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB240_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB240_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; atomicrmw xor on i8, acquire - expects the lbarx/stbcx. retry loop followed by a trailing lwsync.
	define i8 @test241(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test241:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB241_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB241_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; atomicrmw xor on i8, release - expects a leading lwsync ahead of the lbarx/stbcx. retry loop.
	define i8 @test242(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test242:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB242_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB242_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; atomicrmw xor on i8, acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
	define i8 @test243(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test243:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB243_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB243_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; atomicrmw xor on i8, seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
	define i8 @test244(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test244:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB244_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB244_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	; atomicrmw xor on i16, monotonic - expects a bare lharx/sthcx. retry loop with no barrier instructions.
	define i16 @test245(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test245:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB245_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB245_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	; atomicrmw xor on i16, acquire - expects the lharx/sthcx. retry loop followed by a trailing lwsync.
	define i16 @test246(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test246:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB246_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB246_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	; atomicrmw xor on i16, release - expects a leading lwsync ahead of the lharx/sthcx. retry loop.
	define i16 @test247(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test247:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB247_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB247_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val release
	ret i16 %ret
	}

	; atomicrmw xor on i16, acq_rel - expects lwsync both before and after the lharx/sthcx. retry loop.
	define i16 @test248(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test248:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB248_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB248_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	; atomicrmw xor on i16, seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
	define i16 @test249(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test249:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB249_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB249_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	; atomicrmw xor on i32, monotonic - expects a bare lwarx/stwcx. retry loop with no barrier instructions.
	define i32 @test250(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test250:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB250_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB250_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	; atomicrmw xor on i32, acquire - expects the lwarx/stwcx. retry loop followed by a trailing lwsync.
	define i32 @test251(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test251:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB251_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB251_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	; atomicrmw xor on i32, release - expects a leading lwsync ahead of the lwarx/stwcx. retry loop.
	define i32 @test252(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test252:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB252_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB252_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val release
	ret i32 %ret
	}

	; atomicrmw xor on i32, acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
	define i32 @test253(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test253:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB253_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB253_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	; atomicrmw xor on i32, seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
	define i32 @test254(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test254:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB254_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB254_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	; atomicrmw xor on i64, monotonic - expects a bare ldarx/stdcx. retry loop with no barrier instructions.
	define i64 @test255(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test255:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB255_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB255_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	; atomicrmw xor on i64, acquire - expects the ldarx/stdcx. retry loop followed by a trailing lwsync.
	define i64 @test256(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test256:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB256_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB256_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	; atomicrmw xor on i64, release - expects a leading lwsync ahead of the ldarx/stdcx. retry loop.
	define i64 @test257(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test257:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB257_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB257_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val release
	ret i64 %ret
	}

	; atomicrmw xor on i64, acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
	define i64 @test258(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test258:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB258_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB258_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	; atomicrmw xor on i64, seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
	define i64 @test259(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test259:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB259_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB259_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	; atomicrmw max on i8, monotonic - expects a bare lbarx/stbcx. retry loop with no barrier instructions.
	; Each iteration sign-extends the loaded value (extsb), compares with cmpw, and branches past the store on ble.
	define i8 @test260(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test260:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB260_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB260_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB260_1
	; PPC64LE-NEXT: .LBB260_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	; atomicrmw max on i8, acquire - expects the lbarx/stbcx. retry loop followed by a trailing lwsync.
	; Each iteration sign-extends the loaded value (extsb), compares with cmpw, and branches past the store on ble.
	define i8 @test261(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test261:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB261_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: extsb 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB261_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB261_1
	; PPC64LE-NEXT: .LBB261_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	; atomicrmw max on i8, release - expects a leading lwsync ahead of the lbarx/stbcx. retry loop.
	; Each iteration sign-extends the loaded value (extsb), compares with cmpw, and branches past the store on ble.
	define i8 @test262(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test262:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB262_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB262_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB262_1
	; PPC64LE-NEXT: .LBB262_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val release
	ret i8 %ret
	}

	; atomicrmw max on i8, acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
	; Each iteration sign-extends the loaded value (extsb), compares with cmpw, and branches past the store on ble.
	define i8 @test263(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test263:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB263_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB263_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB263_1
	; PPC64LE-NEXT: .LBB263_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	; atomicrmw max on i8, seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
	; Each iteration sign-extends the loaded value (extsb), compares with cmpw, and branches past the store on ble.
	define i8 @test264(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test264:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB264_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB264_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB264_1
	; PPC64LE-NEXT: .LBB264_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	define i16 @test265(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test265:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB265_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB265_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB265_1
	; PPC64LE-NEXT: .LBB265_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	define i16 @test266(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test266:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB266_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: extsh 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB266_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB266_1
	; PPC64LE-NEXT: .LBB266_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	define i16 @test267(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test267:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB267_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB267_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB267_1
	; PPC64LE-NEXT: .LBB267_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val release
	ret i16 %ret
	}

	define i16 @test268(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test268:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB268_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB268_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB268_1
	; PPC64LE-NEXT: .LBB268_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	define i16 @test269(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test269:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB269_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB269_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB269_1
	; PPC64LE-NEXT: .LBB269_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	define i32 @test270(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test270:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB270_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB270_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB270_1
	; PPC64LE-NEXT: .LBB270_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	define i32 @test271(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test271:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB271_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmpw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB271_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB271_1
	; PPC64LE-NEXT: .LBB271_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	define i32 @test272(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test272:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB272_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB272_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB272_1
	; PPC64LE-NEXT: .LBB272_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val release
	ret i32 %ret
	}

	define i32 @test273(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test273:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB273_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB273_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB273_1
	; PPC64LE-NEXT: .LBB273_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	define i32 @test274(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test274:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB274_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB274_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB274_1
	; PPC64LE-NEXT: .LBB274_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	define i64 @test275(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test275:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB275_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB275_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB275_1
	; PPC64LE-NEXT: .LBB275_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	define i64 @test276(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test276:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB276_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpd 4, 3
	; PPC64LE-NEXT: ble 0, .LBB276_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB276_1
	; PPC64LE-NEXT: .LBB276_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	define i64 @test277(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test277:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB277_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB277_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB277_1
	; PPC64LE-NEXT: .LBB277_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val release
	ret i64 %ret
	}

	define i64 @test278(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test278:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB278_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB278_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB278_1
	; PPC64LE-NEXT: .LBB278_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	define i64 @test279(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test279:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB279_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB279_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB279_1
	; PPC64LE-NEXT: .LBB279_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	define i8 @test280(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test280:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB280_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB280_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB280_1
	; PPC64LE-NEXT: .LBB280_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	define i8 @test281(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test281:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB281_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: extsb 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB281_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB281_1
	; PPC64LE-NEXT: .LBB281_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	define i8 @test282(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test282:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB282_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB282_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB282_1
	; PPC64LE-NEXT: .LBB282_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val release
	ret i8 %ret
	}

	define i8 @test283(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test283:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB283_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB283_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB283_1
	; PPC64LE-NEXT: .LBB283_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	define i8 @test284(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test284:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB284_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB284_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB284_1
	; PPC64LE-NEXT: .LBB284_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	define i16 @test285(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test285:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB285_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB285_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB285_1
	; PPC64LE-NEXT: .LBB285_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	define i16 @test286(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test286:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB286_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: extsh 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB286_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB286_1
	; PPC64LE-NEXT: .LBB286_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	define i16 @test287(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test287:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB287_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB287_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB287_1
	; PPC64LE-NEXT: .LBB287_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val release
	ret i16 %ret
	}

	define i16 @test288(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test288:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB288_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB288_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB288_1
	; PPC64LE-NEXT: .LBB288_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	define i16 @test289(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test289:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB289_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB289_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB289_1
	; PPC64LE-NEXT: .LBB289_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	define i32 @test290(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test290:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB290_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB290_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB290_1
	; PPC64LE-NEXT: .LBB290_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	define i32 @test291(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test291:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB291_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmpw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB291_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB291_1
	; PPC64LE-NEXT: .LBB291_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	define i32 @test292(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test292:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB292_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB292_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB292_1
	; PPC64LE-NEXT: .LBB292_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val release
	ret i32 %ret
	}

	define i32 @test293(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test293:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB293_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB293_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB293_1
	; PPC64LE-NEXT: .LBB293_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	define i32 @test294(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test294:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB294_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB294_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB294_1
	; PPC64LE-NEXT: .LBB294_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	define i64 @test295(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test295:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB295_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB295_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB295_1
	; PPC64LE-NEXT: .LBB295_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	define i64 @test296(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test296:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB296_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpd 4, 3
	; PPC64LE-NEXT: bge 0, .LBB296_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB296_1
	; PPC64LE-NEXT: .LBB296_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	define i64 @test297(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test297:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB297_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB297_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB297_1
	; PPC64LE-NEXT: .LBB297_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val release
	ret i64 %ret
	}

	define i64 @test298(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test298:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB298_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB298_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB298_1
	; PPC64LE-NEXT: .LBB298_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	define i64 @test299(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test299:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB299_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB299_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB299_1
	; PPC64LE-NEXT: .LBB299_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	define i8 @test300(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test300:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB300_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB300_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB300_1
	; PPC64LE-NEXT: .LBB300_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	define i8 @test301(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test301:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB301_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB301_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB301_1
	; PPC64LE-NEXT: .LBB301_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	define i8 @test302(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test302:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB302_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB302_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB302_1
	; PPC64LE-NEXT: .LBB302_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val release
	ret i8 %ret
	}

	define i8 @test303(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test303:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB303_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB303_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB303_1
	; PPC64LE-NEXT: .LBB303_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	define i8 @test304(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test304:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB304_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB304_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB304_1
	; PPC64LE-NEXT: .LBB304_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	define i16 @test305(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test305:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB305_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB305_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB305_1
	; PPC64LE-NEXT: .LBB305_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	define i16 @test306(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test306:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB306_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB306_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB306_1
	; PPC64LE-NEXT: .LBB306_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	define i16 @test307(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test307:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB307_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB307_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB307_1
	; PPC64LE-NEXT: .LBB307_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val release
	ret i16 %ret
	}

	define i16 @test308(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test308:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB308_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB308_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB308_1
	; PPC64LE-NEXT: .LBB308_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	define i16 @test309(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test309:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB309_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB309_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB309_1
	; PPC64LE-NEXT: .LBB309_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	define i32 @test310(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test310:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB310_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB310_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB310_1
	; PPC64LE-NEXT: .LBB310_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	define i32 @test311(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test311:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB311_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB311_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB311_1
	; PPC64LE-NEXT: .LBB311_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	define i32 @test312(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test312:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB312_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB312_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB312_1
	; PPC64LE-NEXT: .LBB312_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val release
	ret i32 %ret
	}

	define i32 @test313(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test313:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB313_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB313_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB313_1
	; PPC64LE-NEXT: .LBB313_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	define i32 @test314(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test314:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB314_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB314_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB314_1
	; PPC64LE-NEXT: .LBB314_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	define i64 @test315(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test315:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB315_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB315_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB315_1
	; PPC64LE-NEXT: .LBB315_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	define i64 @test316(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test316:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB316_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpld 4, 3
	; PPC64LE-NEXT: ble 0, .LBB316_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB316_1
	; PPC64LE-NEXT: .LBB316_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	define i64 @test317(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test317:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB317_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB317_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB317_1
	; PPC64LE-NEXT: .LBB317_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val release
	ret i64 %ret
	}

	define i64 @test318(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test318:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB318_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB318_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB318_1
	; PPC64LE-NEXT: .LBB318_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	define i64 @test319(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test319:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB319_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB319_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB319_1
	; PPC64LE-NEXT: .LBB319_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	define i8 @test320(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test320:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB320_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB320_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB320_1
	; PPC64LE-NEXT: .LBB320_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val monotonic
	ret i8 %ret
	}

	define i8 @test321(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test321:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB321_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB321_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB321_1
	; PPC64LE-NEXT: .LBB321_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val acquire
	ret i8 %ret
	}

	define i8 @test322(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test322:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB322_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB322_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB322_1
	; PPC64LE-NEXT: .LBB322_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val release
	ret i8 %ret
	}

	define i8 @test323(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test323:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB323_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB323_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB323_1
	; PPC64LE-NEXT: .LBB323_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val acq_rel
	ret i8 %ret
	}

	define i8 @test324(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test324:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB324_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB324_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB324_1
	; PPC64LE-NEXT: .LBB324_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val seq_cst
	ret i8 %ret
	}

	define i16 @test325(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test325:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB325_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB325_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB325_1
	; PPC64LE-NEXT: .LBB325_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val monotonic
	ret i16 %ret
	}

	define i16 @test326(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test326:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB326_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB326_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB326_1
	; PPC64LE-NEXT: .LBB326_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val acquire
	ret i16 %ret
	}

	define i16 @test327(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test327:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB327_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB327_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB327_1
	; PPC64LE-NEXT: .LBB327_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val release
	ret i16 %ret
	}

	define i16 @test328(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test328:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB328_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB328_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB328_1
	; PPC64LE-NEXT: .LBB328_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val acq_rel
	ret i16 %ret
	}

	define i16 @test329(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test329:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB329_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB329_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB329_1
	; PPC64LE-NEXT: .LBB329_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val seq_cst
	ret i16 %ret
	}

	define i32 @test330(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test330:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB330_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB330_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB330_1
	; PPC64LE-NEXT: .LBB330_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val monotonic
	ret i32 %ret
	}

	define i32 @test331(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test331:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB331_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB331_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB331_1
	; PPC64LE-NEXT: .LBB331_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val acquire
	ret i32 %ret
	}

	define i32 @test332(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test332:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB332_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB332_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB332_1
	; PPC64LE-NEXT: .LBB332_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val release
	ret i32 %ret
	}

	define i32 @test333(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test333:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB333_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB333_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB333_1
	; PPC64LE-NEXT: .LBB333_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val acq_rel
	ret i32 %ret
	}

	define i32 @test334(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test334:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB334_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB334_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB334_1
	; PPC64LE-NEXT: .LBB334_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val seq_cst
	ret i32 %ret
	}

	define i64 @test335(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test335:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB335_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB335_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB335_1
	; PPC64LE-NEXT: .LBB335_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val monotonic
	ret i64 %ret
	}

	define i64 @test336(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test336:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB336_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpld 4, 3
	; PPC64LE-NEXT: bge 0, .LBB336_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB336_1
	; PPC64LE-NEXT: .LBB336_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val acquire
	ret i64 %ret
	}

	define i64 @test337(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test337:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB337_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB337_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB337_1
	; PPC64LE-NEXT: .LBB337_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val release
	ret i64 %ret
	}

	define i64 @test338(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test338:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB338_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB338_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB338_1
	; PPC64LE-NEXT: .LBB338_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val acq_rel
	ret i64 %ret
	}

	define i64 @test339(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test339:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB339_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB339_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB339_1
	; PPC64LE-NEXT: .LBB339_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val seq_cst
	ret i64 %ret
	}

	define i8 @test340(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test340:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB340_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB340_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test341(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test341:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB341_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB341_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test342(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test342:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB342_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB342_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test343(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test343:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB343_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB343_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test344(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test344:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB344_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB344_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test345(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test345:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB345_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB345_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test346(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test346:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB346_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB346_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test347(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test347:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB347_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB347_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test348(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test348:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB348_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB348_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test349(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test349:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB349_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB349_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test350(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test350:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB350_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB350_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test351(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test351:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB351_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB351_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test352(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test352:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB352_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB352_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test353(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test353:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB353_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB353_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test354(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test354:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB354_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB354_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test355(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test355:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB355_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB355_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test356(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test356:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB356_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB356_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test357(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test357:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB357_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB357_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test358(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test358:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB358_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB358_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test359(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test359:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB359_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB359_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test360(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test360:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB360_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB360_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test361(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test361:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB361_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB361_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test362(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test362:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB362_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB362_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test363(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test363:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB363_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB363_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test364(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test364:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB364_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB364_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test365(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test365:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB365_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB365_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test366(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test366:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB366_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB366_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test367(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test367:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB367_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB367_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test368(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test368:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB368_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB368_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test369(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test369:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB369_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB369_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test370(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test370:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB370_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB370_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test371(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test371:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB371_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB371_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test372(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test372:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB372_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB372_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test373(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test373:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB373_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB373_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test374(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test374:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB374_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB374_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test375(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test375:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB375_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB375_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test376(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test376:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB376_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: add 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB376_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test377(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test377:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB377_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB377_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test378(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test378:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB378_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB378_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test379(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test379:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB379_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: add 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB379_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test380(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test380:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB380_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB380_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test381(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test381:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB381_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: subf 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB381_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test382(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test382:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB382_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB382_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test383(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test383:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB383_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB383_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test384(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test384:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB384_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB384_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test385(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test385:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB385_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB385_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test386(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test386:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB386_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: subf 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB386_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test387(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test387:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB387_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB387_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test388(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test388:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB388_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB388_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test389(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test389:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB389_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB389_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test390(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test390:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB390_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB390_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test391(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test391:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB391_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: subf 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB391_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test392(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test392:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB392_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB392_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test393(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test393:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB393_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB393_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test394(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test394:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB394_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: subf 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB394_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test395(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test395:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB395_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB395_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test396(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test396:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB396_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: sub 6, 3, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB396_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test397(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test397:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB397_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB397_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test398(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test398:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB398_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB398_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test399(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test399:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB399_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: sub 6, 5, 4
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB399_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test400(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test400:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB400_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB400_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test401(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test401:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB401_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB401_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test402(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test402:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB402_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB402_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test403(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test403:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB403_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB403_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test404(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test404:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB404_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB404_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test405(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test405:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB405_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB405_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test406(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test406:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB406_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB406_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test407(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test407:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB407_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB407_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test408(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test408:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB408_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB408_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test409(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test409:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB409_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB409_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test410(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test410:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB410_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB410_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test411(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test411:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB411_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB411_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test412(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test412:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB412_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB412_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test413(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test413:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB413_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB413_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test414(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test414:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB414_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB414_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test415(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test415:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB415_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB415_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test416(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test416:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB416_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: and 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB416_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test417(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test417:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB417_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB417_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test418(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test418:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB418_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB418_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test419(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test419:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB419_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: and 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB419_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test420(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test420:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB420_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB420_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test421(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test421:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB421_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB421_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test422(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test422:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB422_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB422_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test423(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test423:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB423_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB423_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test424(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test424:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB424_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB424_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test425(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test425:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB425_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB425_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test426(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test426:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB426_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB426_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test427(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test427:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB427_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB427_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test428(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test428:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB428_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB428_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test429(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test429:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB429_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB429_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test430(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test430:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB430_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB430_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test431(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test431:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB431_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB431_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test432(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test432:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB432_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB432_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test433(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test433:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB433_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB433_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test434(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test434:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB434_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB434_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test435(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test435:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB435_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB435_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test436(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test436:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB436_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: nand 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB436_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test437(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test437:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB437_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB437_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test438(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test438:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB438_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB438_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test439(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test439:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB439_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: nand 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB439_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test440(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test440:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB440_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB440_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test441(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test441:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB441_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB441_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test442(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test442:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB442_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB442_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test443(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test443:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB443_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB443_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test444(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test444:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB444_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB444_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test445(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test445:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB445_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB445_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test446(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test446:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB446_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB446_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test447(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test447:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB447_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB447_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test448(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test448:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB448_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB448_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test449(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test449:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB449_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB449_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test450(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test450:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB450_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB450_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test451(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test451:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB451_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB451_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test452(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test452:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB452_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB452_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test453(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test453:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB453_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB453_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test454(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test454:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB454_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB454_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test455(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test455:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB455_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB455_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test456(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test456:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB456_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: or 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB456_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test457(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test457:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB457_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB457_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test458(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test458:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB458_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB458_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test459(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test459:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB459_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: or 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB459_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test460(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test460:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB460_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB460_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test461(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test461:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB461_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: stbcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB461_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test462(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test462:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB462_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB462_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test463(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test463:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB463_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB463_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test464(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test464:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB464_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stbcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB464_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test465(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test465:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB465_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB465_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test466(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test466:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB466_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: sthcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB466_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test467(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test467:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB467_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB467_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test468(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test468:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB468_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB468_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test469(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test469:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB469_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: sthcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB469_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test470(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test470:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB470_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB470_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test471(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test471:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB471_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: stwcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB471_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test472(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test472:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB472_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB472_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test473(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test473:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB473_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB473_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test474(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test474:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB474_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stwcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB474_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test475(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test475:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB475_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB475_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test476(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test476:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB476_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: xor 6, 4, 3
	; PPC64LE-NEXT: stdcx. 6, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB476_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test477(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test477:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB477_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB477_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test478(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test478:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB478_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB478_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test479(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test479:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB479_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: xor 6, 4, 5
	; PPC64LE-NEXT: stdcx. 6, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB479_1
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test480(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test480:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB480_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB480_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB480_1
	; PPC64LE-NEXT: .LBB480_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test481(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test481:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB481_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: extsb 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB481_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB481_1
	; PPC64LE-NEXT: .LBB481_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test482(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test482:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB482_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB482_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB482_1
	; PPC64LE-NEXT: .LBB482_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test483(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test483:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB483_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB483_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB483_1
	; PPC64LE-NEXT: .LBB483_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test484(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test484:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB484_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB484_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB484_1
	; PPC64LE-NEXT: .LBB484_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test485(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test485:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB485_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB485_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB485_1
	; PPC64LE-NEXT: .LBB485_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test486(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test486:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB486_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: extsh 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB486_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB486_1
	; PPC64LE-NEXT: .LBB486_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test487(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test487:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB487_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB487_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB487_1
	; PPC64LE-NEXT: .LBB487_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test488(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test488:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB488_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB488_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB488_1
	; PPC64LE-NEXT: .LBB488_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test489(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test489:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB489_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: ble 0, .LBB489_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB489_1
	; PPC64LE-NEXT: .LBB489_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test490(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test490:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB490_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB490_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB490_1
	; PPC64LE-NEXT: .LBB490_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test491(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test491:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB491_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmpw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB491_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB491_1
	; PPC64LE-NEXT: .LBB491_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test492(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test492:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB492_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB492_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB492_1
	; PPC64LE-NEXT: .LBB492_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test493(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test493:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB493_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB493_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB493_1
	; PPC64LE-NEXT: .LBB493_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test494(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test494:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB494_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB494_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB494_1
	; PPC64LE-NEXT: .LBB494_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test495(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test495:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB495_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB495_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB495_1
	; PPC64LE-NEXT: .LBB495_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test496(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test496:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB496_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpd 4, 3
	; PPC64LE-NEXT: ble 0, .LBB496_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB496_1
	; PPC64LE-NEXT: .LBB496_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test497(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test497:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB497_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB497_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB497_1
	; PPC64LE-NEXT: .LBB497_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test498(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test498:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB498_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB498_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB498_1
	; PPC64LE-NEXT: .LBB498_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test499(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test499:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB499_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: ble 0, .LBB499_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB499_1
	; PPC64LE-NEXT: .LBB499_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test500(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test500:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB500_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB500_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB500_1
	; PPC64LE-NEXT: .LBB500_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test501(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test501:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB501_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: extsb 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB501_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB501_1
	; PPC64LE-NEXT: .LBB501_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test502(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test502:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB502_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB502_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB502_1
	; PPC64LE-NEXT: .LBB502_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test503(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test503:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB503_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB503_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB503_1
	; PPC64LE-NEXT: .LBB503_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test504(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test504:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB504_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: extsb 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB504_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB504_1
	; PPC64LE-NEXT: .LBB504_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test505(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test505:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB505_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB505_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB505_1
	; PPC64LE-NEXT: .LBB505_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test506(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test506:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB506_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: extsh 6, 3
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB506_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB506_1
	; PPC64LE-NEXT: .LBB506_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test507(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test507:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB507_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB507_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB507_1
	; PPC64LE-NEXT: .LBB507_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test508(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test508:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB508_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB508_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB508_1
	; PPC64LE-NEXT: .LBB508_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test509(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test509:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB509_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: extsh 6, 5
	; PPC64LE-NEXT: cmpw 4, 6
	; PPC64LE-NEXT: bge 0, .LBB509_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB509_1
	; PPC64LE-NEXT: .LBB509_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test510(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test510:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB510_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB510_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB510_1
	; PPC64LE-NEXT: .LBB510_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test511(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test511:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB511_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmpw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB511_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB511_1
	; PPC64LE-NEXT: .LBB511_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test512(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test512:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB512_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB512_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB512_1
	; PPC64LE-NEXT: .LBB512_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test513(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test513:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB513_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB513_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB513_1
	; PPC64LE-NEXT: .LBB513_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test514(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test514:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB514_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmpw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB514_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB514_1
	; PPC64LE-NEXT: .LBB514_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test515(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test515:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB515_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB515_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB515_1
	; PPC64LE-NEXT: .LBB515_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test516(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test516:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB516_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpd 4, 3
	; PPC64LE-NEXT: bge 0, .LBB516_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB516_1
	; PPC64LE-NEXT: .LBB516_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test517(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test517:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB517_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB517_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB517_1
	; PPC64LE-NEXT: .LBB517_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test518(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test518:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB518_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB518_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB518_1
	; PPC64LE-NEXT: .LBB518_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test519(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test519:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB519_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpd 4, 5
	; PPC64LE-NEXT: bge 0, .LBB519_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB519_1
	; PPC64LE-NEXT: .LBB519_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	define i8 @test520(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test520:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB520_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB520_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB520_1
	; PPC64LE-NEXT: .LBB520_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	define i8 @test521(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test521:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB521_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB521_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB521_1
	; PPC64LE-NEXT: .LBB521_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	define i8 @test522(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test522:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB522_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB522_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB522_1
	; PPC64LE-NEXT: .LBB522_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	define i8 @test523(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test523:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB523_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB523_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB523_1
	; PPC64LE-NEXT: .LBB523_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	define i8 @test524(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test524:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB524_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB524_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB524_1
	; PPC64LE-NEXT: .LBB524_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	define i16 @test525(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test525:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB525_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB525_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB525_1
	; PPC64LE-NEXT: .LBB525_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	define i16 @test526(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test526:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB526_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB526_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB526_1
	; PPC64LE-NEXT: .LBB526_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	define i16 @test527(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test527:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB527_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB527_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB527_1
	; PPC64LE-NEXT: .LBB527_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	define i16 @test528(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test528:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB528_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB528_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB528_1
	; PPC64LE-NEXT: .LBB528_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	define i16 @test529(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test529:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB529_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB529_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB529_1
	; PPC64LE-NEXT: .LBB529_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	define i32 @test530(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test530:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB530_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB530_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB530_1
	; PPC64LE-NEXT: .LBB530_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	define i32 @test531(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test531:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB531_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: ble 0, .LBB531_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB531_1
	; PPC64LE-NEXT: .LBB531_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	define i32 @test532(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test532:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB532_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB532_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB532_1
	; PPC64LE-NEXT: .LBB532_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	define i32 @test533(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test533:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB533_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB533_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB533_1
	; PPC64LE-NEXT: .LBB533_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	define i32 @test534(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test534:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB534_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: ble 0, .LBB534_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB534_1
	; PPC64LE-NEXT: .LBB534_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	define i64 @test535(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test535:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB535_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB535_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB535_1
	; PPC64LE-NEXT: .LBB535_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	define i64 @test536(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test536:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB536_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpld 4, 3
	; PPC64LE-NEXT: ble 0, .LBB536_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB536_1
	; PPC64LE-NEXT: .LBB536_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	define i64 @test537(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test537:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB537_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB537_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB537_1
	; PPC64LE-NEXT: .LBB537_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	define i64 @test538(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test538:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB538_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB538_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB538_1
	; PPC64LE-NEXT: .LBB538_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	define i64 @test539(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test539:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB539_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: ble 0, .LBB539_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB539_1
	; PPC64LE-NEXT: .LBB539_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	; atomicrmw umin on i8 with syncscope("singlethread") and monotonic ordering.
	; Expects a bare lbarx/stbcx. retry loop with no lwsync or sync around it.
	define i8 @test540(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test540:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB540_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB540_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB540_1
	; PPC64LE-NEXT: .LBB540_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") monotonic
	ret i8 %ret
	}

	; atomicrmw umin on i8 with syncscope("singlethread") and acquire ordering.
	; Expects the lbarx/stbcx. retry loop followed by a single lwsync before blr;
	; the pointer is first copied to r5 so the loaded value lands directly in r3.
	define i8 @test541(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test541:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB541_1:
	; PPC64LE-NEXT: lbarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB541_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB541_1
	; PPC64LE-NEXT: .LBB541_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acquire
	ret i8 %ret
	}

	; atomicrmw umin on i8 with syncscope("singlethread") and release ordering.
	; Expects lwsync ahead of the lbarx/stbcx. retry loop and no barrier after it.
	define i8 @test542(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test542:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB542_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB542_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB542_1
	; PPC64LE-NEXT: .LBB542_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") release
	ret i8 %ret
	}

	; atomicrmw umin on i8 with syncscope("singlethread") and acq_rel ordering.
	; Expects lwsync both before and after the lbarx/stbcx. retry loop.
	define i8 @test543(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test543:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB543_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB543_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB543_1
	; PPC64LE-NEXT: .LBB543_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acq_rel
	ret i8 %ret
	}

	; atomicrmw umin on i8 with syncscope("singlethread") and seq_cst ordering.
	; Expects sync ahead of the lbarx/stbcx. retry loop and lwsync after it.
	define i8 @test544(i8* %ptr, i8 %val) {
	; PPC64LE-LABEL: test544:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB544_1:
	; PPC64LE-NEXT: lbarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB544_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stbcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB544_1
	; PPC64LE-NEXT: .LBB544_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") seq_cst
	ret i8 %ret
	}

	; atomicrmw umin on i16 with syncscope("singlethread") and monotonic ordering.
	; Expects a bare lharx/sthcx. retry loop with no lwsync or sync around it.
	define i16 @test545(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test545:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB545_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB545_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB545_1
	; PPC64LE-NEXT: .LBB545_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") monotonic
	ret i16 %ret
	}

	; atomicrmw umin on i16 with syncscope("singlethread") and acquire ordering.
	; Expects the lharx/sthcx. retry loop followed by a single lwsync before blr;
	; the pointer is first copied to r5 so the loaded value lands directly in r3.
	define i16 @test546(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test546:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB546_1:
	; PPC64LE-NEXT: lharx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB546_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB546_1
	; PPC64LE-NEXT: .LBB546_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acquire
	ret i16 %ret
	}

	; atomicrmw umin on i16 with syncscope("singlethread") and release ordering.
	; Expects lwsync ahead of the lharx/sthcx. retry loop and no barrier after it.
	define i16 @test547(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test547:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB547_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB547_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB547_1
	; PPC64LE-NEXT: .LBB547_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") release
	ret i16 %ret
	}

	; atomicrmw umin on i16 with syncscope("singlethread") and acq_rel ordering.
	; Expects lwsync both before and after the lharx/sthcx. retry loop.
	define i16 @test548(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test548:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB548_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB548_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB548_1
	; PPC64LE-NEXT: .LBB548_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acq_rel
	ret i16 %ret
	}

	; atomicrmw umin on i16 with syncscope("singlethread") and seq_cst ordering.
	; Expects sync ahead of the lharx/sthcx. retry loop and lwsync after it.
	define i16 @test549(i16* %ptr, i16 %val) {
	; PPC64LE-LABEL: test549:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB549_1:
	; PPC64LE-NEXT: lharx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB549_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: sthcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB549_1
	; PPC64LE-NEXT: .LBB549_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") seq_cst
	ret i16 %ret
	}

	; atomicrmw umin on i32 with syncscope("singlethread") and monotonic ordering.
	; Expects a bare lwarx/stwcx. retry loop with no lwsync or sync around it.
	define i32 @test550(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test550:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB550_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB550_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB550_1
	; PPC64LE-NEXT: .LBB550_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") monotonic
	ret i32 %ret
	}

	; atomicrmw umin on i32 with syncscope("singlethread") and acquire ordering.
	; Expects the lwarx/stwcx. retry loop followed by a single lwsync before blr;
	; the pointer is first copied to r5 so the loaded value lands directly in r3.
	define i32 @test551(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test551:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB551_1:
	; PPC64LE-NEXT: lwarx 3, 0, 5
	; PPC64LE-NEXT: cmplw 4, 3
	; PPC64LE-NEXT: bge 0, .LBB551_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB551_1
	; PPC64LE-NEXT: .LBB551_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acquire
	ret i32 %ret
	}

	; atomicrmw umin on i32 with syncscope("singlethread") and release ordering.
	; Expects lwsync ahead of the lwarx/stwcx. retry loop and no barrier after it.
	define i32 @test552(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test552:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB552_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB552_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB552_1
	; PPC64LE-NEXT: .LBB552_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") release
	ret i32 %ret
	}

	; atomicrmw umin on i32 with syncscope("singlethread") and acq_rel ordering.
	; Expects lwsync both before and after the lwarx/stwcx. retry loop.
	define i32 @test553(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test553:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB553_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB553_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB553_1
	; PPC64LE-NEXT: .LBB553_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acq_rel
	ret i32 %ret
	}

	; atomicrmw umin on i32 with syncscope("singlethread") and seq_cst ordering.
	; Expects sync ahead of the lwarx/stwcx. retry loop and lwsync after it.
	define i32 @test554(i32* %ptr, i32 %val) {
	; PPC64LE-LABEL: test554:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB554_1:
	; PPC64LE-NEXT: lwarx 5, 0, 3
	; PPC64LE-NEXT: cmplw 4, 5
	; PPC64LE-NEXT: bge 0, .LBB554_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stwcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB554_1
	; PPC64LE-NEXT: .LBB554_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") seq_cst
	ret i32 %ret
	}

	; atomicrmw umin on i64 with syncscope("singlethread") and monotonic ordering.
	; Expects a bare ldarx/stdcx. retry loop with no lwsync or sync around it.
	define i64 @test555(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test555:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: .LBB555_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB555_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB555_1
	; PPC64LE-NEXT: .LBB555_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") monotonic
	ret i64 %ret
	}

	; atomicrmw umin on i64 with syncscope("singlethread") and acquire ordering.
	; Expects the ldarx/stdcx. retry loop followed by a single lwsync before blr;
	; the pointer is first copied to r5 so the loaded value lands directly in r3.
	define i64 @test556(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test556:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: mr 5, 3
	; PPC64LE-NEXT: .LBB556_1:
	; PPC64LE-NEXT: ldarx 3, 0, 5
	; PPC64LE-NEXT: cmpld 4, 3
	; PPC64LE-NEXT: bge 0, .LBB556_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 5
	; PPC64LE-NEXT: bne 0, .LBB556_1
	; PPC64LE-NEXT: .LBB556_3:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acquire
	ret i64 %ret
	}

	; atomicrmw umin on i64 with syncscope("singlethread") and release ordering.
	; Expects lwsync ahead of the ldarx/stdcx. retry loop and no barrier after it.
	define i64 @test557(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test557:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB557_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB557_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB557_1
	; PPC64LE-NEXT: .LBB557_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") release
	ret i64 %ret
	}

	; atomicrmw umin on i64 with syncscope("singlethread") and acq_rel ordering.
	; Expects lwsync both before and after the ldarx/stdcx. retry loop.
	define i64 @test558(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test558:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: .LBB558_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB558_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB558_1
	; PPC64LE-NEXT: .LBB558_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acq_rel
	ret i64 %ret
	}

	; atomicrmw umin on i64 with syncscope("singlethread") and seq_cst ordering.
	; Expects sync ahead of the ldarx/stdcx. retry loop and lwsync after it.
	define i64 @test559(i64* %ptr, i64 %val) {
	; PPC64LE-LABEL: test559:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: sync
	; PPC64LE-NEXT: .LBB559_1:
	; PPC64LE-NEXT: ldarx 5, 0, 3
	; PPC64LE-NEXT: cmpld 4, 5
	; PPC64LE-NEXT: bge 0, .LBB559_3
	; PPC64LE-NEXT: # %bb.2:
	; PPC64LE-NEXT: stdcx. 4, 0, 3
	; PPC64LE-NEXT: bne 0, .LBB559_1
	; PPC64LE-NEXT: .LBB559_3:
	; PPC64LE-NEXT: mr 3, 5
	; PPC64LE-NEXT: lwsync
	; PPC64LE-NEXT: blr
	%ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") seq_cst
	ret i64 %ret
	}

	; The second load should never be scheduled before isync.
	; An acquire atomic load of %ptr1 is followed by a plain load of the same
	; pointer (%ptr2 is unused). The expected output keeps the second lwz after
	; the cmpd / bne- / isync sequence that follows the first load.
	define i32 @test_ordering0(i32* %ptr1, i32* %ptr2) {
	; PPC64LE-LABEL: test_ordering0:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwz 4, 0(3)
	; PPC64LE-NEXT: cmpd 7, 4, 4
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: lwz 3, 0(3)
	; PPC64LE-NEXT: add 3, 4, 3
	; PPC64LE-NEXT: blr
	%val1 = load atomic i32, i32* %ptr1 acquire, align 4
	%val2 = load i32, i32* %ptr1
	%add = add i32 %val1, %val2
	ret i32 %add
	}

	; The second store should never be scheduled before isync.
	; An acquire atomic load of %ptr1 is followed by a plain store of %val1 to
	; %ptr2. The expected output keeps the stw after the cmpd / bne- / isync
	; sequence that follows the load.
	define i32 @test_ordering1(i32* %ptr1, i32 %val1, i32* %ptr2) {
	; PPC64LE-LABEL: test_ordering1:
	; PPC64LE: # %bb.0:
	; PPC64LE-NEXT: lwz 3, 0(3)
	; PPC64LE-NEXT: cmpd 7, 3, 3
	; PPC64LE-NEXT: bne- 7, .+4
	; PPC64LE-NEXT: isync
	; PPC64LE-NEXT: stw 4, 0(5)
	; PPC64LE-NEXT: blr
	%val2 = load atomic i32, i32* %ptr1 acquire, align 4
	store i32 %val1, i32* %ptr2
	ret i32 %val2
	}
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (revision 328362)
	@@ -1,4782 +1,4821 @@
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s

	; FIXME: All cases here should be fixed by PR34380

	; Unmasked shufflevector from a <16 x i16> register argument to <8 x i16>.
	; Expects xmm shuffles of xmm0 and of the vextracti128 result, merged with vpblendw.
	define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
	; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
	ret <8 x i16> %res
	}
	; Merge-masked variant: lanes where %mask is zero take the shuffle result, the
	; remaining lanes take %vec2. Expects the shuffle sequence followed by
	; vpcmpeqw into k1 and a vpblendmw under {%k1}.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked variant: lanes where %mask is zero take the shuffle result, the
	; remaining lanes are zero. Expects the shuffle sequence followed by
	; vpcmpeqw into k1 and a vmovdqu16 with {%k1} {z}.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Merge-masked <16 x i16> to <8 x i16> shuffle, second index pattern: lanes
	; where %mask is zero take the shuffle result, the rest take %vec2.
	; Expects vpcmpeqw into k1 and a vpblendmw under {%k1} after the shuffle.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked <16 x i16> to <8 x i16> shuffle, second index pattern: lanes
	; where %mask is zero take the shuffle result, the rest are zero.
	; Expects vpcmpeqw into k1 and a vmovdqu16 with {%k1} {z} after the shuffle.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Merge-masked <16 x i16> to <8 x i16> shuffle, third index pattern: lanes
	; where %mask is zero take the shuffle result, the rest take %vec2.
	; Expects vpcmpeqw into k1 and a vpblendmw under {%k1} after the shuffle.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5,6],xmm3[7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked <16 x i16> to <8 x i16> shuffle, third index pattern: lanes
	; where %mask is zero take the shuffle result, the rest are zero.
	; Expects vpcmpeqw into k1 and a vmovdqu16 with {%k1} {z} after the shuffle.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
	; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Unmasked <16 x i16> to <8 x i16> shuffle, fourth index pattern.
	; Expects one vpshufb on xmm0, one on the vextracti128 result, and a vpblendw.
	define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
	; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
	ret <8 x i16> %res
	}
	; Merge-masked <16 x i16> to <8 x i16> shuffle, fourth index pattern: lanes
	; where %mask is zero take the shuffle result, the rest take %vec2.
	; Expects vpcmpeqw into k1 and a vpblendmw under {%k1} after the shuffle.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked <16 x i16> to <8 x i16> shuffle, fourth index pattern: lanes
	; where %mask is zero take the shuffle result, the rest are zero.
	; Expects vpcmpeqw into k1 and a vmovdqu16 with {%k1} {z} after the shuffle.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Unmasked <16 x i16> to <8 x i16> shuffle whose source vector is loaded from %vp.
	; Expects a full vmovdqa load from (%rdi) followed by xmm shuffles and a vpblendw.
	define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
	; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm0
	; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,0]
	; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
	ret <8 x i16> %res
	}
	; Merge-masked shuffle of a <16 x i16> vector loaded from %vp: lanes where
	; %mask is zero take the shuffle result, the rest take %vec2.
	; Expects vpcmpeqw into k1 and a vmovdqu16 into xmm0 under {%k1}.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,0]
	; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6],xmm2[7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked shuffle of a <16 x i16> vector loaded from %vp: lanes where
	; %mask is zero take the shuffle result, the rest are zero.
	; Expects vpcmpeqw into k1 and a vmovdqu16 into xmm0 with {%k1} {z}.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,0]
	; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6],xmm1[7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Merge-masked shuffle of a loaded <16 x i16> vector, second index pattern:
	; lanes where %mask is zero take the shuffle result, the rest take %vec2.
	; Expects two vpshufb plus vpblendw, then vpcmpeqw into k1 and vmovdqu16 {%k1}.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked shuffle of a loaded <16 x i16> vector, second index pattern:
	; lanes where %mask is zero take the shuffle result, the rest are zero.
	; Expects two vpshufb plus vpblendw, then vpcmpeqw into k1 and vmovdqu16 {%k1} {z}.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
	; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Merge-masked shuffle of a loaded <16 x i16> vector, third index pattern:
	; lanes where %mask is zero take the shuffle result, the rest take %vec2.
	; Expects vpsrld/vpshufb/vpblendw, then vpcmpeqw into k1 and vmovdqu16 {%k1}.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked shuffle of a loaded <16 x i16> vector, third index pattern:
	; lanes where %mask is zero take the shuffle result, the rest are zero.
	; Expects vpsrld/vpshufb/vpblendw, then vpcmpeqw into k1 and vmovdqu16 {%k1} {z}.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
	; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
	; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Unmasked shuffle of a <16 x i16> vector loaded from %vp, fourth index pattern.
	; Expects a vpblendd of the vextracti128 result with xmm0, then a single vpshufb.
	define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
	; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm0
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
	; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
	ret <8 x i16> %res
	}
	; Merge-masked shuffle of a loaded <16 x i16> vector, fourth index pattern:
	; lanes where %mask is zero take the shuffle result, the rest take %vec2.
	; Expects vpblendd/vpshufb, then vpcmpeqw into k1 and vmovdqu16 {%k1}.
	define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
	; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked shuffle of a loaded <16 x i16> vector, fourth index pattern:
	; lanes where %mask is zero take the shuffle result, the rest are zero.
	; Expects vpblendd/vpshufb, then vpcmpeqw into k1 and vmovdqu16 {%k1} {z}.
	define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
	; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i16>, <16 x i16>* %vp
	%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Unmasked shufflevector from a <32 x i16> register argument to <16 x i16>.
	; Expects vextracti64x4 plus a vpermi2w driven by a constant index vector in ymm1.
	define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
	; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
	; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
	ret <16 x i16> %res
	}
	; Merge-masked <32 x i16> to <16 x i16> shuffle: lanes where %mask is zero take
	; the shuffle result, the rest take %vec2. Expects an unmasked vpermi2w
	; followed by vpcmpeqw into k1 and a vpblendmw under {%k1}.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked <32 x i16> to <16 x i16> shuffle: lanes where %mask is zero take
	; the shuffle result, the rest are zero. Expects vpcmpeqw into k1 first and
	; the vpermi2w itself carrying {%k1} {z}.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}
	; Merge-masked <32 x i16> to <16 x i16> shuffle, second index pattern: lanes
	; where %mask is zero take the shuffle result, the rest take %vec2. Expects an
	; unmasked vpermi2w followed by vpcmpeqw into k1 and a vpblendmw under {%k1}.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (register source), second index pattern:
	; picks 16 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vextracti64x4 (upper half), vpcmpeqw against a zeroed
	; register into %k1, one vpermi2w with {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}
	; Merge-masked partial permute (register source), third index pattern:
	; picks 16 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vextracti64x4 (upper half), vpermi2w, vpcmpeqw against a
	; zeroed register into %k1, then vpblendmw under %k1.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (register source), third index pattern:
	; picks 16 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vextracti64x4 (upper half), vpcmpeqw against a zeroed
	; register into %k1, one vpermi2w with {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}
	; Unmasked partial permute (register source): returns 16 constant-indexed
	; i16 lanes of the 32-lane %vec.
	; Expected code: vextracti64x4 (upper half), one vpermi2w over both halves,
	; and a copy of the result into %ymm0.
	define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
	; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
	; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
	ret <16 x i16> %res
	}
	; Merge-masked partial permute (register source), fourth index pattern:
	; picks 16 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vextracti64x4 (upper half), vpermi2w, vpcmpeqw against a
	; zeroed register into %k1, then vpblendmw under %k1.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
	; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (register source), fourth index pattern:
	; picks 16 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vextracti64x4 (upper half), vpcmpeqw against a zeroed
	; register into %k1, one vpermi2w with {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}
	; Unmasked partial permute (register source): returns 8 constant-indexed i16
	; lanes of the 32-lane %vec.
	; Expected code: vextracti64x4 (upper half), a 256-bit vpermi2w whose index
	; table has its upper eight entries printed as undefined (u), a copy of the
	; low 128 bits into %xmm0, and vzeroupper before returning.
	define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
	; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
	ret <8 x i16> %res
	}
	; Merge-masked partial permute (register source): picks 8 constant-indexed
	; i16 lanes out of the 32-lane %vec; each result lane is the shuffled value
	; when the matching %mask lane equals zero, otherwise the %vec2 lane.
	; Expected code: vextracti64x4 (upper half), 256-bit vpermi2w, 128-bit
	; vpcmpeqw against a zeroed register into %k1, vpblendmw under %k1, and
	; vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (register source): picks 8 constant-indexed
	; i16 lanes out of the 32-lane %vec; each result lane is the shuffled value
	; when the matching %mask lane equals zero, otherwise zero.
	; Expected code: vextracti64x4 (upper half), an unmasked 256-bit vpermi2w,
	; 128-bit vpcmpeqw against a zeroed register into %k1, a zero-masking
	; vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Merge-masked partial permute (register source), second index pattern:
	; picks 8 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vextracti64x4 (upper half), 256-bit vpermi2w, 128-bit
	; vpcmpeqw against a zeroed register into %k1, vpblendmw under %k1, and
	; vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (register source), second index pattern:
	; picks 8 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vextracti64x4 (upper half), an unmasked 256-bit vpermi2w,
	; 128-bit vpcmpeqw against a zeroed register into %k1, a zero-masking
	; vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Merge-masked partial permute (register source), third index pattern:
	; picks 8 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vextracti64x4 (upper half), 256-bit vpermi2w, 128-bit
	; vpcmpeqw against a zeroed register into %k1, vpblendmw under %k1, and
	; vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (register source), third index pattern:
	; picks 8 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vextracti64x4 (upper half), an unmasked 256-bit vpermi2w,
	; 128-bit vpcmpeqw against a zeroed register into %k1, a zero-masking
	; vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Unmasked partial permute (register source), fourth index pattern: returns
	; 8 constant-indexed i16 lanes of the 32-lane %vec.
	; Expected code: vextracti64x4 (upper half), a 256-bit vpermi2w whose index
	; table has its upper eight entries printed as undefined (u), a copy of the
	; low 128 bits into %xmm0, and vzeroupper before returning.
	define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
	; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
	ret <8 x i16> %res
	}
	; Merge-masked partial permute (register source), fourth index pattern:
	; picks 8 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vextracti64x4 (upper half), 256-bit vpermi2w, 128-bit
	; vpcmpeqw against a zeroed register into %k1, vpblendmw under %k1, and
	; vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (register source), fourth index pattern:
	; picks 8 constant-indexed i16 lanes out of the 32-lane %vec; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vextracti64x4 (upper half), an unmasked 256-bit vpermi2w,
	; 128-bit vpcmpeqw against a zeroed register into %k1, a zero-masking
	; vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}
	; Unmasked partial permute (memory source): loads a <32 x i16> through %vp
	; and returns 16 constant-indexed lanes of it.
	; Expected code: a full vmovdqa64 load from (%rdi), vextracti64x4 (upper
	; half), and one vpermi2w that leaves the result directly in %ymm0.
	define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
	; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
	; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
	ret <16 x i16> %res
	}
	; Merge-masked partial permute (memory source): loads a <32 x i16> through
	; %vp and picks 16 constant-indexed lanes; each result lane is the shuffled
	; value when the matching %mask lane equals zero, otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpermi2w, vpcmpeqw against a zeroed register into %k1, then a
	; merge-masking vmovdqu16 of the permute result into %ymm0 under %k1.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (memory source): loads a <32 x i16> through
	; %vp and picks 16 constant-indexed lanes; each result lane is the shuffled
	; value when the matching %mask lane equals zero, otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpcmpeqw against a zeroed register into %k1, one vpermi2w with
	; {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}

	; Merge-masked partial permute (memory source), second index pattern: loads
	; a <32 x i16> through %vp and picks 16 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpermi2w, vpcmpeqw against a zeroed register into %k1, then a
	; merge-masking vmovdqu16 of the permute result into %ymm0 under %k1.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (memory source), second index pattern: loads
	; a <32 x i16> through %vp and picks 16 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpcmpeqw against a zeroed register into %k1, one vpermi2w with
	; {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}

	; Merge-masked partial permute (memory source), third index pattern: loads
	; a <32 x i16> through %vp and picks 16 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpermi2w, vpcmpeqw against a zeroed register into %k1, then a
	; merge-masking vmovdqu16 of the permute result into %ymm0 under %k1.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
	; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (memory source), third index pattern: loads
	; a <32 x i16> through %vp and picks 16 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpcmpeqw against a zeroed register into %k1, one vpermi2w with
	; {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}

	; Unmasked partial permute (memory source), fourth index pattern: loads a
	; <32 x i16> through %vp and returns 16 constant-indexed lanes of it.
	; Expected code: a full vmovdqa64 load from (%rdi), vextracti64x4 (upper
	; half), and one vpermi2w that leaves the result directly in %ymm0.
	define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
	; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
	; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
	ret <16 x i16> %res
	}
	; Merge-masked partial permute (memory source), fourth index pattern: loads
	; a <32 x i16> through %vp and picks 16 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpermi2w, vpcmpeqw against a zeroed register into %k1, then a
	; merge-masking vmovdqu16 of the permute result into %ymm0 under %k1.
	define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
	ret <16 x i16> %res
	}

	; Zero-masked partial permute (memory source), fourth index pattern: loads
	; a <32 x i16> through %vp and picks 16 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; vpcmpeqw against a zeroed register into %k1, one vpermi2w with
	; {%k1} {z}, then a copy into %ymm0.
	define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
	%cmp = icmp eq <16 x i16> %mask, zeroinitializer
	%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
	ret <16 x i16> %res
	}

	; Unmasked partial permute (memory source): loads a <32 x i16> through %vp
	; and returns 8 constant-indexed lanes of it.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half), a
	; 256-bit vpermi2w into %ymm0 (upper eight index entries printed as u), a
	; register "kill" annotation narrowing %ymm0 to %xmm0, and vzeroupper.
	define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
	; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
	; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
	ret <8 x i16> %res
	}
	; Merge-masked partial permute (memory source): loads a <32 x i16> through
	; %vp and picks 8 constant-indexed lanes; each result lane is the shuffled
	; value when the matching %mask lane equals zero, otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register into %k1, a
	; merge-masking vmovdqu16 into %xmm0 under %k1, and vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (memory source): loads a <32 x i16> through
	; %vp and picks 8 constant-indexed lanes; each result lane is the shuffled
	; value when the matching %mask lane equals zero, otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half), an
	; unmasked 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register
	; into %k1, a zero-masking vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Merge-masked partial permute (memory source), second index pattern: loads
	; a <32 x i16> through %vp and picks 8 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register into %k1, a
	; merge-masking vmovdqu16 into %xmm0 under %k1, and vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (memory source), second index pattern: loads
	; a <32 x i16> through %vp and picks 8 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half), an
	; unmasked 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register
	; into %k1, a zero-masking vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Merge-masked partial permute (memory source), third index pattern: loads
	; a <32 x i16> through %vp and picks 8 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register into %k1, a
	; merge-masking vmovdqu16 into %xmm0 under %k1, and vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (memory source), third index pattern: loads
	; a <32 x i16> through %vp and picks 8 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half), an
	; unmasked 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register
	; into %k1, a zero-masking vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Unmasked partial permute (memory source), fourth index pattern: loads a
	; <32 x i16> through %vp and returns 8 constant-indexed lanes of it.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half), a
	; 256-bit vpermi2w into %ymm0 (upper eight index entries printed as u), a
	; register "kill" annotation narrowing %ymm0 to %xmm0, and vzeroupper.
	define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
	; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
	ret <8 x i16> %res
	}
	; Merge-masked partial permute (memory source), fourth index pattern: loads
	; a <32 x i16> through %vp and picks 8 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise the %vec2 lane.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half),
	; 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register into %k1, a
	; merge-masking vmovdqu16 into %xmm0 under %k1, and vzeroupper.
	define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
	ret <8 x i16> %res
	}

	; Zero-masked partial permute (memory source), fourth index pattern: loads
	; a <32 x i16> through %vp and picks 8 constant-indexed lanes; each result
	; lane is the shuffled value when the matching %mask lane equals zero,
	; otherwise zero.
	; Expected code: vmovdqa64 load from (%rdi), vextracti64x4 (upper half), an
	; unmasked 256-bit vpermi2w, 128-bit vpcmpeqw against a zeroed register
	; into %k1, a zero-masking vmovdqu16 ({%k1} {z}) into %xmm0, and vzeroupper.
	define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
	; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
	; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <32 x i16>, <32 x i16>* %vp
	%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
	%cmp = icmp eq <8 x i16> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
	ret <8 x i16> %res
	}

	; Unmasked partial permute (register source): returns lanes 4, 0, 3, 2 of
	; the 8-lane i32 vector %vec as a <4 x i32>.
	; Expected code: no variable permute; vextractf128 (upper half), an in-lane
	; vpermilps on the lower half, a vblendps taking element 0 from the upper
	; half, and vzeroupper.
	define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
	; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,2]
	; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
	ret <4 x i32> %res
	}
	; Merge-masked partial permute (register source): takes lanes 4, 0, 3, 2 of
	; the 8-lane i32 vector %vec; each result lane is the shuffled value when
	; the matching %mask lane equals zero, otherwise the %vec2 lane.
	; Expected code: vextracti128 (upper half), vpshufd on the lower half,
	; vpblendd taking element 0 from the upper half, vpcmpeqd against a zeroed
	; register into %k1, vpblendmd under %k1, and vzeroupper.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked partial permute (register source): takes lanes 4, 0, 3, 2 of
	; the 8-lane i32 vector %vec; each result lane is the shuffled value when
	; the matching %mask lane equals zero, otherwise zero.
	; Expected code: vextracti128 (upper half), vpshufd on the lower half,
	; vpblendd taking element 0 from the upper half, vpcmpeqd against a zeroed
	; register into %k1, a zero-masking vmovdqa32 ({%k1} {z}) of %xmm0 onto
	; itself, and vzeroupper.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Merge-masked partial permute (register source): takes lanes 3, 0, 7, 3 of
	; the 8-lane i32 vector %vec; each result lane is the shuffled value when
	; the matching %mask lane equals zero, otherwise the %vec2 lane.
	; Expected code: vextracti128 (upper half), one vpshufd on each half,
	; vpblendd taking element 2 from the upper half, vpcmpeqd against a zeroed
	; register into %k1, vpblendmd under %k1, and vzeroupper.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 3, 0, 7, 3 of the <8 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Masked case: gathers elements 6, 7, 2, 3 of the <8 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and the corresponding %vec2 lane otherwise.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 6, 7, 2, 3 of the <8 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Unmasked case: returns a <4 x i32> vector built from elements 5, 3, 2, 5
	; of the <8 x i32> argument %vec.
	define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
	; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
	ret <4 x i32> %res
	}
	; Masked case: gathers elements 5, 3, 2, 5 of the <8 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and the corresponding %vec2 lane otherwise.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 5, 3, 2, 5 of the <8 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Unmasked memory-operand case: loads an <8 x i32> vector through %vp and
	; returns a <4 x i32> vector built from its elements 7, 5, 0, 0.
	define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
	; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
	ret <4 x i32> %res
	}
	; Masked memory-operand case: loads an <8 x i32> vector through %vp, gathers
	; its elements 7, 5, 0, 0 into a <4 x i32> vector, then per lane returns the
	; gathered element when the %mask lane equals zero and the %vec2 lane otherwise.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm2
	; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked memory-operand case: loads an <8 x i32> vector through %vp,
	; gathers its elements 7, 5, 0, 0 into a <4 x i32> vector, then per lane returns
	; the gathered element when the %mask lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm1
	; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	; Masked memory-operand case: loads an <8 x i32> vector through %vp, gathers
	; its elements 5, 0, 0, 3 into a <4 x i32> vector, then per lane returns the
	; gathered element when the %mask lane equals zero and the %vec2 lane otherwise.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked memory-operand case: loads an <8 x i32> vector through %vp,
	; gathers its elements 5, 0, 0, 3 into a <4 x i32> vector, then per lane returns
	; the gathered element when the %mask lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	; Masked memory-operand case: loads an <8 x i32> vector through %vp, gathers
	; its elements 4, 3, 3, 4 into a <4 x i32> vector, then per lane returns the
	; gathered element when the %mask lane equals zero and the %vec2 lane otherwise.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked memory-operand case: loads an <8 x i32> vector through %vp,
	; gathers its elements 4, 3, 3, 4 into a <4 x i32> vector, then per lane returns
	; the gathered element when the %mask lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	; Unmasked memory-operand case: loads an <8 x i32> vector through %vp and
	; returns a <4 x i32> vector built from its elements 5, 3, 2, 7.
	define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
	; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
	; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
	ret <4 x i32> %res
	}
	; Masked memory-operand case: loads an <8 x i32> vector through %vp, gathers
	; its elements 5, 3, 2, 7 into a <4 x i32> vector, then per lane returns the
	; gathered element when the %mask lane equals zero and the %vec2 lane otherwise.
	define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked memory-operand case: loads an <8 x i32> vector through %vp,
	; gathers its elements 5, 3, 2, 7 into a <4 x i32> vector, then per lane returns
	; the gathered element when the %mask lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i32>, <8 x i32>* %vp
	%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	; Unmasked case: returns an <8 x i32> vector built from elements
	; 1, 13, 11, 14, 7, 10, 1, 6 of the <16 x i32> argument %vec.
	define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
	; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,5,3,6,15,2,9,14]
	; CHECK-NEXT: vpermi2d %ymm0, %ymm2, %ymm1
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
	ret <8 x i32> %res
	}
	; Masked case: gathers elements 1, 13, 11, 14, 7, 10, 1, 6 of the <16 x i32>
	; %vec into an <8 x i32> vector, then per lane returns the gathered element
	; when the %mask lane equals zero and the corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
	; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked case: gathers elements 1, 13, 11, 14, 7, 10, 1, 6 of the
	; <16 x i32> %vec into an <8 x i32> vector, then per lane returns the gathered
	; element when the %mask lane equals zero and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}
	; Masked case: gathers elements 3, 0, 15, 3, 2, 3, 6, 8 of the <16 x i32>
	; %vec into an <8 x i32> vector, then per lane returns the gathered element
	; when the %mask lane equals zero and the corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked case: gathers elements 3, 0, 15, 3, 2, 3, 6, 8 of the
	; <16 x i32> %vec into an <8 x i32> vector, then per lane returns the gathered
	; element when the %mask lane equals zero and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}
	; Masked case: gathers elements 2, 15, 15, 2, 6, 10, 14, 7 of the <16 x i32>
	; %vec into an <8 x i32> vector, then per lane returns the gathered element
	; when the %mask lane equals zero and the corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked case: gathers elements 2, 15, 15, 2, 6, 10, 14, 7 of the
	; <16 x i32> %vec into an <8 x i32> vector, then per lane returns the gathered
	; element when the %mask lane equals zero and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}
	; Unmasked case: returns an <8 x i32> vector built from elements
	; 14, 5, 7, 7, 10, 3, 9, 3 of the <16 x i32> argument %vec.
	define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
	; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
	ret <8 x i32> %res
	}
	; Masked case: gathers elements 14, 5, 7, 7, 10, 3, 9, 3 of the <16 x i32>
	; %vec into an <8 x i32> vector, then per lane returns the gathered element
	; when the %mask lane equals zero and the corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked case: gathers elements 14, 5, 7, 7, 10, 3, 9, 3 of the
	; <16 x i32> %vec into an <8 x i32> vector, then per lane returns the gathered
	; element when the %mask lane equals zero and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}
	; Unmasked case: returns a <4 x i32> vector built from elements 0, 2, 4, 12
	; of the <16 x i32> argument %vec.
	define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
	; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
	ret <4 x i32> %res
	}
	; Masked case: gathers elements 0, 2, 4, 12 of the <16 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and the corresponding %vec2 lane otherwise.
	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 0, 2, 4, 12 of the <16 x i32> %vec into
	; a <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Masked case: gathers elements 13, 9, 11, 12 of the <16 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and the corresponding %vec2 lane otherwise.
	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
	; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 13, 9, 11, 12 of the <16 x i32> %vec
	; into a <4 x i32> vector, then per lane returns the gathered element when the
	; %mask lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
	; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Masked case: gathers elements 1, 1, 13, 0 of the <16 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and the corresponding %vec2 lane otherwise.
	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 1, 1, 13, 0 of the <16 x i32> %vec into
	; a <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Unmasked case: returns a <4 x i32> vector built from elements 3, 0, 0, 13
	; of the <16 x i32> argument %vec.
	define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
	; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <3,0,0,13,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
	ret <4 x i32> %res
	}
	; Masked case: gathers elements 3, 0, 0, 13 of the <16 x i32> %vec into a
	; <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and the corresponding %vec2 lane otherwise.
	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
	; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	; Zero-masked case: gathers elements 3, 0, 0, 13 of the <16 x i32> %vec into
	; a <4 x i32> vector, then per lane returns the gathered element when the %mask
	; lane equals zero and 0 otherwise.
	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}
	; Unmasked memory-operand case: loads a <16 x i32> vector through %vp and
	; returns an <8 x i32> vector built from its elements
	; 15, 8, 14, 8, 9, 10, 12, 12. Every index is >= 8, and the expected code reads
	; only from offset 32 of the pointer.
	define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
	; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
	; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
	ret <8 x i32> %res
	}
	; Masked memory-operand case: loads a <16 x i32> vector through %vp, gathers
	; its elements 15, 8, 14, 8, 9, 10, 12, 12 into an <8 x i32> vector, then per
	; lane returns the gathered element when the %mask lane equals zero and the
	; corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked memory-operand case: loads a <16 x i32> vector through %vp,
	; gathers its elements 15, 8, 14, 8, 9, 10, 12, 12 into an <8 x i32> vector,
	; then per lane returns the gathered element when the %mask lane equals zero
	; and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}

	; Masked memory-operand case: loads a <16 x i32> vector through %vp, gathers
	; its elements 15, 11, 14, 3, 8, 9, 13, 7 into an <8 x i32> vector, then per
	; lane returns the gathered element when the %mask lane equals zero and the
	; corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked memory-operand case: loads a <16 x i32> vector through %vp,
	; gathers its elements 15, 11, 14, 3, 8, 9, 13, 7 into an <8 x i32> vector,
	; then per lane returns the gathered element when the %mask lane equals zero
	; and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}

	; Masked memory-operand case: loads a <16 x i32> vector through %vp, gathers
	; its elements 12, 6, 9, 13, 12, 10, 0, 2 into an <8 x i32> vector, then per
	; lane returns the gathered element when the %mask lane equals zero and the
	; corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	; Zero-masked memory-operand case: loads a <16 x i32> vector through %vp,
	; gathers its elements 12, 6, 9, 13, 12, 10, 0, 2 into an <8 x i32> vector,
	; then per lane returns the gathered element when the %mask lane equals zero
	; and 0 otherwise.
	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}

	; Unmasked memory-operand case: loads a <16 x i32> vector through %vp and
	; returns an <8 x i32> vector built from its elements
	; 8, 4, 1, 13, 15, 4, 6, 12.
	define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
	; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
	ret <8 x i32> %res
	}
	; Masked memory-operand case: loads a <16 x i32> vector through %vp, gathers
	; its elements 8, 4, 1, 13, 15, 4, 6, 12 into an <8 x i32> vector, then per
	; lane returns the gathered element when the %mask lane equals zero and the
	; corresponding %vec2 lane otherwise.
	define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12]
	; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
	ret <8 x i32> %res
	}

	define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
	%cmp = icmp eq <8 x i32> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
	ret <8 x i32> %res
	}

	define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
	; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
	ret <4 x i32> %res
	}
	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6]
	; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
	; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
	; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
	; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
	; CHECK-NEXT: vmovd %xmm0, %eax
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
	; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
	; CHECK-NEXT: vpextrd $3, %xmm1, %eax
	; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1
	; CHECK-NEXT: vpextrd $2, %xmm0, %eax
	; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
	ret <4 x i32> %res
	}
	define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
	; CHECK-NEXT: vmovd %xmm2, %eax
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
	; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
	; CHECK-NEXT: vpextrd $3, %xmm3, %eax
	; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
	; CHECK-NEXT: vpextrd $2, %xmm2, %eax
	; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
	ret <4 x i32> %res
	}

	define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
	; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
	; CHECK-NEXT: vmovd %xmm1, %eax
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
	; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
	; CHECK-NEXT: vpextrd $3, %xmm2, %eax
	; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
	; CHECK-NEXT: vpextrd $2, %xmm1, %eax
	; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x i32>, <16 x i32>* %vp
	%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
	%cmp = icmp eq <4 x i32> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
	ret <4 x i32> %res
	}

	define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
	; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
	ret <2 x i64> %res
	}
	define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}
	define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}
	define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
	; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x i64>, <4 x i64>* %vp
	%res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
	ret <2 x i64> %res
	}
	define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm3[1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x i64>, <4 x i64>* %vp
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x i64>, <4 x i64>* %vp
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}

	define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x i64>, <4 x i64>* %vp
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x i64>, <4 x i64>* %vp
	%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}

	define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
	; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
	; CHECK-NEXT: retq
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
	; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
	; CHECK-NEXT: retq
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
	; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
	; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
	; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
	; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
	; CHECK-NEXT: retq
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
	; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
	; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}
	define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
	; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
	; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
	ret <2 x i64> %res
	}
	define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
	; CHECK-NEXT: vmovdqa %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}
	define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	; Zeroing variant: selects elements 6 and 5 of the <8 x i64> %vec as a <2 x i64>.
	; A result lane keeps the shuffled value where %mask equals zero and is zero otherwise.
	define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}
	; Unmasked variant with a memory operand: loads an <8 x i64> from %vp and returns
	; elements 0, 2, 0, 2 as a <4 x i64>.
	define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
	; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,0,2]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
	ret <4 x i64> %res
	}
	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 2, 0, 2. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 2, 0, 2. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 7, 6, 0. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 7, 6, 0. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 1, 1, 5. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,1,1,5]
	; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 1, 1, 5. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,1,1,5]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Unmasked variant with a memory operand: loads an <8 x i64> from %vp and returns
	; elements 7, 0, 0, 2 as a <4 x i64>.
	define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
	; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm0
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,2]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
	ret <4 x i64> %res
	}
	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 0, 0, 2. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[3,0,0,2]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 0, 0, 2. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,0,2]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 4, 6, 1. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1]
	; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 4, 6, 1. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 2, 7, 1. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 0, 2, 7, 1. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Unmasked variant with a memory operand: loads an <8 x i64> from %vp and returns
	; elements 7, 2, 3, 2 as a <4 x i64>.
	define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
	; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
	; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
	ret <4 x i64> %res
	}
	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 2, 3, 2. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2]
	; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 2, 3, 2. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovdqa %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 7, 5, 1. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
	; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,1,3]
	; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
	; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
	ret <4 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 7, 7, 5, 1. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
	; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
	; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
	%cmp = icmp eq <4 x i64> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
	ret <4 x i64> %res
	}

	; Unmasked variant with a memory operand: loads an <8 x i64> from %vp and returns
	; elements 4 and 1 as a <2 x i64>.
	define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
	; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm0
	; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
	; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
	ret <2 x i64> %res
	}
	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 4 and 1. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	; NOTE(review): %k1 is computed below, yet the expected vpunpcklqdq line shows no
	; mask annotation on xmm0 -- confirm this still matches current llc output.
	define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm2[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 4 and 1. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	; NOTE(review): %k1 is computed below, yet the expected vpunpcklqdq line shows no
	; mask annotation on xmm0 -- confirm this still matches current llc output.
	define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti32x4 $2, %zmm1, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}

	; Merge variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 6 and 2. A result lane keeps the shuffled value where %mask equals
	; zero and takes %vec2 otherwise.
	define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3
	; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
	ret <2 x i64> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x i64> from %vp and selects
	; elements 6 and 2. A result lane keeps the shuffled value where %mask equals
	; zero and is zero otherwise.
	define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
	; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x i64>, <8 x i64>* %vp
	%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
	%cmp = icmp eq <2 x i64> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
	ret <2 x i64> %res
	}

	; Unmasked variant: returns elements 0, 3, 4, 5 of the <8 x float> %vec as a <4 x float>.
	define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
	; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
	ret <4 x float> %res
	}
	; Merge variant: selects elements 0, 3, 4, 5 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
	; CHECK-NEXT: vmovaps %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant: selects elements 0, 3, 4, 5 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Merge variant: selects elements 1, 3, 5, 0 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],xmm3[0,2]
	; CHECK-NEXT: vmovaps %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant: selects elements 1, 3, 5, 0 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm2[0,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Merge variant: selects elements 3, 2, 7, 0 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[0,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,2],xmm3[0,2]
	; CHECK-NEXT: vmovaps %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant: selects elements 3, 2, 7, 0 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[0,0]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2],xmm2[0,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Unmasked variant: returns elements 3, 3, 5, 2 of the <8 x float> %vec as a <4 x float>.
	define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
	; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,1,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
	ret <4 x float> %res
	}
	; Merge variant: selects elements 3, 3, 5, 2 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[3,3,1,2]
	; CHECK-NEXT: vmovaps %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant: selects elements 3, 3, 5, 2 of the <8 x float> %vec.
	; A result lane keeps the shuffled value where %mask compares equal to 0.0
	; (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,1,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Unmasked variant with a memory operand: loads an <8 x float> from %vp and returns
	; elements 6, 2, 4, 5 as a <4 x float>.
	define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
	; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
	ret <4 x float> %res
	}
	; Merge variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 6, 2, 4, 5. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm2
	; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 6, 2, 4, 5. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm1
	; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Merge variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 6, 3, 3, 6. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 6, 3, 3, 6. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Merge variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 3, 1, 3, 7. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm2
	; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 3, 1, 3, 7. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm1
	; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Unmasked variant with a memory operand: loads an <8 x float> from %vp and returns
	; elements 1, 3, 5, 3 as a <4 x float>.
	define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
	; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
	ret <4 x float> %res
	}
	; Merge variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 1, 3, 5, 3. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
	define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm2
	; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zeroing variant with a memory operand: loads an <8 x float> from %vp and selects
	; elements 1, 3, 5, 3. A result lane keeps the shuffled value where %mask compares
	; equal to 0.0 (fcmp oeq) and is zero otherwise.
	define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %ymm1
	; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
	; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x float>, <8 x float>* %vp
	%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Unmasked form: narrows a <16 x float> argument to <8 x float> by selecting
	; elements <0, 4, 12, 10, 8, 2, 11, 7>. The expected assembly extracts the upper
	; 256 bits and uses a two-table vpermi2ps whose index constant equals the
	; shuffle mask (low half is the first table, high half the second).
	define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
	; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovaps %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
	ret <8 x float> %res
	}
	; Merge-masked form of the <0, 4, 12, 10, 8, 2, 11, 7> narrowing shuffle. Lanes
	; where %mask compares ordered-equal (oeq) to zero take the shuffled value; all
	; other lanes keep %vec2. The expected assembly performs the permute unmasked
	; and then merges with a k1-predicated vblendmps.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,12,10,8,2,11,7]
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
	; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked form of the <0, 4, 12, 10, 8, 2, 11, 7> narrowing shuffle. Lanes
	; where %mask compares ordered-equal (oeq) to zero take the shuffled value; all
	; other lanes are zero. The expected assembly folds the zeroing mask into the
	; vpermi2ps itself.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}
	; Merge-masked form: selects elements <10, 12, 3, 12, 4, 15, 1, 14> of %vec.
	; Lanes where %mask compares ordered-equal (oeq) to zero take the shuffled
	; value; all other lanes keep %vec2. In the expected assembly the high half is
	; the first vpermi2ps table, so the index constant holds the shuffle indices
	; rebased for that operand order (10 -> 2, 3 -> 11, ...).
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [2,4,11,4,12,7,9,6]
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
	; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked form: selects elements <10, 12, 3, 12, 4, 15, 1, 14> of %vec.
	; Lanes where %mask compares ordered-equal (oeq) to zero take the shuffled
	; value; all other lanes are zero. The expected assembly folds the zeroing mask
	; into vpermi2ps, with the high half as the first table.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,4,11,4,12,7,9,6]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}
	; Merge-masked form: selects elements <0, 4, 8, 9, 6, 1, 4, 4> of %vec. Lanes
	; where %mask compares ordered-equal (oeq) to zero take the shuffled value; all
	; other lanes keep %vec2. Only elements 8 and 9 come from the high half, which
	; the expected assembly duplicates with vmovddup before the vpermi2ps.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0]
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,10,11,6,1,4,4]
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
	; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked form: selects elements <0, 4, 8, 9, 6, 1, 4, 4> of %vec. Lanes
	; where %mask compares ordered-equal (oeq) to zero take the shuffled value; all
	; other lanes are zero. The expected assembly folds the zeroing mask into the
	; vpermi2ps that follows the vmovddup of the high half.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0]
	; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,10,11,6,1,4,4]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}
	; Unmasked form: narrows a <16 x float> argument to <8 x float> by selecting
	; elements <12, 14, 9, 0, 12, 4, 5, 8>. In the expected assembly the high half
	; is the first vpermi2ps table, so the index constant holds the shuffle indices
	; rebased for that operand order (12 -> 4, 0 -> 8, ...).
	define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
	; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,6,1,8,4,12,13,0]
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
	; CHECK-NEXT: vmovaps %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
	ret <8 x float> %res
	}
	; Merge-masked form: selects elements <12, 14, 9, 0, 12, 4, 5, 8> of %vec.
	; Lanes where %mask compares ordered-equal (oeq) to zero take the shuffled
	; value; all other lanes keep %vec2. The expected assembly permutes unmasked
	; and then merges with a k1-predicated vblendmps.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,6,1,8,4,12,13,0]
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
	; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked form: selects elements <12, 14, 9, 0, 12, 4, 5, 8> of %vec.
	; Lanes where %mask compares ordered-equal (oeq) to zero take the shuffled
	; value; all other lanes are zero. The expected assembly folds the zeroing mask
	; into the vpermi2ps.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,6,1,8,4,12,13,0]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}
	; Unmasked form: narrows a <16 x float> argument to <4 x float> by selecting
	; elements <4, 8, 9, 10>. The expected assembly runs a 256-bit vpermi2ps whose
	; upper four index lanes are undefined, then returns the low 128 bits.
	define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
	; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <12,0,1,2,u,u,u,u>
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
	; CHECK-NEXT: vmovaps %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
	ret <4 x float> %res
	}
	; Merge-masked form: selects elements <4, 8, 9, 10> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; keep %vec2. The expected assembly permutes at 256 bits and merges the low
	; 128 bits with a k1-predicated vblendmps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <12,0,1,2,u,u,u,u>
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
	; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked form: selects elements <4, 8, 9, 10> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; are zero. The expected assembly permutes at 256 bits and applies the zeroing
	; mask on the final 128-bit vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u>
	; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqps %xmm0, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Merge-masked form: selects elements <8, 6, 10, 6> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; keep %vec2. The expected assembly builds the shuffle from integer-domain
	; extract/vpshufd/vpblendd and merges with a k1-predicated vblendmps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
	; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked form: selects elements <8, 6, 10, 6> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; are zero. The expected assembly builds the shuffle from integer-domain
	; extract/vpshufd/vpblendd and zeroes lanes with a k1-masked vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Merge-masked form: selects elements <12, 12, 4, 5> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; keep %vec2. The expected assembly folds the merge mask into vextractf32x4,
	; writing into the register that holds %vec2.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
	; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
	; CHECK-NEXT: vmovaps %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked form: selects elements <12, 12, 4, 5> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; are zero. The expected assembly folds the zeroing mask into vextractf32x4.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Unmasked form: narrows a <16 x float> argument to <4 x float> by selecting
	; elements <10, 2, 11, 6>. The expected assembly uses floating-point-domain
	; vpermilps/vpermpd shuffles followed by a vblendps instead of vpermi2ps.
	define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
	; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,1,3,3]
	; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
	; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
	ret <4 x float> %res
	}
	; Merge-masked form: selects elements <10, 2, 11, 6> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; keep %vec2. The expected assembly uses integer-domain shuffles
	; (vpshufd/vpermq/vpblendd) and merges with a k1-predicated vblendmps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
	; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
	; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked form: selects elements <10, 2, 11, 6> of %vec. Lanes where %mask
	; compares ordered-equal (oeq) to zero take the shuffled value; all other lanes
	; are zero. The expected assembly uses integer-domain shuffles
	; (vpshufd/vpermq/vpblendd) and zeroes lanes with a k1-masked vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
	; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}
	; Unmasked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <7, 6, 7, 11, 5, 10, 0, 4>. The expected assembly loads the whole
	; 512-bit vector, extracts the upper half, and uses vpermi2ps with an index
	; constant equal to the shuffle mask.
	define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
	; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
	ret <8 x float> %res
	}
	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <7, 6, 7, 11, 5, 10, 0, 4>. Lanes where %mask compares ordered-equal
	; (oeq) to zero take the shuffled value; all other lanes keep %vec2. The
	; expected assembly merges with a k1-masked vmovaps into the %vec2 register.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [7,6,7,11,5,10,0,4]
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <7, 6, 7, 11, 5, 10, 0, 4>. Lanes where %mask compares ordered-equal
	; (oeq) to zero take the shuffled value; all other lanes are zero. The expected
	; assembly folds the zeroing mask into the vpermi2ps.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}

	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <11, 0, 9, 0, 7, 14, 0, 8>. Lanes where %mask compares ordered-equal
	; (oeq) to zero take the shuffled value; all other lanes keep %vec2. The
	; expected assembly merges with a k1-masked vmovaps into the %vec2 register.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [11,0,9,0,7,14,0,8]
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <11, 0, 9, 0, 7, 14, 0, 8>. Lanes where %mask compares ordered-equal
	; (oeq) to zero take the shuffled value; all other lanes are zero. The expected
	; assembly folds the zeroing mask into the vpermi2ps.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}

	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <1, 13, 10, 11, 10, 0, 0, 9>. Lanes where %mask compares
	; ordered-equal (oeq) to zero take the shuffled value; all other lanes keep
	; %vec2. The expected assembly pre-shuffles the low 128 bits with vpermilps
	; before the vpermi2ps and merges with a k1-masked vmovaps.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[1,0,0,3]
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [8,5,2,3,2,9,10,1]
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <1, 13, 10, 11, 10, 0, 0, 9>. Lanes where %mask compares
	; ordered-equal (oeq) to zero take the shuffled value; all other lanes are
	; zero. The expected assembly pre-shuffles the low 128 bits with vpermilps and
	; folds the zeroing mask into the vpermi2ps.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,0,0,3]
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [8,5,2,3,2,9,10,1]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}

	; Unmasked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <15, 13, 11, 11, 3, 12, 4, 1>. In the expected assembly the high
	; half is the first vpermi2ps table, so the index constant holds the shuffle
	; indices rebased for that operand order (15 -> 7, 3 -> 11, ...).
	define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
	; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
	; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
	ret <8 x float> %res
	}
	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <15, 13, 11, 11, 3, 12, 4, 1>. Lanes where %mask compares
	; ordered-equal (oeq) to zero take the shuffled value; all other lanes keep
	; %vec2. The expected assembly merges with a k1-masked vmovaps.
	define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [7,5,3,3,11,4,12,9]
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
	ret <8 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <15, 13, 11, 11, 3, 12, 4, 1>. Lanes where %mask compares
	; ordered-equal (oeq) to zero take the shuffled value; all other lanes are
	; zero. The expected assembly folds the zeroing mask into the vpermi2ps.
	define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovaps %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
	%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
	%res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
	ret <8 x float> %res
	}

	; Unmasked memory-operand form: loads a <16 x float> from %vp and narrows it to
	; <4 x float> by selecting elements <14, 6, 7, 11>. The expected assembly uses
	; floating-point-domain extract/vpermilps/vpermpd steps and a final vblendps.
	define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
	; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
	; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
	ret <4 x float> %res
	}
	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <14, 6, 7, 11>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes keep %vec2. The expected
	; assembly uses an integer-domain load and shuffles, then a k1-masked vmovaps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
	; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
	; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
	; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <14, 6, 7, 11>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes are zero. The expected assembly
	; uses an integer-domain load and shuffles, then a k1 zero-masked vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
	; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
	; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
	; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
	; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <8, 2, 14, 7>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes keep %vec2. The expected
	; assembly permutes at 256 bits with vpermi2ps (high half as first table) and
	; merges the low 128 bits with a k1-masked vmovaps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15]
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <8, 2, 14, 7>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes are zero. The expected assembly
	; permutes at 256 bits with vpermi2ps and zero-masks the final 128-bit vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
	; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <12, 6, 12, 6>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes keep %vec2. The expected
	; assembly permutes at 256 bits with vpermi2ps (high half as first table) and
	; merges the low 128 bits with a k1-masked vmovaps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7]
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <12, 6, 12, 6>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes are zero. The expected assembly
	; permutes at 256 bits with vpermi2ps and zero-masks the final 128-bit vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
	; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Unmasked memory-operand form: loads a <16 x float> from %vp and narrows it to
	; <4 x float> by selecting elements <3, 3, 15, 9>. The expected assembly runs a
	; 256-bit vpermi2ps whose upper four index lanes are undefined and returns the
	; low 128 bits of the result register.
	define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
	; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u>
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
	ret <4 x float> %res
	}
	; Merge-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <3, 3, 15, 9>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes keep %vec2. The expected
	; assembly permutes at 256 bits with vpermi2ps and merges the low 128 bits
	; with a k1-masked vmovaps.
	define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <3,3,15,9,u,u,u,u>
	; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
	ret <4 x float> %res
	}

	; Zero-masked memory-operand form: loads a <16 x float> from %vp and selects
	; elements <3, 3, 15, 9>. Lanes where %mask compares ordered-equal (oeq) to
	; zero take the shuffled value; all other lanes are zero. The expected assembly
	; permutes at 256 bits with vpermi2ps and zero-masks the final 128-bit vmovaps.
	define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
	; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovaps (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
	; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm3
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
	; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <16 x float>, <16 x float>* %vp
	%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
	%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
	ret <4 x float> %res
	}

	; Unmasked form: narrows a <4 x double> argument to <2 x double> by selecting
	; elements <2, 0>. The expected assembly extracts the upper 128 bits and joins
	; the two low doubles with vmovlhps.
	define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
	; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
	ret <2 x double> %res
	}
	define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
	; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
	; CHECK-NEXT: vmovapd %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}
	define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
	; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[1]
	; CHECK-NEXT: vmovapd %xmm1, %xmm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}
	define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
	; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x double>, <4 x double>* %vp
	%res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
	ret <2 x double> %res
	}
	define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %ymm2
	; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x double>, <4 x double>* %vp
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %ymm1
	; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x double>, <4 x double>* %vp
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %ymm2
	; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
	; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x double>, <4 x double>* %vp
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %ymm1
	; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
	; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <4 x double>, <4 x double>* %vp
	%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}

	define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
	; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
	; CHECK-NEXT: retq
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,0,7,6]
	; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
	; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovapd %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
	; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,1,4]
	; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,1,4]
	; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
	; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
	; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
	; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
	; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
	; CHECK-NEXT: vmovapd %ymm2, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,2]
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,2]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
	; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
	; CHECK-NEXT: retq
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,1]
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,1,0,2]
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,1,0,2]
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}
	define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
	; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
	; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
	ret <2 x double> %res
	}
	define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
	; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}
	define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
	; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
	; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
	; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}
	define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
	; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
	; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,7,2]
	; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[1,2,3,0]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1,2,3,0]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
	; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm0
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,0]
	; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,0]
	; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0]
	; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,4,1,5]
	; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
	; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
	; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
	; CHECK-NEXT: vmovapd %ymm1, %ymm0
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
	; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm0
	; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
	; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
	ret <4 x double> %res
	}
	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
	; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1]
	; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
	; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,1]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
	ret <4 x double> %res
	}

	define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,1]
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
	%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
	%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
	ret <4 x double> %res
	}

	define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
	; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm0
	; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
	; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
	; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
	ret <2 x double> %res
	}
	define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
	; CHECK-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
	; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
	; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
	; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
	; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
	; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm2
	; CHECK-NEXT: vextractf32x4 $2, %zmm2, %xmm3
	; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
	; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
	ret <2 x double> %res
	}

	define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
	; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
	; CHECK: # %bb.0:
	; CHECK-NEXT: vmovapd (%rdi), %zmm1
	; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm2
	; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
	; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[0]
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq
	%vec = load <8 x double>, <8 x double>* %vp
	%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
	%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
	%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
	ret <2 x double> %res
	}

	+; PR35977
	+define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
	+; CHECK-LABEL: test_zext_v8i8_to_v8i16:
	+; CHECK: # %bb.0:
	+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
	+; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
	+; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
	+; CHECK-NEXT: retq
	+ %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
	+ %tmp2 = load <8 x i8>, <8 x i8>* %tmp
	+ %tmp3 = extractelement <8 x i8> %tmp2, i32 0
	+ %tmp4 = zext i8 %tmp3 to i16
	+ %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
	+ %tmp6 = extractelement <8 x i8> %tmp2, i32 1
	+ %tmp7 = zext i8 %tmp6 to i16
	+ %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
	+ %tmp9 = extractelement <8 x i8> %tmp2, i32 2
	+ %tmp10 = zext i8 %tmp9 to i16
	+ %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
	+ %tmp12 = extractelement <8 x i8> %tmp2, i32 3
	+ %tmp13 = zext i8 %tmp12 to i16
	+ %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
	+ %tmp15 = extractelement <8 x i8> %tmp2, i32 4
	+ %tmp16 = zext i8 %tmp15 to i16
	+ %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
	+ %tmp18 = extractelement <8 x i8> %tmp2, i32 5
	+ %tmp19 = zext i8 %tmp18 to i16
	+ %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
	+ %tmp21 = extractelement <8 x i8> %tmp2, i32 6
	+ %tmp22 = zext i8 %tmp21 to i16
	+ %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
	+ %tmp24 = extractelement <8 x i8> %tmp2, i32 7
	+ %tmp25 = zext i8 %tmp24 to i16
	+ %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
	+ %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
	+ %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
	+ store <8 x i16> %tmp27, <8 x i16>* %tmp28
	+ ret void
	+}
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/darwin-bzero.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/darwin-bzero.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/darwin-bzero.ll (revision 328362)
	@@ -1,11 +1,14 @@
	-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
	-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
	+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck -check-prefixes=CHECK,BZERO %s
	+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck -check-prefixes=CHECK,BZERO %s
	+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck -check-prefixes=CHECK,NOBZERO %s
	+; RUN: llc < %s -mtriple=x86_64-apple-ios10.0-simulator | FileCheck -check-prefixes=CHECK,NOBZERO %s

	declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind

	; CHECK-LABEL: foo:
	-; CHECK: {{calll|callq}} ___bzero
	+; BZERO: {{calll|callq}} ___bzero
	+; NOBZERO-NOT: bzero
	define void @foo(i8* %p, i32 %len) {
	call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false)
	ret void
	}
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/inline-asm-A-constraint.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/inline-asm-A-constraint.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/inline-asm-A-constraint.ll (revision 328362)
	@@ -1,34 +1,35 @@
	; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64--"

	; Function Attrs: nounwind uwtable
	define { i64, i64 } @foo(i8* %ptr, i128* nocapture readonly %src, i128* nocapture readonly %dst) local_unnamed_addr #0 {
	entry:
	%0 = load i128, i128* %dst, align 16, !tbaa !1
	%shr = lshr i128 %0, 64
	%conv = trunc i128 %shr to i64
	%conv1 = trunc i128 %0 to i64
	%1 = load i128, i128* %src, align 16, !tbaa !1
	%2 = tail call i128 asm sideeffect "lock; cmpxchg16b $1", "=A,=*m,{cx},{bx},0,*m,~{dirflag},~{fpsr},~{flags}"(i8* %ptr, i64 %conv, i64 %conv1, i128 %1, i8* %ptr) #1, !srcloc !5
	%retval.sroa.0.0.extract.trunc = trunc i128 %2 to i64
	%retval.sroa.2.0.extract.shift = lshr i128 %2, 64
	%retval.sroa.2.0.extract.trunc = trunc i128 %retval.sroa.2.0.extract.shift to i64
	%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.extract.trunc, 0
	%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
	ret { i64, i64 } %.fca.1.insert
	}
	-; CHECK: lock cmpxchg16b
	+; CHECK: lock
	+; CHECK-NEXT: cmpxchg16b

	attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #1 = { nounwind }

	!llvm.ident = !{!0}

	!0 = !{!"clang version 5.0.0 (trunk 300088)"}
	!1 = !{!2, !2, i64 0}
	!2 = !{!"__int128", !3, i64 0}
	!3 = !{!"omnipotent char", !4, i64 0}
	!4 = !{!"Simple C/C++ TBAA"}
	!5 = !{i32 269}
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/pr35761.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/pr35761.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/pr35761.ll (revision 328362)
	@@ -0,0 +1,36 @@
	+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	+; RUN: llc -mtriple=x86_64-unknown-linux %s -o - | FileCheck %s
	+
	+@x = global i8 0, align 1
	+@y = global i32 0, align 4
	+@z = global i24 0, align 4
	+
	+define void @PR35761(i32 %call) {
	+; CHECK-LABEL: PR35761:
	+; CHECK: # %bb.0: # %entry
	+; CHECK-NEXT: movzbl {{.*}}(%rip), %eax
	+; CHECK-NEXT: andl $1, %eax
	+; CHECK-NEXT: movzbl {{.*}}(%rip), %ecx
	+; CHECK-NEXT: xorl $255, %ecx
	+; CHECK-NEXT: orl %eax, %ecx
	+; CHECK-NEXT: movw %cx, {{.*}}(%rip)
	+; CHECK-NEXT: movb $0, z+{{.*}}(%rip)
	+; CHECK-NEXT: retq
	+entry:
	+ %0 = load i8, i8* @x, align 1
	+ %tobool = trunc i8 %0 to i1
	+ %conv = zext i1 %tobool to i32
	+ %or = or i32 32767, %call
	+ %neg = xor i32 %or, -1
	+ %neg1 = xor i32 %neg, -1
	+ %1 = load i32, i32* @y, align 4
	+ %xor = xor i32 %neg1, %1
	+ %or2 = or i32 %conv, %xor
	+ %conv3 = trunc i32 %or2 to i8
	+ %bf.load = load i24, i24* @z, align 4
	+ %2 = zext i8 %conv3 to i24
	+ %bf.value = and i24 %2, 4194303
	+ store i24 %bf.value, i24* @z, align 2
	+ ret void
	+}
	+
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/pr35972.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/pr35972.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/pr35972.ll (revision 328362)
	@@ -0,0 +1,20 @@
	+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	+; RUN: llc -mtriple=i686-unknown-linux-gnu %s -o - -mattr=avx512bw | FileCheck %s
	+
	+define void @test3(i32 %c, <64 x i1>* %ptr) {
	+; CHECK-LABEL: test3:
	+; CHECK: # %bb.0:
	+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
	+; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
	+; CHECK-NEXT: sbbl %ecx, %ecx
	+; CHECK-NEXT: kmovd %ecx, %k0
	+; CHECK-NEXT: kunpckdq %k0, %k0, %k0
	+; CHECK-NEXT: kmovq %k0, (%eax)
	+; CHECK-NEXT: retl
	+ %cmp = icmp eq i32 %c, 0
	+ %insert = insertelement <64 x i1> undef, i1 %cmp, i32 0
	+ %shuf = shufflevector <64 x i1> %insert, <64 x i1> undef, <64 x i32> zeroinitializer
	+ store <64 x i1> %shuf, <64 x i1>* %ptr
	+ ret void
	+}
	+
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/pr37563.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/pr37563.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/pr37563.ll (revision 328362)
	@@ -0,0 +1,42 @@
	+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
	+
	+%struct.S = type <{ i16, i24, [5 x i8], i8, i16, [2 x i8] }>
	+
	+@z = global { i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] } { i16 -724, i8 94, i8 -18, i8 5, i8 undef, i8 96, i8 104, i8 -24, i8 10, i8 0, [5 x i8] undef }, align 8
	+@tf_3_var_136 = global i64 0, align 8
	+@.str = private unnamed_addr constant [6 x i8] c"%llu\0A\00", align 1
	+
	+define void @PR35763() {
	+; CHECK-LABEL: PR35763:
	+; CHECK: # %bb.0: # %entry
	+; CHECK-NEXT: movzwl {{.*}}(%rip), %eax
	+; CHECK-NEXT: movzwl z+{{.*}}(%rip), %ecx
	+; CHECK-NEXT: orl %eax, %ecx
	+; CHECK-NEXT: movq %rcx, {{.*}}(%rip)
	+; CHECK-NEXT: movl z+{{.*}}(%rip), %eax
	+; CHECK-NEXT: movzbl z+{{.*}}(%rip), %ecx
	+; CHECK-NEXT: shlq $32, %rcx
	+; CHECK-NEXT: orq %rax, %rcx
	+; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
	+; CHECK-NEXT: andq %rcx, %rax
	+; CHECK-NEXT: movl %eax, z+{{.*}}(%rip)
	+; CHECK-NEXT: shrq $32, %rax
	+; CHECK-NEXT: movb %al, z+{{.*}}(%rip)
	+; CHECK-NEXT: retq
	+entry:
	+ %0 = load i16, i16* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 0), align 8
	+ %conv = sext i16 %0 to i32
	+ %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 1) to i32*), align 2
	+ %bf.clear = and i32 %bf.load, 2097151
	+ %bf.cast = zext i32 %bf.clear to i64
	+ %conv1 = trunc i64 %bf.cast to i32
	+ %or = or i32 %conv, %conv1
	+ %conv2 = trunc i32 %or to i16
	+ %conv3 = zext i16 %conv2 to i64
	+ store i64 %conv3, i64* @tf_3_var_136, align 8
	+ %bf.load4 = load i40, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
	+ %bf.clear5 = and i40 %bf.load4, -8589869057
	+ store i40 %bf.clear5, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
	+ ret void
	+}
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/var-permute-128.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/var-permute-128.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/var-permute-128.ll (revision 328362)
	@@ -1,356 +1,355 @@
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVXNOVLBW,AVX512F
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVXNOVLBW,AVX512VL
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW,VBMI

	define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
	; SSSE3-LABEL: var_shuffle_v2i64:
	; SSSE3: # %bb.0:
	; SSSE3-NEXT: movq %xmm1, %rax
	; SSSE3-NEXT: andl $1, %eax
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
	; SSSE3-NEXT: movq %xmm1, %rcx
	; SSSE3-NEXT: andl $1, %ecx
	; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
	; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
	; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; SSSE3-NEXT: retq
	;
	; AVX-LABEL: var_shuffle_v2i64:
	; AVX: # %bb.0:
	; AVX-NEXT: vmovq %xmm1, %rax
	; AVX-NEXT: andl $1, %eax
	; AVX-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX-NEXT: andl $1, %ecx
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; AVX-NEXT: retq
	%index0 = extractelement <2 x i64> %indices, i32 0
	%index1 = extractelement <2 x i64> %indices, i32 1
	%v0 = extractelement <2 x i64> %v, i64 %index0
	%v1 = extractelement <2 x i64> %v, i64 %index1
	%ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
	%ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
	ret <2 x i64> %ret1
	}

	define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
	; SSSE3-LABEL: var_shuffle_v4i32:
	; SSSE3: # %bb.0:
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
	; SSSE3-NEXT: movq %xmm2, %rax
	; SSSE3-NEXT: movq %rax, %rcx
	; SSSE3-NEXT: sarq $32, %rcx
	; SSSE3-NEXT: movq %xmm1, %rdx
	; SSSE3-NEXT: movq %rdx, %rsi
	; SSSE3-NEXT: sarq $32, %rsi
	; SSSE3-NEXT: andl $3, %edx
	; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSSE3-NEXT: andl $3, %esi
	; SSSE3-NEXT: andl $3, %eax
	; SSSE3-NEXT: andl $3, %ecx
	; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
	; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; SSSE3-NEXT: retq
	;
	; AVX-LABEL: var_shuffle_v4i32:
	; AVX: # %bb.0:
	; AVX-NEXT: vpextrq $1, %xmm1, %rax
	; AVX-NEXT: movq %rax, %rcx
	; AVX-NEXT: sarq $32, %rcx
	; AVX-NEXT: vmovq %xmm1, %rdx
	; AVX-NEXT: movq %rdx, %rsi
	; AVX-NEXT: sarq $32, %rsi
	; AVX-NEXT: andl $3, %edx
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $3, %esi
	; AVX-NEXT: andl $3, %eax
	; AVX-NEXT: andl $3, %ecx
	; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
	; AVX-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
	; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
	; AVX-NEXT: retq
	%index0 = extractelement <4 x i32> %indices, i32 0
	%index1 = extractelement <4 x i32> %indices, i32 1
	%index2 = extractelement <4 x i32> %indices, i32 2
	%index3 = extractelement <4 x i32> %indices, i32 3
	%v0 = extractelement <4 x i32> %v, i32 %index0
	%v1 = extractelement <4 x i32> %v, i32 %index1
	%v2 = extractelement <4 x i32> %v, i32 %index2
	%v3 = extractelement <4 x i32> %v, i32 %index3
	%ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
	%ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
	%ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
	%ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
	ret <4 x i32> %ret3
	}

	define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
	; SSSE3-LABEL: var_shuffle_v8i16:
	; SSSE3: # %bb.0:
	; SSSE3-NEXT: movd %xmm1, %r8d
	; SSSE3-NEXT: pextrw $1, %xmm1, %r9d
	; SSSE3-NEXT: pextrw $2, %xmm1, %r10d
	; SSSE3-NEXT: pextrw $3, %xmm1, %esi
	; SSSE3-NEXT: pextrw $4, %xmm1, %edi
	; SSSE3-NEXT: pextrw $5, %xmm1, %eax
	; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
	; SSSE3-NEXT: pextrw $7, %xmm1, %edx
	; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSSE3-NEXT: andl $7, %r8d
	; SSSE3-NEXT: andl $7, %r9d
	; SSSE3-NEXT: andl $7, %r10d
	; SSSE3-NEXT: andl $7, %esi
	; SSSE3-NEXT: andl $7, %edi
	; SSSE3-NEXT: andl $7, %eax
	; SSSE3-NEXT: andl $7, %ecx
	; SSSE3-NEXT: andl $7, %edx
	; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
	; SSSE3-NEXT: movd %edx, %xmm0
	; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
	; SSSE3-NEXT: movd %ecx, %xmm1
	; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
	; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
	; SSSE3-NEXT: movd %eax, %xmm0
	; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax
	; SSSE3-NEXT: movd %eax, %xmm2
	; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
	; SSSE3-NEXT: movd %eax, %xmm0
	; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
	; SSSE3-NEXT: movd %eax, %xmm1
	; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
	; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
	; SSSE3-NEXT: movd %eax, %xmm3
	; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
	; SSSE3-NEXT: movd %eax, %xmm0
	; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
	; SSSE3-NEXT: retq
	;
	; AVXNOVLBW-LABEL: var_shuffle_v8i16:
	; AVXNOVLBW: # %bb.0:
	; AVXNOVLBW-NEXT: vmovd %xmm1, %eax
	; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d
	; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx
	; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx
	; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi
	; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi
	; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d
	; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d
	; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVXNOVLBW-NEXT: andl $7, %eax
	; AVXNOVLBW-NEXT: andl $7, %r10d
	; AVXNOVLBW-NEXT: andl $7, %ecx
	; AVXNOVLBW-NEXT: andl $7, %edx
	; AVXNOVLBW-NEXT: andl $7, %esi
	; AVXNOVLBW-NEXT: andl $7, %edi
	; AVXNOVLBW-NEXT: andl $7, %r8d
	; AVXNOVLBW-NEXT: andl $7, %r9d
	; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax
	; AVXNOVLBW-NEXT: vmovd %eax, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
	; AVXNOVLBW-NEXT: retq
	;
	; AVX512VLBW-LABEL: var_shuffle_v8i16:
	; AVX512VLBW: # %bb.0:
	; AVX512VLBW-NEXT: vpermw %xmm0, %xmm1, %xmm0
	; AVX512VLBW-NEXT: retq
	%index0 = extractelement <8 x i16> %indices, i32 0
	%index1 = extractelement <8 x i16> %indices, i32 1
	%index2 = extractelement <8 x i16> %indices, i32 2
	%index3 = extractelement <8 x i16> %indices, i32 3
	%index4 = extractelement <8 x i16> %indices, i32 4
	%index5 = extractelement <8 x i16> %indices, i32 5
	%index6 = extractelement <8 x i16> %indices, i32 6
	%index7 = extractelement <8 x i16> %indices, i32 7
	%v0 = extractelement <8 x i16> %v, i16 %index0
	%v1 = extractelement <8 x i16> %v, i16 %index1
	%v2 = extractelement <8 x i16> %v, i16 %index2
	%v3 = extractelement <8 x i16> %v, i16 %index3
	%v4 = extractelement <8 x i16> %v, i16 %index4
	%v5 = extractelement <8 x i16> %v, i16 %index5
	%v6 = extractelement <8 x i16> %v, i16 %index6
	%v7 = extractelement <8 x i16> %v, i16 %index7
	%ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
	%ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
	%ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
	%ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
	%ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
	%ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
	%ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
	%ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
	ret <8 x i16> %ret7
	}

	define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
	; SSSE3-LABEL: var_shuffle_v16i8:
	; SSSE3: # %bb.0:
	-; SSSE3-NEXT: pshufb %xmm0, %xmm1
	-; SSSE3-NEXT: movdqa %xmm1, %xmm0
	+; SSSE3-NEXT: pshufb %xmm1, %xmm0
	; SSSE3-NEXT: retq
	;
	; AVX-LABEL: var_shuffle_v16i8:
	; AVX: # %bb.0:
	-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
	+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
	; AVX-NEXT: retq
	%index0 = extractelement <16 x i8> %indices, i32 0
	%index1 = extractelement <16 x i8> %indices, i32 1
	%index2 = extractelement <16 x i8> %indices, i32 2
	%index3 = extractelement <16 x i8> %indices, i32 3
	%index4 = extractelement <16 x i8> %indices, i32 4
	%index5 = extractelement <16 x i8> %indices, i32 5
	%index6 = extractelement <16 x i8> %indices, i32 6
	%index7 = extractelement <16 x i8> %indices, i32 7
	%index8 = extractelement <16 x i8> %indices, i32 8
	%index9 = extractelement <16 x i8> %indices, i32 9
	%index10 = extractelement <16 x i8> %indices, i32 10
	%index11 = extractelement <16 x i8> %indices, i32 11
	%index12 = extractelement <16 x i8> %indices, i32 12
	%index13 = extractelement <16 x i8> %indices, i32 13
	%index14 = extractelement <16 x i8> %indices, i32 14
	%index15 = extractelement <16 x i8> %indices, i32 15
	%v0 = extractelement <16 x i8> %v, i8 %index0
	%v1 = extractelement <16 x i8> %v, i8 %index1
	%v2 = extractelement <16 x i8> %v, i8 %index2
	%v3 = extractelement <16 x i8> %v, i8 %index3
	%v4 = extractelement <16 x i8> %v, i8 %index4
	%v5 = extractelement <16 x i8> %v, i8 %index5
	%v6 = extractelement <16 x i8> %v, i8 %index6
	%v7 = extractelement <16 x i8> %v, i8 %index7
	%v8 = extractelement <16 x i8> %v, i8 %index8
	%v9 = extractelement <16 x i8> %v, i8 %index9
	%v10 = extractelement <16 x i8> %v, i8 %index10
	%v11 = extractelement <16 x i8> %v, i8 %index11
	%v12 = extractelement <16 x i8> %v, i8 %index12
	%v13 = extractelement <16 x i8> %v, i8 %index13
	%v14 = extractelement <16 x i8> %v, i8 %index14
	%v15 = extractelement <16 x i8> %v, i8 %index15
	%ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
	%ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
	%ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
	%ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
	%ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
	%ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
	%ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
	%ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
	%ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
	%ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
	%ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
	%ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
	%ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
	%ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
	%ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
	%ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
	ret <16 x i8> %ret15
	}

	define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
	; SSSE3-LABEL: var_shuffle_v2f64:
	; SSSE3: # %bb.0:
	; SSSE3-NEXT: movq %xmm1, %rax
	; SSSE3-NEXT: andl $1, %eax
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
	; SSSE3-NEXT: movq %xmm1, %rcx
	; SSSE3-NEXT: andl $1, %ecx
	; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
	; SSSE3-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
	; SSSE3-NEXT: retq
	;
	; AVX-LABEL: var_shuffle_v2f64:
	; AVX: # %bb.0:
	; AVX-NEXT: vmovq %xmm1, %rax
	; AVX-NEXT: andl $1, %eax
	; AVX-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX-NEXT: andl $1, %ecx
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
	; AVX-NEXT: retq
	%index0 = extractelement <2 x i64> %indices, i32 0
	%index1 = extractelement <2 x i64> %indices, i32 1
	%v0 = extractelement <2 x double> %v, i64 %index0
	%v1 = extractelement <2 x double> %v, i64 %index1
	%ret0 = insertelement <2 x double> undef, double %v0, i32 0
	%ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
	ret <2 x double> %ret1
	}

	define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
	; SSSE3-LABEL: var_shuffle_v4f32:
	; SSSE3: # %bb.0:
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
	; SSSE3-NEXT: movq %xmm2, %rax
	; SSSE3-NEXT: movq %rax, %rcx
	; SSSE3-NEXT: sarq $32, %rcx
	; SSSE3-NEXT: movq %xmm1, %rdx
	; SSSE3-NEXT: movq %rdx, %rsi
	; SSSE3-NEXT: sarq $32, %rsi
	; SSSE3-NEXT: andl $3, %edx
	; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSSE3-NEXT: andl $3, %esi
	; SSSE3-NEXT: andl $3, %eax
	; SSSE3-NEXT: andl $3, %ecx
	; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
	; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; SSSE3-NEXT: retq
	;
	; AVX-LABEL: var_shuffle_v4f32:
	; AVX: # %bb.0:
	; AVX-NEXT: vpextrq $1, %xmm1, %rax
	; AVX-NEXT: movq %rax, %rcx
	; AVX-NEXT: sarq $32, %rcx
	; AVX-NEXT: vmovq %xmm1, %rdx
	; AVX-NEXT: movq %rdx, %rsi
	; AVX-NEXT: sarq $32, %rsi
	; AVX-NEXT: andl $3, %edx
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $3, %esi
	; AVX-NEXT: andl $3, %eax
	; AVX-NEXT: andl $3, %ecx
	; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
	; AVX-NEXT: retq
	%index0 = extractelement <4 x i32> %indices, i32 0
	%index1 = extractelement <4 x i32> %indices, i32 1
	%index2 = extractelement <4 x i32> %indices, i32 2
	%index3 = extractelement <4 x i32> %indices, i32 3
	%v0 = extractelement <4 x float> %v, i32 %index0
	%v1 = extractelement <4 x float> %v, i32 %index1
	%v2 = extractelement <4 x float> %v, i32 %index2
	%v3 = extractelement <4 x float> %v, i32 %index3
	%ret0 = insertelement <4 x float> undef, float %v0, i32 0
	%ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
	%ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
	%ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
	ret <4 x float> %ret3
	}
	Index: vendor/llvm/dist-release_60/test/CodeGen/X86/var-permute-256.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/CodeGen/X86/var-permute-256.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/CodeGen/X86/var-permute-256.ll (revision 328362)
	@@ -1,1279 +1,1459 @@
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx \| FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 \| FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX2
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f \| FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512F
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl \| FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512VL
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl \| FileCheck %s --check-prefixes=AVX,INT256,AVX512,AVX512VLBW
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi \| FileCheck %s --check-prefixes=AVX,INT256,AVX512,AVX512VLBW,VBMI

	define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
	; AVX1-LABEL: var_shuffle_v4i64:
	; AVX1: # %bb.0:
	; AVX1-NEXT: pushq %rbp
	; AVX1-NEXT: movq %rsp, %rbp
	; AVX1-NEXT: andq $-32, %rsp
	; AVX1-NEXT: subq $64, %rsp
	; AVX1-NEXT: vmovq %xmm1, %rax
	; AVX1-NEXT: andl $3, %eax
	; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX1-NEXT: andl $3, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
	; AVX1-NEXT: vmovq %xmm1, %rdx
	; AVX1-NEXT: andl $3, %edx
	; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
	; AVX1-NEXT: andl $3, %esi
	; AVX1-NEXT: vmovaps %ymm0, (%rsp)
	; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: movq %rbp, %rsp
	; AVX1-NEXT: popq %rbp
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: var_shuffle_v4i64:
	; AVX2: # %bb.0:
	; AVX2-NEXT: pushq %rbp
	; AVX2-NEXT: movq %rsp, %rbp
	; AVX2-NEXT: andq $-32, %rsp
	; AVX2-NEXT: subq $64, %rsp
	; AVX2-NEXT: vmovq %xmm1, %rax
	; AVX2-NEXT: andl $3, %eax
	; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX2-NEXT: andl $3, %ecx
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
	; AVX2-NEXT: vmovq %xmm1, %rdx
	; AVX2-NEXT: andl $3, %edx
	; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
	; AVX2-NEXT: andl $3, %esi
	; AVX2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
	; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX2-NEXT: movq %rbp, %rsp
	; AVX2-NEXT: popq %rbp
	; AVX2-NEXT: retq
	;
	; AVX512F-LABEL: var_shuffle_v4i64:
	; AVX512F: # %bb.0:
	; AVX512F-NEXT: pushq %rbp
	; AVX512F-NEXT: movq %rsp, %rbp
	; AVX512F-NEXT: andq $-32, %rsp
	; AVX512F-NEXT: subq $64, %rsp
	; AVX512F-NEXT: vmovq %xmm1, %rax
	; AVX512F-NEXT: andl $3, %eax
	; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX512F-NEXT: andl $3, %ecx
	; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
	; AVX512F-NEXT: vmovq %xmm1, %rdx
	; AVX512F-NEXT: andl $3, %edx
	; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
	; AVX512F-NEXT: andl $3, %esi
	; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
	; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
	; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
	; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX512F-NEXT: movq %rbp, %rsp
	; AVX512F-NEXT: popq %rbp
	; AVX512F-NEXT: retq
	;
	; AVX512VL-LABEL: var_shuffle_v4i64:
	; AVX512VL: # %bb.0:
	; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
	; AVX512VL-NEXT: retq
	;
	; AVX512VLBW-LABEL: var_shuffle_v4i64:
	; AVX512VLBW: # %bb.0:
	; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
	; AVX512VLBW-NEXT: retq
	%index0 = extractelement <4 x i64> %indices, i32 0
	%index1 = extractelement <4 x i64> %indices, i32 1
	%index2 = extractelement <4 x i64> %indices, i32 2
	%index3 = extractelement <4 x i64> %indices, i32 3
	%v0 = extractelement <4 x i64> %v, i64 %index0
	%v1 = extractelement <4 x i64> %v, i64 %index1
	%v2 = extractelement <4 x i64> %v, i64 %index2
	%v3 = extractelement <4 x i64> %v, i64 %index3
	%ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
	%ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
	%ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
	%ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
	ret <4 x i64> %ret3
	}

	define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
	; AVX1-LABEL: var_shuffle_v8i32:
	; AVX1: # %bb.0:
	; AVX1-NEXT: pushq %rbp
	; AVX1-NEXT: movq %rsp, %rbp
	; AVX1-NEXT: andq $-32, %rsp
	; AVX1-NEXT: subq $64, %rsp
	; AVX1-NEXT: vpextrq $1, %xmm1, %r8
	; AVX1-NEXT: movq %r8, %rcx
	; AVX1-NEXT: shrq $30, %rcx
	; AVX1-NEXT: vmovq %xmm1, %r9
	; AVX1-NEXT: movq %r9, %rsi
	; AVX1-NEXT: shrq $30, %rsi
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
	; AVX1-NEXT: vpextrq $1, %xmm1, %r10
	; AVX1-NEXT: movq %r10, %rdi
	; AVX1-NEXT: shrq $30, %rdi
	; AVX1-NEXT: vmovq %xmm1, %rax
	; AVX1-NEXT: movq %rax, %rdx
	; AVX1-NEXT: shrq $30, %rdx
	; AVX1-NEXT: vmovaps %ymm0, (%rsp)
	; AVX1-NEXT: andl $7, %r9d
	; AVX1-NEXT: andl $28, %esi
	; AVX1-NEXT: andl $7, %r8d
	; AVX1-NEXT: andl $28, %ecx
	; AVX1-NEXT: andl $7, %eax
	; AVX1-NEXT: andl $28, %edx
	; AVX1-NEXT: andl $7, %r10d
	; AVX1-NEXT: andl $28, %edi
	; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0
	; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
	; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0
	; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1
	; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
	; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: movq %rbp, %rsp
	; AVX1-NEXT: popq %rbp
	; AVX1-NEXT: retq
	;
	; INT256-LABEL: var_shuffle_v8i32:
	; INT256: # %bb.0:
	; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; INT256-NEXT: retq
	%index0 = extractelement <8 x i32> %indices, i32 0
	%index1 = extractelement <8 x i32> %indices, i32 1
	%index2 = extractelement <8 x i32> %indices, i32 2
	%index3 = extractelement <8 x i32> %indices, i32 3
	%index4 = extractelement <8 x i32> %indices, i32 4
	%index5 = extractelement <8 x i32> %indices, i32 5
	%index6 = extractelement <8 x i32> %indices, i32 6
	%index7 = extractelement <8 x i32> %indices, i32 7
	%v0 = extractelement <8 x i32> %v, i32 %index0
	%v1 = extractelement <8 x i32> %v, i32 %index1
	%v2 = extractelement <8 x i32> %v, i32 %index2
	%v3 = extractelement <8 x i32> %v, i32 %index3
	%v4 = extractelement <8 x i32> %v, i32 %index4
	%v5 = extractelement <8 x i32> %v, i32 %index5
	%v6 = extractelement <8 x i32> %v, i32 %index6
	%v7 = extractelement <8 x i32> %v, i32 %index7
	%ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
	%ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
	%ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
	%ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
	%ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
	%ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
	%ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
	%ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
	ret <8 x i32> %ret7
	}

	define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
	; AVX1-LABEL: var_shuffle_v16i16:
	; AVX1: # %bb.0:
	; AVX1-NEXT: pushq %rbp
	; AVX1-NEXT: movq %rsp, %rbp
	; AVX1-NEXT: andq $-32, %rsp
	; AVX1-NEXT: subq $64, %rsp
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
	; AVX1-NEXT: vmovd %xmm2, %eax
	; AVX1-NEXT: vmovaps %ymm0, (%rsp)
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX1-NEXT: vpextrw $1, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vpextrw $2, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vpextrw $3, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vpextrw $4, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vpextrw $5, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vpextrw $6, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vpextrw $7, %xmm2, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX1-NEXT: vmovd %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX1-NEXT: vmovd %eax, %xmm2
	; AVX1-NEXT: vpextrw $1, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX1-NEXT: vpextrw $2, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX1-NEXT: vpextrw $3, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX1-NEXT: vpextrw $4, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX1-NEXT: vpextrw $5, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX1-NEXT: vpextrw $6, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX1-NEXT: vpextrw $7, %xmm1, %eax
	; AVX1-NEXT: andl $15, %eax
	; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: movq %rbp, %rsp
	; AVX1-NEXT: popq %rbp
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: var_shuffle_v16i16:
	; AVX2: # %bb.0:
	; AVX2-NEXT: pushq %rbp
	; AVX2-NEXT: movq %rsp, %rbp
	; AVX2-NEXT: andq $-32, %rsp
	; AVX2-NEXT: subq $64, %rsp
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX2-NEXT: vmovd %xmm2, %eax
	; AVX2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX2-NEXT: vmovd %eax, %xmm0
	; AVX2-NEXT: vpextrw $1, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vpextrw $2, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vpextrw $3, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vpextrw $4, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vpextrw $5, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vpextrw $6, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vpextrw $7, %xmm2, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX2-NEXT: vmovd %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX2-NEXT: vmovd %eax, %xmm2
	; AVX2-NEXT: vpextrw $1, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX2-NEXT: vpextrw $2, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX2-NEXT: vpextrw $3, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX2-NEXT: vpextrw $4, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX2-NEXT: vpextrw $5, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX2-NEXT: vpextrw $6, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX2-NEXT: vpextrw $7, %xmm1, %eax
	; AVX2-NEXT: andl $15, %eax
	; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX2-NEXT: movq %rbp, %rsp
	; AVX2-NEXT: popq %rbp
	; AVX2-NEXT: retq
	;
	; AVX512F-LABEL: var_shuffle_v16i16:
	; AVX512F: # %bb.0:
	; AVX512F-NEXT: pushq %rbp
	; AVX512F-NEXT: movq %rsp, %rbp
	; AVX512F-NEXT: andq $-32, %rsp
	; AVX512F-NEXT: subq $64, %rsp
	; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX512F-NEXT: vmovd %xmm2, %eax
	; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX512F-NEXT: vmovd %eax, %xmm0
	; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512F-NEXT: vmovd %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX512F-NEXT: vmovd %eax, %xmm2
	; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
	; AVX512F-NEXT: andl $15, %eax
	; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
	; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX512F-NEXT: movq %rbp, %rsp
	; AVX512F-NEXT: popq %rbp
	; AVX512F-NEXT: retq
	;
	; AVX512VL-LABEL: var_shuffle_v16i16:
	; AVX512VL: # %bb.0:
	; AVX512VL-NEXT: pushq %rbp
	; AVX512VL-NEXT: movq %rsp, %rbp
	; AVX512VL-NEXT: andq $-32, %rsp
	; AVX512VL-NEXT: subq $64, %rsp
	; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX512VL-NEXT: vmovd %xmm2, %eax
	; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX512VL-NEXT: vmovd %eax, %xmm0
	; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
	; AVX512VL-NEXT: vmovd %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
	; AVX512VL-NEXT: vmovd %eax, %xmm2
	; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
	; AVX512VL-NEXT: andl $15, %eax
	; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
	; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX512VL-NEXT: movq %rbp, %rsp
	; AVX512VL-NEXT: popq %rbp
	; AVX512VL-NEXT: retq
	;
	; AVX512VLBW-LABEL: var_shuffle_v16i16:
	; AVX512VLBW: # %bb.0:
	; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
	; AVX512VLBW-NEXT: retq
	%index0 = extractelement <16 x i16> %indices, i32 0
	%index1 = extractelement <16 x i16> %indices, i32 1
	%index2 = extractelement <16 x i16> %indices, i32 2
	%index3 = extractelement <16 x i16> %indices, i32 3
	%index4 = extractelement <16 x i16> %indices, i32 4
	%index5 = extractelement <16 x i16> %indices, i32 5
	%index6 = extractelement <16 x i16> %indices, i32 6
	%index7 = extractelement <16 x i16> %indices, i32 7
	%index8 = extractelement <16 x i16> %indices, i32 8
	%index9 = extractelement <16 x i16> %indices, i32 9
	%index10 = extractelement <16 x i16> %indices, i32 10
	%index11 = extractelement <16 x i16> %indices, i32 11
	%index12 = extractelement <16 x i16> %indices, i32 12
	%index13 = extractelement <16 x i16> %indices, i32 13
	%index14 = extractelement <16 x i16> %indices, i32 14
	%index15 = extractelement <16 x i16> %indices, i32 15
	%v0 = extractelement <16 x i16> %v, i16 %index0
	%v1 = extractelement <16 x i16> %v, i16 %index1
	%v2 = extractelement <16 x i16> %v, i16 %index2
	%v3 = extractelement <16 x i16> %v, i16 %index3
	%v4 = extractelement <16 x i16> %v, i16 %index4
	%v5 = extractelement <16 x i16> %v, i16 %index5
	%v6 = extractelement <16 x i16> %v, i16 %index6
	%v7 = extractelement <16 x i16> %v, i16 %index7
	%v8 = extractelement <16 x i16> %v, i16 %index8
	%v9 = extractelement <16 x i16> %v, i16 %index9
	%v10 = extractelement <16 x i16> %v, i16 %index10
	%v11 = extractelement <16 x i16> %v, i16 %index11
	%v12 = extractelement <16 x i16> %v, i16 %index12
	%v13 = extractelement <16 x i16> %v, i16 %index13
	%v14 = extractelement <16 x i16> %v, i16 %index14
	%v15 = extractelement <16 x i16> %v, i16 %index15
	%ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
	%ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
	%ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
	%ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
	%ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
	%ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
	%ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
	%ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
	%ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
	%ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
	%ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
	%ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
	%ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
	%ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
	%ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
	%ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
	ret <16 x i16> %ret15
	}

	define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
	; AVX1-LABEL: var_shuffle_v32i8:
	; AVX1: # %bb.0:
	; AVX1-NEXT: pushq %rbp
	; AVX1-NEXT: movq %rsp, %rbp
	; AVX1-NEXT: andq $-32, %rsp
	; AVX1-NEXT: subq $64, %rsp
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
	; AVX1-NEXT: vpextrb $0, %xmm2, %eax
	; AVX1-NEXT: vmovaps %ymm0, (%rsp)
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX1-NEXT: vpextrb $1, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $2, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $3, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $4, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $5, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $6, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $7, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $8, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $9, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $10, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $11, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $12, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $13, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $14, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $15, %xmm2, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
	; AVX1-NEXT: vpextrb $0, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vmovd %eax, %xmm2
	; AVX1-NEXT: vpextrb $1, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $2, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $3, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $4, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $5, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $6, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $7, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $8, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $9, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $10, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $11, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $12, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $13, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $14, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
	; AVX1-NEXT: vpextrb $15, %xmm1, %eax
	; AVX1-NEXT: andl $31, %eax
	; AVX1-NEXT: movzbl (%rsp,%rax), %eax
	; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: movq %rbp, %rsp
	; AVX1-NEXT: popq %rbp
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: var_shuffle_v32i8:
	; AVX2: # %bb.0:
	; AVX2-NEXT: pushq %rbp
	; AVX2-NEXT: movq %rsp, %rbp
	; AVX2-NEXT: andq $-32, %rsp
	; AVX2-NEXT: subq $64, %rsp
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX2-NEXT: vpextrb $0, %xmm2, %eax
	; AVX2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vmovd %eax, %xmm0
	; AVX2-NEXT: vpextrb $1, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $2, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $3, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $4, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $5, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $6, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $7, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $8, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $9, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $10, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $11, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $12, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $13, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $14, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $15, %xmm2, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
	; AVX2-NEXT: vpextrb $0, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vmovd %eax, %xmm2
	; AVX2-NEXT: vpextrb $1, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $2, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $3, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $4, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $5, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $6, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $7, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $8, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $9, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $10, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $11, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $12, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $13, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $14, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
	; AVX2-NEXT: vpextrb $15, %xmm1, %eax
	; AVX2-NEXT: andl $31, %eax
	; AVX2-NEXT: movzbl (%rsp,%rax), %eax
	; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX2-NEXT: movq %rbp, %rsp
	; AVX2-NEXT: popq %rbp
	; AVX2-NEXT: retq
	;
	; AVX512F-LABEL: var_shuffle_v32i8:
	; AVX512F: # %bb.0:
	; AVX512F-NEXT: pushq %rbp
	; AVX512F-NEXT: movq %rsp, %rbp
	; AVX512F-NEXT: andq $-32, %rsp
	; AVX512F-NEXT: subq $64, %rsp
	; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
	; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vmovd %eax, %xmm0
	; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
	; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vmovd %eax, %xmm2
	; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
	; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
	; AVX512F-NEXT: andl $31, %eax
	; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
	; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX512F-NEXT: movq %rbp, %rsp
	; AVX512F-NEXT: popq %rbp
	; AVX512F-NEXT: retq
	;
	; AVX512VL-LABEL: var_shuffle_v32i8:
	; AVX512VL: # %bb.0:
	; AVX512VL-NEXT: pushq %rbp
	; AVX512VL-NEXT: movq %rsp, %rbp
	; AVX512VL-NEXT: andq $-32, %rsp
	; AVX512VL-NEXT: subq $64, %rsp
	; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
	; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vmovd %eax, %xmm0
	; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
	; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vmovd %eax, %xmm2
	; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
	; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
	; AVX512VL-NEXT: andl $31, %eax
	; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
	; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
	; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX512VL-NEXT: movq %rbp, %rsp
	; AVX512VL-NEXT: popq %rbp
	; AVX512VL-NEXT: retq
	;
	; VBMI-LABEL: var_shuffle_v32i8:
	; VBMI: # %bb.0:
	; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
	; VBMI-NEXT: retq
	%index0 = extractelement <32 x i8> %indices, i32 0
	%index1 = extractelement <32 x i8> %indices, i32 1
	%index2 = extractelement <32 x i8> %indices, i32 2
	%index3 = extractelement <32 x i8> %indices, i32 3
	%index4 = extractelement <32 x i8> %indices, i32 4
	%index5 = extractelement <32 x i8> %indices, i32 5
	%index6 = extractelement <32 x i8> %indices, i32 6
	%index7 = extractelement <32 x i8> %indices, i32 7
	%index8 = extractelement <32 x i8> %indices, i32 8
	%index9 = extractelement <32 x i8> %indices, i32 9
	%index10 = extractelement <32 x i8> %indices, i32 10
	%index11 = extractelement <32 x i8> %indices, i32 11
	%index12 = extractelement <32 x i8> %indices, i32 12
	%index13 = extractelement <32 x i8> %indices, i32 13
	%index14 = extractelement <32 x i8> %indices, i32 14
	%index15 = extractelement <32 x i8> %indices, i32 15
	%index16 = extractelement <32 x i8> %indices, i32 16
	%index17 = extractelement <32 x i8> %indices, i32 17
	%index18 = extractelement <32 x i8> %indices, i32 18
	%index19 = extractelement <32 x i8> %indices, i32 19
	%index20 = extractelement <32 x i8> %indices, i32 20
	%index21 = extractelement <32 x i8> %indices, i32 21
	%index22 = extractelement <32 x i8> %indices, i32 22
	%index23 = extractelement <32 x i8> %indices, i32 23
	%index24 = extractelement <32 x i8> %indices, i32 24
	%index25 = extractelement <32 x i8> %indices, i32 25
	%index26 = extractelement <32 x i8> %indices, i32 26
	%index27 = extractelement <32 x i8> %indices, i32 27
	%index28 = extractelement <32 x i8> %indices, i32 28
	%index29 = extractelement <32 x i8> %indices, i32 29
	%index30 = extractelement <32 x i8> %indices, i32 30
	%index31 = extractelement <32 x i8> %indices, i32 31
	%v0 = extractelement <32 x i8> %v, i8 %index0
	%v1 = extractelement <32 x i8> %v, i8 %index1
	%v2 = extractelement <32 x i8> %v, i8 %index2
	%v3 = extractelement <32 x i8> %v, i8 %index3
	%v4 = extractelement <32 x i8> %v, i8 %index4
	%v5 = extractelement <32 x i8> %v, i8 %index5
	%v6 = extractelement <32 x i8> %v, i8 %index6
	%v7 = extractelement <32 x i8> %v, i8 %index7
	%v8 = extractelement <32 x i8> %v, i8 %index8
	%v9 = extractelement <32 x i8> %v, i8 %index9
	%v10 = extractelement <32 x i8> %v, i8 %index10
	%v11 = extractelement <32 x i8> %v, i8 %index11
	%v12 = extractelement <32 x i8> %v, i8 %index12
	%v13 = extractelement <32 x i8> %v, i8 %index13
	%v14 = extractelement <32 x i8> %v, i8 %index14
	%v15 = extractelement <32 x i8> %v, i8 %index15
	%v16 = extractelement <32 x i8> %v, i8 %index16
	%v17 = extractelement <32 x i8> %v, i8 %index17
	%v18 = extractelement <32 x i8> %v, i8 %index18
	%v19 = extractelement <32 x i8> %v, i8 %index19
	%v20 = extractelement <32 x i8> %v, i8 %index20
	%v21 = extractelement <32 x i8> %v, i8 %index21
	%v22 = extractelement <32 x i8> %v, i8 %index22
	%v23 = extractelement <32 x i8> %v, i8 %index23
	%v24 = extractelement <32 x i8> %v, i8 %index24
	%v25 = extractelement <32 x i8> %v, i8 %index25
	%v26 = extractelement <32 x i8> %v, i8 %index26
	%v27 = extractelement <32 x i8> %v, i8 %index27
	%v28 = extractelement <32 x i8> %v, i8 %index28
	%v29 = extractelement <32 x i8> %v, i8 %index29
	%v30 = extractelement <32 x i8> %v, i8 %index30
	%v31 = extractelement <32 x i8> %v, i8 %index31
	%ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
	%ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
	%ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
	%ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
	%ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
	%ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
	%ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
	%ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
	%ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
	%ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
	%ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
	%ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
	%ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
	%ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
	%ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
	%ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
	%ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
	%ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
	%ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
	%ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
	%ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
	%ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
	%ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
	%ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
	%ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
	%ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
	%ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
	%ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
	%ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
	%ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
	%ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
	%ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
	ret <32 x i8> %ret31
	}

	define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
	; AVX1-LABEL: var_shuffle_v4f64:
	; AVX1: # %bb.0:
	; AVX1-NEXT: pushq %rbp
	; AVX1-NEXT: movq %rsp, %rbp
	; AVX1-NEXT: andq $-32, %rsp
	; AVX1-NEXT: subq $64, %rsp
	; AVX1-NEXT: vmovq %xmm1, %rax
	; AVX1-NEXT: andl $3, %eax
	; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX1-NEXT: andl $3, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
	; AVX1-NEXT: vmovq %xmm1, %rdx
	; AVX1-NEXT: andl $3, %edx
	; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
	; AVX1-NEXT: andl $3, %esi
	; AVX1-NEXT: vmovaps %ymm0, (%rsp)
	; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
	; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: movq %rbp, %rsp
	; AVX1-NEXT: popq %rbp
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: var_shuffle_v4f64:
	; AVX2: # %bb.0:
	; AVX2-NEXT: pushq %rbp
	; AVX2-NEXT: movq %rsp, %rbp
	; AVX2-NEXT: andq $-32, %rsp
	; AVX2-NEXT: subq $64, %rsp
	; AVX2-NEXT: vmovq %xmm1, %rax
	; AVX2-NEXT: andl $3, %eax
	; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX2-NEXT: andl $3, %ecx
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
	; AVX2-NEXT: vmovq %xmm1, %rdx
	; AVX2-NEXT: andl $3, %edx
	; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
	; AVX2-NEXT: andl $3, %esi
	; AVX2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
	; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
	; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX2-NEXT: movq %rbp, %rsp
	; AVX2-NEXT: popq %rbp
	; AVX2-NEXT: retq
	;
	; AVX512F-LABEL: var_shuffle_v4f64:
	; AVX512F: # %bb.0:
	; AVX512F-NEXT: pushq %rbp
	; AVX512F-NEXT: movq %rsp, %rbp
	; AVX512F-NEXT: andq $-32, %rsp
	; AVX512F-NEXT: subq $64, %rsp
	; AVX512F-NEXT: vmovq %xmm1, %rax
	; AVX512F-NEXT: andl $3, %eax
	; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX512F-NEXT: andl $3, %ecx
	; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
	; AVX512F-NEXT: vmovq %xmm1, %rdx
	; AVX512F-NEXT: andl $3, %edx
	; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
	; AVX512F-NEXT: andl $3, %esi
	; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
	; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
	; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
	; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
	; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX512F-NEXT: movq %rbp, %rsp
	; AVX512F-NEXT: popq %rbp
	; AVX512F-NEXT: retq
	;
	; AVX512VL-LABEL: var_shuffle_v4f64:
	; AVX512VL: # %bb.0:
	; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
	; AVX512VL-NEXT: retq
	;
	; AVX512VLBW-LABEL: var_shuffle_v4f64:
	; AVX512VLBW: # %bb.0:
	; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
	; AVX512VLBW-NEXT: retq
	%index0 = extractelement <4 x i64> %indices, i32 0
	%index1 = extractelement <4 x i64> %indices, i32 1
	%index2 = extractelement <4 x i64> %indices, i32 2
	%index3 = extractelement <4 x i64> %indices, i32 3
	%v0 = extractelement <4 x double> %v, i64 %index0
	%v1 = extractelement <4 x double> %v, i64 %index1
	%v2 = extractelement <4 x double> %v, i64 %index2
	%v3 = extractelement <4 x double> %v, i64 %index3
	%ret0 = insertelement <4 x double> undef, double %v0, i32 0
	%ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
	%ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
	%ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
	ret <4 x double> %ret3
	}

	define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
	; AVX1-LABEL: var_shuffle_v8f32:
	; AVX1: # %bb.0:
	; AVX1-NEXT: pushq %rbp
	; AVX1-NEXT: movq %rsp, %rbp
	; AVX1-NEXT: andq $-32, %rsp
	; AVX1-NEXT: subq $64, %rsp
	; AVX1-NEXT: vpextrq $1, %xmm1, %r8
	; AVX1-NEXT: movq %r8, %rcx
	; AVX1-NEXT: shrq $30, %rcx
	; AVX1-NEXT: vmovq %xmm1, %r9
	; AVX1-NEXT: movq %r9, %rdx
	; AVX1-NEXT: shrq $30, %rdx
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
	; AVX1-NEXT: vpextrq $1, %xmm1, %r10
	; AVX1-NEXT: movq %r10, %rdi
	; AVX1-NEXT: shrq $30, %rdi
	; AVX1-NEXT: vmovq %xmm1, %rax
	; AVX1-NEXT: movq %rax, %rsi
	; AVX1-NEXT: shrq $30, %rsi
	; AVX1-NEXT: vmovaps %ymm0, (%rsp)
	; AVX1-NEXT: andl $7, %r9d
	; AVX1-NEXT: andl $28, %edx
	; AVX1-NEXT: andl $7, %r8d
	; AVX1-NEXT: andl $28, %ecx
	; AVX1-NEXT: andl $7, %eax
	; AVX1-NEXT: andl $28, %esi
	; AVX1-NEXT: andl $7, %r10d
	; AVX1-NEXT: andl $28, %edi
	; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
	; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
	; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
	; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: movq %rbp, %rsp
	; AVX1-NEXT: popq %rbp
	; AVX1-NEXT: retq
	;
	; INT256-LABEL: var_shuffle_v8f32:
	; INT256: # %bb.0:
	; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; INT256-NEXT: retq
	%index0 = extractelement <8 x i32> %indices, i32 0
	%index1 = extractelement <8 x i32> %indices, i32 1
	%index2 = extractelement <8 x i32> %indices, i32 2
	%index3 = extractelement <8 x i32> %indices, i32 3
	%index4 = extractelement <8 x i32> %indices, i32 4
	%index5 = extractelement <8 x i32> %indices, i32 5
	%index6 = extractelement <8 x i32> %indices, i32 6
	%index7 = extractelement <8 x i32> %indices, i32 7
	%v0 = extractelement <8 x float> %v, i32 %index0
	%v1 = extractelement <8 x float> %v, i32 %index1
	%v2 = extractelement <8 x float> %v, i32 %index2
	%v3 = extractelement <8 x float> %v, i32 %index3
	%v4 = extractelement <8 x float> %v, i32 %index4
	%v5 = extractelement <8 x float> %v, i32 %index5
	%v6 = extractelement <8 x float> %v, i32 %index6
	%v7 = extractelement <8 x float> %v, i32 %index7
	%ret0 = insertelement <8 x float> undef, float %v0, i32 0
	%ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
	%ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
	%ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
	%ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
	%ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
	%ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
	%ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
	ret <8 x float> %ret7
	}
	+
	+define <8 x i32> @pr35820(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
	+; AVX1-LABEL: pr35820:
	+; AVX1: # %bb.0: # %entry
	+; AVX1-NEXT: vpextrq $1, %xmm1, %r8
	+; AVX1-NEXT: movq %r8, %r10
	+; AVX1-NEXT: shrq $30, %r10
	+; AVX1-NEXT: vmovq %xmm1, %r9
	+; AVX1-NEXT: movq %r9, %rsi
	+; AVX1-NEXT: shrq $30, %rsi
	+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	+; AVX1-NEXT: andl $3, %r9d
	+; AVX1-NEXT: andl $12, %esi
	+; AVX1-NEXT: andl $3, %r8d
	+; AVX1-NEXT: andl $12, %r10d
	+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
	+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
	+; AVX1-NEXT: movq %rax, %rdi
	+; AVX1-NEXT: shrq $30, %rdi
	+; AVX1-NEXT: vmovq %xmm0, %rcx
	+; AVX1-NEXT: movq %rcx, %rdx
	+; AVX1-NEXT: shrq $30, %rdx
	+; AVX1-NEXT: andl $3, %ecx
	+; AVX1-NEXT: andl $12, %edx
	+; AVX1-NEXT: andl $3, %eax
	+; AVX1-NEXT: andl $12, %edi
	+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	+; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0
	+; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
	+; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0
	+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
	+; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1
	+; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1
	+; AVX1-NEXT: vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1
	+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	+; AVX1-NEXT: retq
	+;
	+; INT256-LABEL: pr35820:
	+; INT256: # %bb.0: # %entry
	+; INT256-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
	+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
	+; INT256-NEXT: retq
	+entry:
	+ %tmp1 = extractelement <8 x i32> %indices, i32 0
	+ %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
	+ %tmp2 = extractelement <8 x i32> %indices, i32 1
	+ %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
	+ %tmp3 = extractelement <8 x i32> %indices, i32 2
	+ %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
	+ %tmp4 = extractelement <8 x i32> %indices, i32 3
	+ %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
	+ %tmp5 = extractelement <8 x i32> %indices, i32 4
	+ %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
	+ %tmp6 = extractelement <8 x i32> %indices, i32 5
	+ %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
	+ %tmp7 = extractelement <8 x i32> %indices, i32 6
	+ %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
	+ %tmp8 = extractelement <8 x i32> %indices, i32 7
	+ %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
	+ %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
	+ %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
	+ %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
	+ %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
	+ %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
	+ %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
	+ %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
	+ %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
	+ ret <8 x i32> %tmp16
	+}
	+
	+define <8 x float> @pr35820_float(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
	+; AVX1-LABEL: pr35820_float:
	+; AVX1: # %bb.0: # %entry
	+; AVX1-NEXT: vpextrq $1, %xmm1, %r8
	+; AVX1-NEXT: movq %r8, %r10
	+; AVX1-NEXT: shrq $30, %r10
	+; AVX1-NEXT: vmovq %xmm1, %r9
	+; AVX1-NEXT: movq %r9, %rdx
	+; AVX1-NEXT: shrq $30, %rdx
	+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	+; AVX1-NEXT: andl $3, %r9d
	+; AVX1-NEXT: andl $12, %edx
	+; AVX1-NEXT: andl $3, %r8d
	+; AVX1-NEXT: andl $12, %r10d
	+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
	+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
	+; AVX1-NEXT: movq %rax, %rdi
	+; AVX1-NEXT: shrq $30, %rdi
	+; AVX1-NEXT: vmovq %xmm0, %rcx
	+; AVX1-NEXT: movq %rcx, %rsi
	+; AVX1-NEXT: shrq $30, %rsi
	+; AVX1-NEXT: andl $3, %ecx
	+; AVX1-NEXT: andl $12, %esi
	+; AVX1-NEXT: andl $3, %eax
	+; AVX1-NEXT: andl $12, %edi
	+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
	+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
	+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
	+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
	+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	+; AVX1-NEXT: retq
	+;
	+; INT256-LABEL: pr35820_float:
	+; INT256: # %bb.0: # %entry
	+; INT256-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
	+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
	+; INT256-NEXT: retq
	+entry:
	+ %tmp1 = extractelement <8 x i32> %indices, i32 0
	+ %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
	+ %tmp2 = extractelement <8 x i32> %indices, i32 1
	+ %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
	+ %tmp3 = extractelement <8 x i32> %indices, i32 2
	+ %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
	+ %tmp4 = extractelement <8 x i32> %indices, i32 3
	+ %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
	+ %tmp5 = extractelement <8 x i32> %indices, i32 4
	+ %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
	+ %tmp6 = extractelement <8 x i32> %indices, i32 5
	+ %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
	+ %tmp7 = extractelement <8 x i32> %indices, i32 6
	+ %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
	+ %tmp8 = extractelement <8 x i32> %indices, i32 7
	+ %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
	+ %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
	+ %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
	+ %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
	+ %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
	+ %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
	+ %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
	+ %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
	+ %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
	+ ret <8 x float> %tmp16
	+}
	+
	+define <4 x i32> @big_source(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
	+; AVX-LABEL: big_source:
	+; AVX: # %bb.0: # %entry
	+; AVX-NEXT: pushq %rbp
	+; AVX-NEXT: movq %rsp, %rbp
	+; AVX-NEXT: andq $-32, %rsp
	+; AVX-NEXT: subq $64, %rsp
	+; AVX-NEXT: vmovq %xmm1, %rax
	+; AVX-NEXT: movq %rax, %rcx
	+; AVX-NEXT: shrq $30, %rcx
	+; AVX-NEXT: andl $28, %ecx
	+; AVX-NEXT: vpextrq $1, %xmm1, %rdx
	+; AVX-NEXT: movq %rdx, %rsi
	+; AVX-NEXT: sarq $32, %rsi
	+; AVX-NEXT: andl $7, %eax
	+; AVX-NEXT: andl $7, %edx
	+; AVX-NEXT: vmovaps %ymm0, (%rsp)
	+; AVX-NEXT: andl $7, %esi
	+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	+; AVX-NEXT: vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0
	+; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
	+; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
	+; AVX-NEXT: movq %rbp, %rsp
	+; AVX-NEXT: popq %rbp
	+; AVX-NEXT: vzeroupper
	+; AVX-NEXT: retq
	+entry:
	+ %tmp1 = extractelement <4 x i32> %indices, i32 0
	+ %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
	+ %tmp2 = extractelement <4 x i32> %indices, i32 1
	+ %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
	+ %tmp3 = extractelement <4 x i32> %indices, i32 2
	+ %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
	+ %tmp4 = extractelement <4 x i32> %indices, i32 3
	+ %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
	+ %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
	+ %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
	+ %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
	+ %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
	+ ret <4 x i32> %tmp12
	+}
	Index: vendor/llvm/dist-release_60/test/MC/COFF/cv-inline-linetable.s
	===================================================================
	--- vendor/llvm/dist-release_60/test/MC/COFF/cv-inline-linetable.s (revision 328361)
	+++ vendor/llvm/dist-release_60/test/MC/COFF/cv-inline-linetable.s (revision 328362)
	@@ -1,137 +1,163 @@
	# RUN: llvm-mc -triple=i686-pc-win32 -filetype=obj < %s \| llvm-readobj -codeview \| FileCheck %s
	.text
	.def @feat.00;
	.scl 3;
	.type 0;
	.endef
	.globl @feat.00
	@feat.00 = 1
	.def "?baz@@YAXXZ";
	.scl 2;
	.type 32;
	.endef
	.globl "?baz@@YAXXZ"
	.p2align 4, 0x90
	"?baz@@YAXXZ": # @"\01?baz@@YAXXZ"
	Lfunc_begin0:
	.cv_file 1 "D:\\src\\llvm\\build\\t.cpp"
	.cv_func_id 0
	.cv_inline_site_id 1 within 0 inlined_at 1 15 3
	.cv_inline_site_id 2 within 1 inlined_at 1 10 3
	.cv_loc 0 1 13 0 is_stmt 0 # t.cpp:13:0
	# %bb.0: # %entry
	pushl %eax
	.cv_loc 0 1 14 5 # t.cpp:14:5
	addl $6, "?x@@3HC"
	.cv_loc 1 1 9 5 # t.cpp:9:5
	addl $4, "?x@@3HC"
	.cv_loc 2 1 3 7 # t.cpp:3:7
	movl $1, (%esp)
	leal (%esp), %eax
	.cv_loc 2 1 4 5 # t.cpp:4:5
	addl %eax, "?x@@3HC"
	.cv_loc 2 1 5 5 # t.cpp:5:5
	addl $2, "?x@@3HC"
	.cv_loc 2 1 6 5 # t.cpp:6:5
	addl $3, "?x@@3HC"
	.cv_loc 1 1 11 5 # t.cpp:11:5
	addl $5, "?x@@3HC"
	.cv_loc 0 1 16 5 # t.cpp:16:5
	addl $7, "?x@@3HC"
	.cv_loc 0 1 17 1 # t.cpp:17:1
	popl %eax
	retl
	Lfunc_end0:

	.section .debug$T,"dr"
	.long 4
	.short 6
	.short 4609
	.long 0
	.short 14
	.short 4104
	.asciz "\003\000\000\000\000\000\000\000\000\020\000"
	.short 14
	.short 5633
	.asciz "\000\000\000\000\001\020\000"
	.ascii "baz"
	.byte 0
	.short 14
	.short 5633
	.asciz "\000\000\000\000\001\020\000"
	.ascii "bar"
	.byte 0
	.short 14
	.short 5633
	.asciz "\000\000\000\000\001\020\000"
	.ascii "foo"
	.byte 0
	.section .debug$S,"dr"
	.long 4
	.long 241 # Symbol subsection for baz
	.long Ltmp1-Ltmp0
	Ltmp0:
	.short Ltmp3-Ltmp2
	Ltmp2:
	.short 4423
	.zero 12
	.long Lfunc_end0-"?baz@@YAXXZ"
	.zero 12
	.secrel32 "?baz@@YAXXZ"
	.secidx "?baz@@YAXXZ"
	.byte 0
	.ascii "baz"
	.byte 0
	Ltmp3:
	.short Ltmp5-Ltmp4
	Ltmp4:
	.short 4429
	.asciz "\000\000\000\000\000\000\000\000\003\020\000"
	.cv_inline_linetable 1 1 9 Lfunc_begin0 Lfunc_end0
	# CHECK: InlineSiteSym {
	# CHECK: PtrParent: 0x0
	# CHECK: PtrEnd: 0x0
	# CHECK: Inlinee: bar (0x1003)
	# CHECK: BinaryAnnotations [
	# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x8, LineOffset: 0}
	# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x7, LineOffset: 1}
	# CHECK-NEXT: ChangeLineOffset: 1
	# CHECK-NEXT: ChangeCodeOffset: 0x1E
	# CHECK-NEXT: ChangeCodeLength: 0x7
	# CHECK-NEXT: ]
	# CHECK: }
	Ltmp5:
	.short Ltmp7-Ltmp6
	Ltmp6:
	.short 4429
	.asciz "\000\000\000\000\000\000\000\000\004\020\000"
	.cv_inline_linetable 2 1 3 Lfunc_begin0 Lfunc_end0
	# CHECK: InlineSiteSym {
	# CHECK: PtrParent: 0x0
	# CHECK: PtrEnd: 0x0
	# CHECK: Inlinee: foo (0x1004)
	# CHECK: BinaryAnnotations [
	# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xF, LineOffset: 0}
	# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xA, LineOffset: 1}
	# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x6, LineOffset: 1}
	# CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x7, LineOffset: 1}
	# CHECK-NEXT: ChangeCodeLength: 0x7
	# CHECK-NEXT: ]
	# CHECK: }
	Ltmp7:
	.short 2
	.short 4430
	# CHECK: InlineSiteEnd {
	# CHECK: }
	.short 2
	.short 4430
	# CHECK: InlineSiteEnd {
	# CHECK: }
	.short 2
	.short 4431
	Ltmp1:
	.p2align 2
	.cv_linetable 0, "?baz@@YAXXZ", Lfunc_end0
	.cv_filechecksums # File index to string table offset subsection
	.cv_stringtable # String table

	+# CHECK-LABEL: FunctionLineTable [
	+# CHECK: LinkageName: ?baz@@YAXXZ
	+# CHECK: Flags: 0x1
	+# CHECK: CodeSize: 0x3D
	+# CHECK: FilenameSegment [
	+# CHECK: Filename: D:\src\llvm\build\t.cpp (0x0)
	+# CHECK: +0x0 [
	+# CHECK: LineNumberStart: 13
	+# CHECK: ]
	+# CHECK: +0x1 [
	+# CHECK: LineNumberStart: 14
	+# CHECK: ]
	+# CHECK: +0x8 [
	+# CHECK: LineNumberStart: 15
	+# CHECK: ]
	+# There shouldn't be any other line number entries because all the other
	+# .cv_locs are on line 15 where the top-level inline call site is.
	+# CHECK-NOT: LineNumberStart
	+# CHECK: +0x34 [
	+# CHECK: LineNumberStart: 16
	+# CHECK: ]
	+# CHECK: +0x3B [
	+# CHECK: LineNumberStart: 17
	+# CHECK: ]
	+# CHECK: ]
	+# CHECK: ]
	Index: vendor/llvm/dist-release_60/test/MC/X86/x86-64.s
	===================================================================
	--- vendor/llvm/dist-release_60/test/MC/X86/x86-64.s (revision 328361)
	+++ vendor/llvm/dist-release_60/test/MC/X86/x86-64.s (revision 328362)
	@@ -1,1559 +1,1595 @@
	// RUN: llvm-mc -triple x86_64-unknown-unknown -show-encoding %s > %t 2> %t.err
	// RUN: FileCheck < %t %s
	// RUN: FileCheck --check-prefix=CHECK-STDERR < %t.err %s

	monitor
	// CHECK: monitor
	// CHECK: encoding: [0x0f,0x01,0xc8]
	monitor %rax, %rcx, %rdx
	// CHECK: monitor
	// CHECK: encoding: [0x0f,0x01,0xc8]
	mwait
	// CHECK: mwait
	// CHECK: encoding: [0x0f,0x01,0xc9]
	mwait %rax, %rcx
	// CHECK: mwait
	// CHECK: encoding: [0x0f,0x01,0xc9]

	// Suffix inference:

	// CHECK: addl $0, %eax
	add $0, %eax
	// CHECK: addb $255, %al
	add $0xFF, %al
	// CHECK: orq %rax, %rdx
	or %rax, %rdx
	// CHECK: shlq $3, %rax
	shl $3, %rax


	// CHECK: subb %al, %al
	subb %al, %al

	// CHECK: addl $24, %eax
	addl $24, %eax

	// CHECK: movl %eax, 10(%ebp)
	movl %eax, 10(%ebp)
	// CHECK: movl %eax, 10(%ebp,%ebx)
	movl %eax, 10(%ebp, %ebx)
	// CHECK: movl %eax, 10(%ebp,%ebx,4)
	movl %eax, 10(%ebp, %ebx, 4)
	// CHECK: movl %eax, 10(,%ebx,4)
	movl %eax, 10(, %ebx, 4)

	// CHECK: movl 0, %eax
	movl 0, %eax
	// CHECK: movl $0, %eax
	movl $0, %eax

	// CHECK: ret
	ret

	// CHECK: retw
	retw

	// FIXME: Check that this matches SUB32ri8
	// CHECK: subl $1, %eax
	subl $1, %eax

	// FIXME: Check that this matches SUB32ri8
	// CHECK: subl $-1, %eax
	subl $-1, %eax

	// FIXME: Check that this matches SUB32ri
	// CHECK: subl $256, %eax
	subl $256, %eax

	// FIXME: Check that this matches XOR64ri8
	// CHECK: xorq $1, %rax
	xorq $1, %rax

	// FIXME: Check that this matches XOR64ri32
	// CHECK: xorq $256, %rax
	xorq $256, %rax

	// FIXME: Check that this matches SUB8rr
	// CHECK: subb %al, %bl
	subb %al, %bl

	// FIXME: Check that this matches SUB16rr
	// CHECK: subw %ax, %bx
	subw %ax, %bx

	// FIXME: Check that this matches SUB32rr
	// CHECK: subl %eax, %ebx
	subl %eax, %ebx

	// FIXME: Check that this matches the correct instruction.
	// CHECK: callq *%rax
	call *%rax

	// FIXME: Check that this matches the correct instruction.
	// CHECK: shldl %cl, %eax, %ebx
	shldl %cl, %eax, %ebx

	// CHECK: shll $2, %eax
	shll $2, %eax

	// CHECK: shll $2, %eax
	sall $2, %eax

	-// CHECK: rep movsb
	+// CHECK: rep
	+// CHECK-NEXT: movsb
	rep # comment
	movsb

	// CHECK: rep
	// CHECK: insb
	rep;insb

	// CHECK: rep
	// CHECK: outsb
	rep;outsb

	// CHECK: rep
	// CHECK: movsb
	rep;movsb


	// rdar://8470918
	smovb // CHECK: movsb
	smovw // CHECK: movsw
	smovl // CHECK: movsl
	smovq // CHECK: movsq

	// rdar://8456361
	// CHECK: rep
	// CHECK: movsl
	rep movsd

	// CHECK: rep
	// CHECK: lodsb
	rep;lodsb

	// CHECK: rep
	// CHECK: stosb
	rep;stosb

	// NOTE: repz and repe have the same opcode as rep
	// CHECK: rep
	// CHECK: cmpsb
	repz;cmpsb

	// NOTE: repnz has the same opcode as repne
	// CHECK: repne
	// CHECK: cmpsb
	repnz;cmpsb

	// NOTE: repe and repz have the same opcode as rep
	// CHECK: rep
	// CHECK: scasb
	repe;scasb

	// CHECK: repne
	// CHECK: scasb
	repne;scasb

	// CHECK: lock
	// CHECK: cmpxchgb %al, (%ebx)
	lock;cmpxchgb %al, 0(%ebx)

	// CHECK: cs
	// CHECK: movb (%eax), %al
	cs;movb 0(%eax), %al

	// CHECK: ss
	// CHECK: movb (%eax), %al
	ss;movb 0(%eax), %al

	// CHECK: ds
	// CHECK: movb (%eax), %al
	ds;movb 0(%eax), %al

	// CHECK: es
	// CHECK: movb (%eax), %al
	es;movb 0(%eax), %al

	// CHECK: fs
	// CHECK: movb (%eax), %al
	fs;movb 0(%eax), %al

	// CHECK: gs
	// CHECK: movb (%eax), %al
	gs;movb 0(%eax), %al

	// CHECK: fadd %st(0)
	// CHECK: fadd %st(1)
	// CHECK: fadd %st(7)

	fadd %st(0)
	fadd %st(1)
	fadd %st(7)

	// CHECK: leal 0, %eax
	leal 0, %eax

	// rdar://7986634 - Insensitivity on opcodes.
	// CHECK: int3
	INT3

	// rdar://8735979 - int $3 -> int3
	// CHECK: int3
	int $3


	// Allow scale factor without index register.
	// CHECK: movaps %xmm3, (%esi)
	// CHECK-STDERR: warning: scale factor without index register is ignored
	movaps %xmm3, (%esi, 2)

	// CHECK: imull $12, %eax
	imul $12, %eax

	// CHECK: imull %ecx, %eax
	imull %ecx, %eax


	// rdar://8208481
	// CHECK: outb %al, $161
	outb %al, $161
	// CHECK: outw %ax, $128
	outw %ax, $128
	// CHECK: inb $161, %al
	inb $161, %al

	// rdar://8017621
	// CHECK: pushq $1
	push $1

	// rdar://9716860
	pushq $1
	// CHECK: encoding: [0x6a,0x01]
	pushq $1111111
	// CHECK: encoding: [0x68,0x47,0xf4,0x10,0x00]

	// rdar://8017530
	// CHECK: sldtw 4
	sldt 4

	// rdar://8208499
	// CHECK: cmovnew %bx, %ax
	cmovnz %bx, %ax
	// CHECK: cmovneq %rbx, %rax
	cmovnzq %rbx, %rax


	// rdar://8407928
	// CHECK: inb $127, %al
	// CHECK: inw %dx, %ax
	// CHECK: outb %al, $127
	// CHECK: outw %ax, %dx
	// CHECK: inl %dx, %eax
	inb $0x7f
	inw %dx
	outb $0x7f
	outw %dx
	inl %dx


	// PR8114
	// CHECK: outb %al, %dx
	// CHECK: outb %al, %dx
	// CHECK: outw %ax, %dx
	// CHECK: outw %ax, %dx
	// CHECK: outl %eax, %dx
	// CHECK: outl %eax, %dx

	out %al, (%dx)
	outb %al, (%dx)
	out %ax, (%dx)
	outw %ax, (%dx)
	out %eax, (%dx)
	outl %eax, (%dx)

	// CHECK: inb %dx, %al
	// CHECK: inb %dx, %al
	// CHECK: inw %dx, %ax
	// CHECK: inw %dx, %ax
	// CHECK: inl %dx, %eax
	// CHECK: inl %dx, %eax

	in (%dx), %al
	inb (%dx), %al
	in (%dx), %ax
	inw (%dx), %ax
	in (%dx), %eax
	inl (%dx), %eax

	//PR15455

	// permitted invalid memory forms
	outs (%rsi), (%dx)
	// CHECK: outsw (%rsi), %dx
	outsb (%rsi), (%dx)
	// CHECK: outsb (%rsi), %dx
	outsw (%rsi), (%dx)
	// CHECK: outsw (%rsi), %dx
	outsl (%rsi), (%dx)
	// CHECK: outsl (%rsi), %dx

	ins (%dx), %es:(%rdi)
	// CHECK: insw %dx, %es:(%rdi)
	insb (%dx), %es:(%rdi)
	// CHECK: insb %dx, %es:(%rdi)
	insw (%dx), %es:(%rdi)
	// CHECK: insw %dx, %es:(%rdi)
	insl (%dx), %es:(%rdi)
	// CHECK: insl %dx, %es:(%rdi)

	// rdar://8431422

	// CHECK: fxch %st(1)
	// CHECK: fucom %st(1)
	// CHECK: fucomp %st(1)
	// CHECK: faddp %st(1)
	// CHECK: faddp %st(0)
	// CHECK: fsubp %st(1)
	// CHECK: fsubrp %st(1)
	// CHECK: fmulp %st(1)
	// CHECK: fdivp %st(1)
	// CHECK: fdivrp %st(1)

	fxch
	fucom
	fucomp
	faddp
	faddp %st
	fsubp
	fsubrp
	fmulp
	fdivp
	fdivrp

	// CHECK: fcomi %st(1)
	// CHECK: fcomi %st(2)
	// CHECK: fucomi %st(1)
	// CHECK: fucomi %st(2)
	// CHECK: fucomi %st(2)

	fcomi
	fcomi %st(2)
	fucomi
	fucomi %st(2)
	fucomi %st(2), %st

	// CHECK: fnstsw %ax
	// CHECK: fnstsw %ax
	// CHECK: fnstsw %ax
	// CHECK: fnstsw %ax

	fnstsw
	fnstsw %ax
	fnstsw %eax
	fnstsw %al

	// rdar://8431880
	// CHECK: rclb %bl
	// CHECK: rcll 3735928559(%ebx,%ecx,8)
	// CHECK: rcrl %ecx
	// CHECK: rcrl 305419896
	rcl %bl
	rcll 0xdeadbeef(%ebx,%ecx,8)
	rcr %ecx
	rcrl 0x12345678

	rclb %bl // CHECK: rclb %bl # encoding: [0xd0,0xd3]
	rclb $1, %bl // CHECK: rclb %bl # encoding: [0xd0,0xd3]
	rclb $2, %bl // CHECK: rclb $2, %bl # encoding: [0xc0,0xd3,0x02]

	// rdar://8418316
	// PR12173
	// CHECK: shldw %cl, %bx, %dx
	// CHECK: shldw %cl, %bx, %dx
	// CHECK: shldw $1, %bx, %dx
	// CHECK: shldw %cl, %bx, (%rax)
	// CHECK: shldw %cl, %bx, (%rax)
	// CHECK: shrdw %cl, %bx, %dx
	// CHECK: shrdw %cl, %bx, %dx
	// CHECK: shrdw $1, %bx, %dx
	// CHECK: shrdw %cl, %bx, (%rax)
	// CHECK: shrdw %cl, %bx, (%rax)

	shld %bx, %dx
	shld %cl, %bx, %dx
	shld $1, %bx, %dx
	shld %bx, (%rax)
	shld %cl, %bx, (%rax)
	shrd %bx, %dx
	shrd %cl, %bx, %dx
	shrd $1, %bx, %dx
	shrd %bx, (%rax)
	shrd %cl, %bx, (%rax)

	// CHECK: sldtl %ecx
	// CHECK: encoding: [0x0f,0x00,0xc1]
	// CHECK: sldtw %cx
	// CHECK: encoding: [0x66,0x0f,0x00,0xc1]

	sldt %ecx
	sldt %cx

	// CHECK: lcalll *3135175374
	// CHECK: ljmpl *3135175374
	// CHECK: lcalll *(%rax)
	// CHECK: ljmpl *(%rax)
	lcall *0xbadeface
	ljmp *0xbadeface
	lcall *(%rax)
	ljmpl *(%rax)

	// rdar://8444631
	// CHECK: enter $31438, $0
	// CHECK: encoding: [0xc8,0xce,0x7a,0x00]
	// CHECK: enter $31438, $1
	// CHECK: encoding: [0xc8,0xce,0x7a,0x01]
	// CHECK: enter $31438, $127
	// CHECK: encoding: [0xc8,0xce,0x7a,0x7f]
	enter $0x7ace,$0
	enter $0x7ace,$1
	enter $0x7ace,$0x7f


	// rdar://8456364
	// CHECK: movw %cs, %ax
	mov %cs, %ax

	// rdar://8456391
	fcmovb %st(1), %st(0) // CHECK: fcmovb %st(1), %st(0)
	fcmove %st(1), %st(0) // CHECK: fcmove %st(1), %st(0)
	fcmovbe %st(1), %st(0) // CHECK: fcmovbe %st(1), %st(0)
	fcmovu %st(1), %st(0) // CHECK: fcmovu %st(1), %st(0)

	fcmovnb %st(1), %st(0) // CHECK: fcmovnb %st(1), %st(0)
	fcmovne %st(1), %st(0) // CHECK: fcmovne %st(1), %st(0)
	fcmovnbe %st(1), %st(0) // CHECK: fcmovnbe %st(1), %st(0)
	fcmovnu %st(1), %st(0) // CHECK: fcmovnu %st(1), %st(0)

	fcmovnae %st(1), %st(0) // CHECK: fcmovb %st(1), %st(0)
	fcmovna %st(1), %st(0) // CHECK: fcmovbe %st(1), %st(0)

	fcmovae %st(1), %st(0) // CHECK: fcmovnb %st(1), %st(0)
	fcmova %st(1), %st(0) // CHECK: fcmovnbe %st(1), %st(0)

	// rdar://8456417
	.byte (88 + 1) & 15 // CHECK: .byte 9

	// rdar://8456412
	mov %rdx, %cr0
	// CHECK: movq %rdx, %cr0
	// CHECK: encoding: [0x0f,0x22,0xc2]
	mov %rdx, %cr4
	// CHECK: movq %rdx, %cr4
	// CHECK: encoding: [0x0f,0x22,0xe2]
	mov %rdx, %cr8
	// CHECK: movq %rdx, %cr8
	// CHECK: encoding: [0x44,0x0f,0x22,0xc2]
	mov %rdx, %cr15
	// CHECK: movq %rdx, %cr15
	// CHECK: encoding: [0x44,0x0f,0x22,0xfa]
	mov %rdx, %dr15
	// CHECK: movq %rdx, %dr15
	// CHECK: encoding: [0x44,0x0f,0x23,0xfa]
	mov %rdx, %db15
	// CHECK: movq %rdx, %dr15
	// CHECK: encoding: [0x44,0x0f,0x23,0xfa]

	// rdar://8456371 - Handle commutable instructions written backward.
	// CHECK: faddp %st(1)
	// CHECK: fmulp %st(2)
	faddp %st, %st(1)
	fmulp %st, %st(2)

	// rdar://8468087 - Encode these accurately, they are not synonyms.
	// CHECK: fmul %st(0), %st(1)
	// CHECK: encoding: [0xdc,0xc9]
	// CHECK: fmul %st(1)
	// CHECK: encoding: [0xd8,0xc9]
	fmul %st, %st(1)
	fmul %st(1), %st

	// CHECK: fadd %st(0), %st(1)
	// CHECK: encoding: [0xdc,0xc1]
	// CHECK: fadd %st(1)
	// CHECK: encoding: [0xd8,0xc1]
	fadd %st, %st(1)
	fadd %st(1), %st


	// rdar://8416805
	// CHECK: xorb %al, %al
	// CHECK: encoding: [0x30,0xc0]
	// CHECK: xorw %di, %di
	// CHECK: encoding: [0x66,0x31,0xff]
	// CHECK: xorl %esi, %esi
	// CHECK: encoding: [0x31,0xf6]
	// CHECK: xorq %rsi, %rsi
	// CHECK: encoding: [0x48,0x31,0xf6]
	clrb %al
	clr %di
	clr %esi
	clr %rsi

	// rdar://8456378
	cltq // CHECK: cltq
	cdqe // CHECK: cltq
	cwde // CHECK: cwtl
	cwtl // CHECK: cwtl

	// rdar://8416805
	cbw // CHECK: cbtw
	cwd // CHECK: cwtd
	cdq // CHECK: cltd
	cqo // CHECK: cqto

	// rdar://8456378 and PR7557 - fstsw
	fstsw %ax
	// CHECK: wait
	// CHECK: fnstsw
	fstsw (%rax)
	// CHECK: wait
	// CHECK: fnstsw (%rax)

	// PR8259
	fstcw (%rsp)
	// CHECK: wait
	// CHECK: fnstcw (%rsp)

	// PR8259
	fstcw (%rsp)
	// CHECK: wait
	// CHECK: fnstcw (%rsp)

	// PR8258
	finit
	// CHECK: wait
	// CHECK: fninit

	fsave 32493
	// CHECK: wait
	// CHECK: fnsave 32493


	// rdar://8456382 - cvtsd2si support.
	cvtsd2si %xmm1, %rax
	// CHECK: cvtsd2si %xmm1, %rax
	// CHECK: encoding: [0xf2,0x48,0x0f,0x2d,0xc1]
	cvtsd2si %xmm1, %eax
	// CHECK: cvtsd2si %xmm1, %eax
	// CHECK: encoding: [0xf2,0x0f,0x2d,0xc1]

	cvtsd2siq %xmm0, %rax // CHECK: cvtsd2si %xmm0, %rax
	cvtsd2sil %xmm0, %eax // CHECK: cvtsd2si %xmm0, %eax
	cvtsd2si %xmm0, %rax // CHECK: cvtsd2si %xmm0, %rax


	cvttpd2dq %xmm1, %xmm0 // CHECK: cvttpd2dq %xmm1, %xmm0
	cvttpd2dq (%rax), %xmm0 // CHECK: cvttpd2dq (%rax), %xmm0

	cvttps2dq %xmm1, %xmm0 // CHECK: cvttps2dq %xmm1, %xmm0
	cvttps2dq (%rax), %xmm0 // CHECK: cvttps2dq (%rax), %xmm0

	// rdar://8456376 - llvm-mc rejects 'roundss'
	roundss $0xE, %xmm0, %xmm0 // CHECK: encoding: [0x66,0x0f,0x3a,0x0a,0xc0,0x0e]
	roundps $0xE, %xmm0, %xmm0 // CHECK: encoding: [0x66,0x0f,0x3a,0x08,0xc0,0x0e]
	roundsd $0xE, %xmm0, %xmm0 // CHECK: encoding: [0x66,0x0f,0x3a,0x0b,0xc0,0x0e]
	roundpd $0xE, %xmm0, %xmm0 // CHECK: encoding: [0x66,0x0f,0x3a,0x09,0xc0,0x0e]


	// rdar://8482675 - 32-bit mem operand support in 64-bit mode (0x67 prefix)
	leal 8(%eax), %esi
	// CHECK: leal 8(%eax), %esi
	// CHECK: encoding: [0x67,0x8d,0x70,0x08]
	leaq 8(%eax), %rsi
	// CHECK: leaq 8(%eax), %rsi
	// CHECK: encoding: [0x67,0x48,0x8d,0x70,0x08]
	leaq 8(%rax), %rsi
	// CHECK: leaq 8(%rax), %rsi
	// CHECK: encoding: [0x48,0x8d,0x70,0x08]


	cvttpd2dq 0xdeadbeef(%ebx,%ecx,8),%xmm5
	// CHECK: cvttpd2dq 3735928559(%ebx,%ecx,8), %xmm5
	// CHECK: encoding: [0x67,0x66,0x0f,0xe6,0xac,0xcb,0xef,0xbe,0xad,0xde]

	// rdar://8490728 - llvm-mc rejects 'movmskpd'
	movmskpd %xmm6, %rax
	// CHECK: movmskpd %xmm6, %eax
	// CHECK: encoding: [0x66,0x0f,0x50,0xc6]
	movmskpd %xmm6, %eax
	// CHECK: movmskpd %xmm6, %eax
	// CHECK: encoding: [0x66,0x0f,0x50,0xc6]

	// rdar://8491845 - Gas supports commuted forms of non-commutable instructions.
	fdivrp %st(0), %st(1) // CHECK: encoding: [0xde,0xf9]
	fdivrp %st(1), %st(0) // CHECK: encoding: [0xde,0xf9]

	fsubrp %st(0), %st(1) // CHECK: encoding: [0xde,0xe9]
	fsubrp %st(1), %st(0) // CHECK: encoding: [0xde,0xe9]

	// also PR8861
	fdivp %st(0), %st(1) // CHECK: encoding: [0xde,0xf1]
	fdivp %st(1), %st(0) // CHECK: encoding: [0xde,0xf1]


	movl foo(%rip), %eax
	// CHECK: movl foo(%rip), %eax
	// CHECK: encoding: [0x8b,0x05,A,A,A,A]
	// CHECK: fixup A - offset: 2, value: foo-4, kind: reloc_riprel_4byte

	movb $12, foo(%rip)
	// CHECK: movb $12, foo(%rip)
	// CHECK: encoding: [0xc6,0x05,A,A,A,A,0x0c]
	// CHECK: fixup A - offset: 2, value: foo-5, kind: reloc_riprel_4byte

	movw $12, foo(%rip)
	// CHECK: movw $12, foo(%rip)
	// CHECK: encoding: [0x66,0xc7,0x05,A,A,A,A,0x0c,0x00]
	// CHECK: fixup A - offset: 3, value: foo-6, kind: reloc_riprel_4byte

	movl $12, foo(%rip)
	// CHECK: movl $12, foo(%rip)
	// CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
	// CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte

	movq $12, foo(%rip)
	// CHECK: movq $12, foo(%rip)
	// CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
	// CHECK: fixup A - offset: 3, value: foo-8, kind: reloc_riprel_4byte

	movl foo(%eip), %eax
	// CHECK: movl foo(%eip), %eax
	// CHECK: encoding: [0x67,0x8b,0x05,A,A,A,A]
	// CHECK: fixup A - offset: 3, value: foo-4, kind: reloc_riprel_4byte

	movb $12, foo(%eip)
	// CHECK: movb $12, foo(%eip)
	// CHECK: encoding: [0x67,0xc6,0x05,A,A,A,A,0x0c]
	// CHECK: fixup A - offset: 3, value: foo-5, kind: reloc_riprel_4byte

	movw $12, foo(%eip)
	// CHECK: movw $12, foo(%eip)
	// CHECK: encoding: [0x67,0x66,0xc7,0x05,A,A,A,A,0x0c,0x00]
	// CHECK: fixup A - offset: 4, value: foo-6, kind: reloc_riprel_4byte

	movl $12, foo(%eip)
	// CHECK: movl $12, foo(%eip)
	// CHECK: encoding: [0x67,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
	// CHECK: fixup A - offset: 3, value: foo-8, kind: reloc_riprel_4byte

	movq $12, foo(%eip)
	// CHECK: movq $12, foo(%eip)
	// CHECK: encoding: [0x67,0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
	// CHECK: fixup A - offset: 4, value: foo-8, kind: reloc_riprel_4byte

	// CHECK: addq $-424, %rax
	// CHECK: encoding: [0x48,0x05,0x58,0xfe,0xff,0xff]
	addq $-424, %rax


	// CHECK: movq _foo@GOTPCREL(%rip), %rax
	// CHECK: encoding: [0x48,0x8b,0x05,A,A,A,A]
	// CHECK: fixup A - offset: 3, value: _foo@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	movq _foo@GOTPCREL(%rip), %rax

	// CHECK: movq _foo@GOTPCREL(%rip), %r14
	// CHECK: encoding: [0x4c,0x8b,0x35,A,A,A,A]
	// CHECK: fixup A - offset: 3, value: _foo@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	movq _foo@GOTPCREL(%rip), %r14

	// CHECK: movq _foo@GOTPCREL(%eip), %rax
	// CHECK: encoding: [0x67,0x48,0x8b,0x05,A,A,A,A]
	// CHECK: fixup A - offset: 4, value: _foo@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	movq _foo@GOTPCREL(%eip), %rax

	// CHECK: movq _foo@GOTPCREL(%eip), %r14
	// CHECK: encoding: [0x67,0x4c,0x8b,0x35,A,A,A,A]
	// CHECK: fixup A - offset: 4, value: _foo@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	movq _foo@GOTPCREL(%eip), %r14

	// CHECK: movq (%r13,%rax,8), %r13
	// CHECK: encoding: [0x4d,0x8b,0x6c,0xc5,0x00]
	movq 0x00(%r13,%rax,8),%r13

	// CHECK: testq %rax, %rbx
	// CHECK: encoding: [0x48,0x85,0xc3]
	testq %rax, %rbx

	// CHECK: cmpq %rbx, %r14
	// CHECK: encoding: [0x49,0x39,0xde]
	cmpq %rbx, %r14

	// rdar://7947167

	movsq
	// CHECK: movsq
	// CHECK: encoding: [0x48,0xa5]

	movsl
	// CHECK: movsl
	// CHECK: encoding: [0xa5]

	stosq
	// CHECK: stosq
	// CHECK: encoding: [0x48,0xab]
	stosl
	// CHECK: stosl
	// CHECK: encoding: [0xab]


	// Not moffset forms of moves, they are x86-32 only! rdar://7947184
	movb 0, %al // CHECK: movb 0, %al # encoding: [0x8a,0x04,0x25,0x00,0x00,0x00,0x00]
	movw 0, %ax // CHECK: movw 0, %ax # encoding: [0x66,0x8b,0x04,0x25,0x00,0x00,0x00,0x00]
	movl 0, %eax // CHECK: movl 0, %eax # encoding: [0x8b,0x04,0x25,0x00,0x00,0x00,0x00]

	// CHECK: pushfq # encoding: [0x9c]
	pushf
	// CHECK: pushfq # encoding: [0x9c]
	pushfq
	// CHECK: popfq # encoding: [0x9d]
	popf
	// CHECK: popfq # encoding: [0x9d]
	popfq

	// CHECK: movabsq $-281474976710654, %rax
	// CHECK: encoding: [0x48,0xb8,0x02,0x00,0x00,0x00,0x00,0x00,0xff,0xff]
	movabsq $0xFFFF000000000002, %rax

	// CHECK: movabsq $-281474976710654, %rax
	// CHECK: encoding: [0x48,0xb8,0x02,0x00,0x00,0x00,0x00,0x00,0xff,0xff]
	movq $0xFFFF000000000002, %rax

	// CHECK: movq $-65536, %rax
	// CHECK: encoding: [0x48,0xc7,0xc0,0x00,0x00,0xff,0xff]
	movq $0xFFFFFFFFFFFF0000, %rax

	// CHECK: movq $-256, %rax
	// CHECK: encoding: [0x48,0xc7,0xc0,0x00,0xff,0xff,0xff]
	movq $0xFFFFFFFFFFFFFF00, %rax

	// CHECK: movq $10, %rax
	// CHECK: encoding: [0x48,0xc7,0xc0,0x0a,0x00,0x00,0x00]
	movq $10, %rax

	// CHECK: movabsb -6066930261531658096, %al
	// CHECK: encoding: [0xa0,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsb 0xabcdef1234567890,%al

	// CHECK: movabsw -6066930261531658096, %ax
	// CHECK: encoding: [0x66,0xa1,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsw 0xabcdef1234567890,%ax

	// CHECK: movabsl -6066930261531658096, %eax
	// CHECK: encoding: [0xa1,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsl 0xabcdef1234567890,%eax

	// CHECK: movabsq -6066930261531658096, %rax
	// CHECK: encoding: [0x48,0xa1,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsq 0xabcdef1234567890, %rax

	// CHECK: movabsb %al, -6066930261531658096
	// CHECK: encoding: [0xa2,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsb %al,0xabcdef1234567890

	// CHECK: movabsw %ax, -6066930261531658096
	// CHECK: encoding: [0x66,0xa3,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsw %ax,0xabcdef1234567890

	// CHECK: movabsl %eax, -6066930261531658096
	// CHECK: encoding: [0xa3,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsl %eax,0xabcdef1234567890

	// CHECK: movabsq %rax, -6066930261531658096
	// CHECK: encoding: [0x48,0xa3,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab]
	movabsq %rax,0xabcdef1234567890

	// rdar://8014869
	//
	// CHECK: ret
	// CHECK: encoding: [0xc3]
	retq

	// CHECK: sete %al
	// CHECK: encoding: [0x0f,0x94,0xc0]
	setz %al

	// CHECK: setne %al
	// CHECK: encoding: [0x0f,0x95,0xc0]
	setnz %al

	// CHECK: je 0
	// CHECK: encoding: [0x74,A]
	jz 0

	// CHECK: jne
	// CHECK: encoding: [0x75,A]
	jnz 0

	// PR9264
	btl $1, 0 // CHECK: btl $1, 0 # encoding: [0x0f,0xba,0x24,0x25,0x00,0x00,0x00,0x00,0x01]
	bt $1, 0 // CHECK: btl $1, 0 # encoding: [0x0f,0xba,0x24,0x25,0x00,0x00,0x00,0x00,0x01]

	// rdar://8017515
	btq $0x01,%rdx
	// CHECK: btq $1, %rdx
	// CHECK: encoding: [0x48,0x0f,0xba,0xe2,0x01]

	//rdar://8017633
	// CHECK: movzbl %al, %esi
	// CHECK: encoding: [0x0f,0xb6,0xf0]
	movzx %al, %esi

	// CHECK: movzbq %al, %rsi
	// CHECK: encoding: [0x48,0x0f,0xb6,0xf0]
	movzx %al, %rsi

	// CHECK: movsbw %al, %ax
	// CHECK: encoding: [0x66,0x0f,0xbe,0xc0]
	movsx %al, %ax

	// CHECK: movsbl %al, %eax
	// CHECK: encoding: [0x0f,0xbe,0xc0]
	movsx %al, %eax

	// CHECK: movswl %ax, %eax
	// CHECK: encoding: [0x0f,0xbf,0xc0]
	movsx %ax, %eax

	// CHECK: movsbq %bl, %rax
	// CHECK: encoding: [0x48,0x0f,0xbe,0xc3]
	movsx %bl, %rax

	// CHECK: movswq %cx, %rax
	// CHECK: encoding: [0x48,0x0f,0xbf,0xc1]
	movsx %cx, %rax

	// CHECK: movslq %edi, %rax
	// CHECK: encoding: [0x48,0x63,0xc7]
	movsx %edi, %rax

	// CHECK: movzbw %al, %ax
	// CHECK: encoding: [0x66,0x0f,0xb6,0xc0]
	movzx %al, %ax

	// CHECK: movzbl %al, %eax
	// CHECK: encoding: [0x0f,0xb6,0xc0]
	movzx %al, %eax

	// CHECK: movzwl %ax, %eax
	// CHECK: encoding: [0x0f,0xb7,0xc0]
	movzx %ax, %eax

	// CHECK: movzbq %bl, %rax
	// CHECK: encoding: [0x48,0x0f,0xb6,0xc3]
	movzx %bl, %rax

	// CHECK: movzwq %cx, %rax
	// CHECK: encoding: [0x48,0x0f,0xb7,0xc1]
	movzx %cx, %rax

	// CHECK: movsbw (%rax), %ax
	// CHECK: encoding: [0x66,0x0f,0xbe,0x00]
	movsx (%rax), %ax

	// CHECK: movzbw (%rax), %ax
	// CHECK: encoding: [0x66,0x0f,0xb6,0x00]
	movzx (%rax), %ax


	// rdar://7873482
	// CHECK: [0x65,0x8b,0x04,0x25,0x7c,0x00,0x00,0x00]
	movl %gs:124, %eax

	// CHECK: jmpq *8(%rax)
	// CHECK: encoding: [0xff,0x60,0x08]
	jmp *8(%rax)

	// CHECK: btq $61, -216(%rbp)
	// CHECK: encoding: [0x48,0x0f,0xba,0xa5,0x28,0xff,0xff,0xff,0x3d]
	btq $61, -216(%rbp)


	// rdar://8061602
	L1:
	jecxz L1
	// CHECK: jecxz L1
	// CHECK: encoding: [0x67,0xe3,A]
	jrcxz L1
	// CHECK: jrcxz L1
	// CHECK: encoding: [0xe3,A]

	// PR8061
	xchgl 368(%rax),%ecx
	// CHECK: xchgl %ecx, 368(%rax)
	xchgl %ecx, 368(%rax)
	// CHECK: xchgl %ecx, 368(%rax)

	// rdar://8407548
	xchg 0xdeadbeef(%rbx,%rcx,8),%bl
	// CHECK: xchgb %bl, 3735928559(%rbx,%rcx,8)



	// PR7254
	lock incl 1(%rsp)
	// CHECK: lock
	// CHECK: incl 1(%rsp)

	// rdar://8741045
	lock/incl 1(%rsp)
	// CHECK: lock
	// CHECK: incl 1(%rsp)


	lock addq %rsi, (%rdi)
	// CHECK: lock
	// CHECK: addq %rsi, (%rdi)
	// CHECK: encoding: [0xf0,0x48,0x01,0x37]

	lock subq %rsi, (%rdi)
	// CHECK: lock
	// CHECK: subq %rsi, (%rdi)
	// CHECK: encoding: [0xf0,0x48,0x29,0x37]

	lock andq %rsi, (%rdi)
	// CHECK: lock
	// CHECK: andq %rsi, (%rdi)
	// CHECK: encoding: [0xf0,0x48,0x21,0x37]

	lock orq %rsi, (%rdi)
	// CHECK: lock
	// CHECK: orq %rsi, (%rdi)
	// CHECK: encoding: [0xf0,0x48,0x09,0x37]

	lock xorq %rsi, (%rdi)
	// CHECK: lock
	// CHECK: xorq %rsi, (%rdi)
	// CHECK: encoding: [0xf0,0x48,0x31,0x37]

	xacquire lock addq %rax, (%rax)
	// CHECK: xacquire
	// CHECK: encoding: [0xf2]
	// CHECK: lock
	// CHECK: addq %rax, (%rax)
	// CHECK: encoding: [0xf0,0x48,0x01,0x00]

	xrelease lock addq %rax, (%rax)
	// CHECK: xrelease
	// CHECK: encoding: [0xf3]
	// CHECK: lock
	// CHECK: addq %rax, (%rax)
	// CHECK: encoding: [0xf0,0x48,0x01,0x00]

	// rdar://8033482
	rep movsl
	// CHECK: rep
	// CHECK: movsl
	// CHECK: encoding: [0xf3,0xa5]


	// rdar://8403974
	iret
	// CHECK: iretl
	// CHECK: encoding: [0xcf]
	iretw
	// CHECK: iretw
	// CHECK: encoding: [0x66,0xcf]
	iretl
	// CHECK: iretl
	// CHECK: encoding: [0xcf]
	iretq
	// CHECK: iretq
	// CHECK: encoding: [0x48,0xcf]

	// rdar://8416805
	// CHECK: retw $31438
	// CHECK: encoding: [0x66,0xc2,0xce,0x7a]
	retw $0x7ace

	// CHECK: lretw $31438
	// CHECK: encoding: [0x66,0xca,0xce,0x7a]
	lretw $0x7ace

	// PR8592
	lretq // CHECK: lretq # encoding: [0x48,0xcb]
	lretl // CHECK: lretl # encoding: [0xcb]
	lret // CHECK: lretl # encoding: [0xcb]
	lretw // CHECK: lretw # encoding: [0x66,0xcb]

	// rdar://8403907
	sysret
	// CHECK: sysretl
	// CHECK: encoding: [0x0f,0x07]
	sysretl
	// CHECK: sysretl
	// CHECK: encoding: [0x0f,0x07]
	sysretq
	// CHECK: sysretq
	// CHECK: encoding: [0x48,0x0f,0x07]

	// rdar://8407242
	push %fs
	// CHECK: pushq %fs
	// CHECK: encoding: [0x0f,0xa0]
	push %gs
	// CHECK: pushq %gs
	// CHECK: encoding: [0x0f,0xa8]

	pushw %fs
	// CHECK: pushw %fs
	// CHECK: encoding: [0x66,0x0f,0xa0]
	pushw %gs
	// CHECK: pushw %gs
	// CHECK: encoding: [0x66,0x0f,0xa8]


	pop %fs
	// CHECK: popq %fs
	// CHECK: encoding: [0x0f,0xa1]
	pop %gs
	// CHECK: popq %gs
	// CHECK: encoding: [0x0f,0xa9]

	popw %fs
	// CHECK: popw %fs
	// CHECK: encoding: [0x66,0x0f,0xa1]
	popw %gs
	// CHECK: popw %gs
	// CHECK: encoding: [0x66,0x0f,0xa9]

	// rdar://8438816
	fildq -8(%rsp)
	fildll -8(%rsp)
	// CHECK: fildll -8(%rsp)
	// CHECK: encoding: [0xdf,0x6c,0x24,0xf8]
	// CHECK: fildll -8(%rsp)
	// CHECK: encoding: [0xdf,0x6c,0x24,0xf8]

	// CHECK: callq a
	callq a

	// CHECK: leaq -40(%rbp), %r15
	leaq -40(%rbp), %r15



	// rdar://8013734 - Alias dr6=db6
	mov %dr6, %rax
	mov %db6, %rax
	// CHECK: movq %dr6, %rax
	// CHECK: movq %dr6, %rax


	// INC/DEC encodings.
	incb %al // CHECK: incb %al # encoding: [0xfe,0xc0]
	incw %ax // CHECK: incw %ax # encoding: [0x66,0xff,0xc0]
	incl %eax // CHECK: incl %eax # encoding: [0xff,0xc0]
	decb %al // CHECK: decb %al # encoding: [0xfe,0xc8]
	decw %ax // CHECK: decw %ax # encoding: [0x66,0xff,0xc8]
	decl %eax // CHECK: decl %eax # encoding: [0xff,0xc8]

	// rdar://8416805
	// CHECK: lgdtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x50,0x04]
	lgdt 4(%rax)

	// CHECK: lgdtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x50,0x04]
	lgdtq 4(%rax)

	// CHECK: lidtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x58,0x04]
	lidt 4(%rax)

	// CHECK: lidtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x58,0x04]
	lidtq 4(%rax)

	// CHECK: sgdtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x40,0x04]
	sgdt 4(%rax)

	// CHECK: sgdtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x40,0x04]
	sgdtq 4(%rax)

	// CHECK: sidtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x48,0x04]
	sidt 4(%rax)

	// CHECK: sidtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x48,0x04]
	sidtq 4(%rax)


	// rdar://8208615
	mov (%rsi), %gs // CHECK: movw (%rsi), %gs # encoding: [0x8e,0x2e]
	mov %gs, (%rsi) // CHECK: movw %gs, (%rsi) # encoding: [0x8c,0x2e]


	// rdar://8431864
	//CHECK: divb %bl
	//CHECK: divw %bx
	//CHECK: divl %ecx
	//CHECK: divl 3735928559(%ebx,%ecx,8)
	//CHECK: divl 69
	//CHECK: divl 32493
	//CHECK: divl 3133065982
	//CHECK: divl 305419896
	//CHECK: idivb %bl
	//CHECK: idivw %bx
	//CHECK: idivl %ecx
	//CHECK: idivl 3735928559(%ebx,%ecx,8)
	//CHECK: idivl 69
	//CHECK: idivl 32493
	//CHECK: idivl 3133065982
	//CHECK: idivl 305419896
	div %bl,%al
	div %bx,%ax
	div %ecx,%eax
	div 0xdeadbeef(%ebx,%ecx,8),%eax
	div 0x45,%eax
	div 0x7eed,%eax
	div 0xbabecafe,%eax
	div 0x12345678,%eax
	idiv %bl,%al
	idiv %bx,%ax
	idiv %ecx,%eax
	idiv 0xdeadbeef(%ebx,%ecx,8),%eax
	idiv 0x45,%eax
	idiv 0x7eed,%eax
	idiv 0xbabecafe,%eax
	idiv 0x12345678,%eax

	// PR8524
	movd %rax, %mm5 // CHECK: movd %rax, %mm5 # encoding: [0x48,0x0f,0x6e,0xe8]
	movd %mm5, %rbx // CHECK: movd %mm5, %rbx # encoding: [0x48,0x0f,0x7e,0xeb]
	movq %rax, %mm5 // CHECK: movd %rax, %mm5 # encoding: [0x48,0x0f,0x6e,0xe8]
	movq %mm5, %rbx // CHECK: movd %mm5, %rbx # encoding: [0x48,0x0f,0x7e,0xeb]

	rex64 // CHECK: rex64 # encoding: [0x48]
	data16 // CHECK: data16 # encoding: [0x66]

	// CHECK: data16
	// CHECK: encoding: [0x66]
	// CHECK: lgdtq 4(%rax)
	// CHECK: encoding: [0x0f,0x01,0x50,0x04]
	data16 lgdt 4(%rax)

	// PR8855
	movq 18446744073709551615,%rbx // CHECK: movq -1, %rbx

	// PR8946
	movdqu %xmm0, %xmm1 // CHECK: movdqu %xmm0, %xmm1 # encoding: [0xf3,0x0f,0x6f,0xc8]

	// PR8935
	xgetbv // CHECK: xgetbv # encoding: [0x0f,0x01,0xd0]
	xsetbv // CHECK: xsetbv # encoding: [0x0f,0x01,0xd1]

	// CHECK: loope 0
	// CHECK: encoding: [0xe1,A]
	loopz 0

	// CHECK: loopne 0
	// CHECK: encoding: [0xe0,A]
	loopnz 0

	// CHECK: outsb (%rsi), %dx # encoding: [0x6e]
	// CHECK: outsb
	// CHECK: outsb
	outsb
	outsb %ds:(%rsi), %dx
	outsb (%rsi), %dx

	// CHECK: outsw (%rsi), %dx # encoding: [0x66,0x6f]
	// CHECK: outsw
	// CHECK: outsw
	outsw
	outsw %ds:(%rsi), %dx
	outsw (%rsi), %dx

	// CHECK: outsl (%rsi), %dx # encoding: [0x6f]
	// CHECK: outsl
	outsl
	outsl %ds:(%rsi), %dx
	outsl (%rsi), %dx

	// CHECK: insb %dx, %es:(%rdi) # encoding: [0x6c]
	// CHECK: insb
	insb
	insb %dx, %es:(%rdi)

	// CHECK: insw %dx, %es:(%rdi) # encoding: [0x66,0x6d]
	// CHECK: insw
	insw
	insw %dx, %es:(%rdi)

	// CHECK: insl %dx, %es:(%rdi) # encoding: [0x6d]
	// CHECK: insl
	insl
	insl %dx, %es:(%rdi)

	// CHECK: movsb (%rsi), %es:(%rdi) # encoding: [0xa4]
	// CHECK: movsb
	// CHECK: movsb
	movsb
	movsb %ds:(%rsi), %es:(%rdi)
	movsb (%rsi), %es:(%rdi)

	// CHECK: movsw (%rsi), %es:(%rdi) # encoding: [0x66,0xa5]
	// CHECK: movsw
	// CHECK: movsw
	movsw
	movsw %ds:(%rsi), %es:(%rdi)
	movsw (%rsi), %es:(%rdi)

	// CHECK: movsl (%rsi), %es:(%rdi) # encoding: [0xa5]
	// CHECK: movsl
	// CHECK: movsl
	movsl
	movsl %ds:(%rsi), %es:(%rdi)
	movsl (%rsi), %es:(%rdi)
	// rdar://10883092
	// CHECK: movsl
	movsl (%rsi), (%rdi)

	// CHECK: movsq (%rsi), %es:(%rdi) # encoding: [0x48,0xa5]
	// CHECK: movsq
	// CHECK: movsq
	movsq
	movsq %ds:(%rsi), %es:(%rdi)
	movsq (%rsi), %es:(%rdi)

	// CHECK: lodsb (%rsi), %al # encoding: [0xac]
	// CHECK: lodsb
	// CHECK: lodsb
	// CHECK: lodsb
	// CHECK: lodsb
	lodsb
	lodsb %ds:(%rsi), %al
	lodsb (%rsi), %al
	lods %ds:(%rsi), %al
	lods (%rsi), %al

	// CHECK: lodsw (%rsi), %ax # encoding: [0x66,0xad]
	// CHECK: lodsw
	// CHECK: lodsw
	// CHECK: lodsw
	// CHECK: lodsw
	lodsw
	lodsw %ds:(%rsi), %ax
	lodsw (%rsi), %ax
	lods %ds:(%rsi), %ax
	lods (%rsi), %ax

	// CHECK: lodsl (%rsi), %eax # encoding: [0xad]
	// CHECK: lodsl
	// CHECK: lodsl
	// CHECK: lodsl
	// CHECK: lodsl
	lodsl
	lodsl %ds:(%rsi), %eax
	lodsl (%rsi), %eax
	lods %ds:(%rsi), %eax
	lods (%rsi), %eax

	// CHECK: lodsq (%rsi), %rax # encoding: [0x48,0xad]
	// CHECK: lodsq
	// CHECK: lodsq
	// CHECK: lodsq
	// CHECK: lodsq
	lodsq
	lodsq %ds:(%rsi), %rax
	lodsq (%rsi), %rax
	lods %ds:(%rsi), %rax
	lods (%rsi), %rax

	// CHECK: stosb %al, %es:(%rdi) # encoding: [0xaa]
	// CHECK: stosb
	// CHECK: stosb
	stosb
	stosb %al, %es:(%rdi)
	stos %al, %es:(%rdi)

	// CHECK: stosw %ax, %es:(%rdi) # encoding: [0x66,0xab]
	// CHECK: stosw
	// CHECK: stosw
	stosw
	stosw %ax, %es:(%rdi)
	stos %ax, %es:(%rdi)

	// CHECK: stosl %eax, %es:(%rdi) # encoding: [0xab]
	// CHECK: stosl
	// CHECK: stosl
	stosl
	stosl %eax, %es:(%rdi)
	stos %eax, %es:(%rdi)

	// CHECK: stosq %rax, %es:(%rdi) # encoding: [0x48,0xab]
	// CHECK: stosq
	// CHECK: stosq
	stosq
	stosq %rax, %es:(%rdi)
	stos %rax, %es:(%rdi)

	// CHECK: strw
	// CHECK: encoding: [0x66,0x0f,0x00,0xc8]
	str %ax

	// CHECK: strl
	// CHECK: encoding: [0x0f,0x00,0xc8]
	str %eax

	// CHECK: strw
	// CHECK: encoding: [0x66,0x0f,0x00,0xc8]
	str %ax

	// CHECK: strq
	// CHECK: encoding: [0x48,0x0f,0x00,0xc8]
	str %rax

	// CHECK: movq %rdi, %xmm0
	// CHECK: encoding: [0x66,0x48,0x0f,0x6e,0xc7]
	movq %rdi,%xmm0

	// CHECK: movq %xmm0, %rax
	// CHECK: encoding: [0x66,0x48,0x0f,0x7e,0xc0]
	movq %xmm0, %rax

	// CHECK: movntil %eax, (%rdi)
	// CHECK: encoding: [0x0f,0xc3,0x07]
	// CHECK: movntil
	movntil %eax, (%rdi)
	movnti %eax, (%rdi)

	// CHECK: movntiq %rax, (%rdi)
	// CHECK: encoding: [0x48,0x0f,0xc3,0x07]
	// CHECK: movntiq
	movntiq %rax, (%rdi)
	movnti %rax, (%rdi)

	// CHECK: pclmulqdq $17, %xmm0, %xmm1
	// CHECK: encoding: [0x66,0x0f,0x3a,0x44,0xc8,0x11]
	pclmulhqhqdq %xmm0, %xmm1

	// CHECK: pclmulqdq $1, %xmm0, %xmm1
	// CHECK: encoding: [0x66,0x0f,0x3a,0x44,0xc8,0x01]
	pclmulqdq $1, %xmm0, %xmm1

	// CHECK: pclmulqdq $16, (%rdi), %xmm1
	// CHECK: encoding: [0x66,0x0f,0x3a,0x44,0x0f,0x10]
	pclmullqhqdq (%rdi), %xmm1

	// CHECK: pclmulqdq $0, (%rdi), %xmm1
	// CHECK: encoding: [0x66,0x0f,0x3a,0x44,0x0f,0x00]
	pclmulqdq $0, (%rdi), %xmm1

	// PR10345
	// CHECK: xchgq %rax, %rax
	// CHECK: encoding: [0x48,0x90]
	xchgq %rax, %rax

	// CHECK: xchgl %eax, %eax
	// CHECK: encoding: [0x87,0xc0]
	xchgl %eax, %eax

	// CHECK: xchgw %ax, %ax
	// CHECK: encoding: [0x66,0x90]
	xchgw %ax, %ax

	// CHECK: xchgl %ecx, %eax
	// CHECK: encoding: [0x91]
	xchgl %ecx, %eax

	// CHECK: xchgl %ecx, %eax
	// CHECK: encoding: [0x91]
	xchgl %eax, %ecx

	// CHECK: sysexit
	// CHECK: encoding: [0x0f,0x35]
	sysexit

	// CHECK: sysexitl
	// CHECK: encoding: [0x0f,0x35]
	sysexitl

	// CHECK: sysexitq
	// CHECK: encoding: [0x48,0x0f,0x35]
	sysexitq

	// CHECK: clac
	// CHECK: encoding: [0x0f,0x01,0xca]
	clac

	// CHECK: stac
	// CHECK: encoding: [0x0f,0x01,0xcb]
	stac

	// CHECK: faddp %st(1)
	// CHECK: fmulp %st(1)
	// CHECK: fsubp %st(1)
	// CHECK: fsubrp %st(1)
	// CHECK: fdivp %st(1)
	// CHECK: fdivrp %st(1)
	faddp %st(0), %st(1)
	fmulp %st(0), %st(1)
	fsubp %st(0), %st(1)
	fsubrp %st(0), %st(1)
	fdivp %st(0), %st(1)
	fdivrp %st(0), %st(1)

	// CHECK: faddp %st(1)
	// CHECK: fmulp %st(1)
	// CHECK: fsubp %st(1)
	// CHECK: fsubrp %st(1)
	// CHECK: fdivp %st(1)
	// CHECK: fdivrp %st(1)
	faddp %st(1), %st(0)
	fmulp %st(1), %st(0)
	fsubp %st(1), %st(0)
	fsubrp %st(1), %st(0)
	fdivp %st(1), %st(0)
	fdivrp %st(1), %st(0)

	// CHECK: faddp %st(1)
	// CHECK: fmulp %st(1)
	// CHECK: fsubp %st(1)
	// CHECK: fsubrp %st(1)
	// CHECK: fdivp %st(1)
	// CHECK: fdivrp %st(1)
	faddp %st(1)
	fmulp %st(1)
	fsubp %st(1)
	fsubrp %st(1)
	fdivp %st(1)
	fdivrp %st(1)

	// CHECK: faddp %st(1)
	// CHECK: fmulp %st(1)
	// CHECK: fsubp %st(1)
	// CHECK: fsubrp %st(1)
	// CHECK: fdivp %st(1)
	// CHECK: fdivrp %st(1)
	faddp
	fmulp
	fsubp
	fsubrp
	fdivp
	fdivrp

	// CHECK: fadd %st(1)
	// CHECK: fmul %st(1)
	// CHECK: fsub %st(1)
	// CHECK: fsubr %st(1)
	// CHECK: fdiv %st(1)
	// CHECK: fdivr %st(1)
	fadd %st(1), %st(0)
	fmul %st(1), %st(0)
	fsub %st(1), %st(0)
	fsubr %st(1), %st(0)
	fdiv %st(1), %st(0)
	fdivr %st(1), %st(0)

	// CHECK: fadd %st(0), %st(1)
	// CHECK: fmul %st(0), %st(1)
	// CHECK: fsub %st(0), %st(1)
	// CHECK: fsubr %st(0), %st(1)
	// CHECK: fdiv %st(0), %st(1)
	// CHECK: fdivr %st(0), %st(1)
	fadd %st(0), %st(1)
	fmul %st(0), %st(1)
	fsub %st(0), %st(1)
	fsubr %st(0), %st(1)
	fdiv %st(0), %st(1)
	fdivr %st(0), %st(1)

	// CHECK: fadd %st(1)
	// CHECK: fmul %st(1)
	// CHECK: fsub %st(1)
	// CHECK: fsubr %st(1)
	// CHECK: fdiv %st(1)
	// CHECK: fdivr %st(1)
	fadd %st(1)
	fmul %st(1)
	fsub %st(1)
	fsubr %st(1)
	fdiv %st(1)
	fdivr %st(1)

	// CHECK: movd %xmm0, %eax
	// CHECK: movq %xmm0, %rax
	// CHECK: movq %xmm0, %rax
	// CHECK: vmovd %xmm0, %eax
	// CHECK: vmovq %xmm0, %rax
	// CHECK: vmovq %xmm0, %rax
	movd %xmm0, %eax
	movq %xmm0, %rax
	movq %xmm0, %rax
	vmovd %xmm0, %eax
	vmovd %xmm0, %rax
	vmovq %xmm0, %rax

	// CHECK: seto 3735928559(%r10,%r9,8)
	// CHECK: encoding: [0x43,0x0f,0x90,0x84,0xca,0xef,0xbe,0xad,0xde]
	seto 0xdeadbeef(%r10,%r9,8)

	// CHECK: monitorx
	// CHECK: encoding: [0x0f,0x01,0xfa]
	monitorx

	// CHECK: monitorx
	// CHECK: encoding: [0x0f,0x01,0xfa]
	monitorx %rax, %rcx, %rdx

	// CHECK: mwaitx
	// CHECK: encoding: [0x0f,0x01,0xfb]
	mwaitx

	// CHECK: mwaitx
	// CHECK: encoding: [0x0f,0x01,0xfb]
	mwaitx %rax, %rcx, %rbx

	// CHECK: clzero
	// CHECK: encoding: [0x0f,0x01,0xfc]
	clzero

	// CHECK: clzero
	// CHECK: encoding: [0x0f,0x01,0xfc]
	clzero %rax

	// CHECK: movl %r15d, (%r15,%r15)
	// CHECK: encoding: [0x47,0x89,0x3c,0x3f]
	movl %r15d, (%r15,%r15)

	// CHECK: nopq 3735928559(%rbx,%rcx,8)
	// CHECK: encoding: [0x48,0x0f,0x1f,0x84,0xcb,0xef,0xbe,0xad,0xde]
	nopq 0xdeadbeef(%rbx,%rcx,8)

	// CHECK: nopq %rax
	// CHECK: encoding: [0x48,0x0f,0x1f,0xc0]
	nopq %rax

	// CHECK: rdpid %rax
	// CHECK: encoding: [0xf3,0x0f,0xc7,0xf8]
	rdpid %rax

	// CHECK: ptwritel 3735928559(%rbx,%rcx,8)
	// CHECK: encoding: [0xf3,0x0f,0xae,0xa4,0xcb,0xef,0xbe,0xad,0xde]
	ptwritel 0xdeadbeef(%rbx,%rcx,8)

	// CHECK: ptwritel %eax
	// CHECK: encoding: [0xf3,0x0f,0xae,0xe0]
	ptwritel %eax

	// CHECK: ptwriteq 3735928559(%rbx,%rcx,8)
	// CHECK: encoding: [0xf3,0x48,0x0f,0xae,0xa4,0xcb,0xef,0xbe,0xad,0xde]
	ptwriteq 0xdeadbeef(%rbx,%rcx,8)

	// CHECK: ptwriteq %rax
	// CHECK: encoding: [0xf3,0x48,0x0f,0xae,0xe0]
	ptwriteq %rax
	+
	+// __asm __volatile(
	+// "pushf \n\t"
	+// "popf \n\t"
	+// "rep \n\t"
	+// ".byte 0x0f, 0xa7, 0xd0"
	+// );
	+// CHECK: pushfq
	+// CHECK-NEXT: popfq
	+// CHECK-NEXT: rep
	+// CHECK-NEXT: .byte 15
	+// CHECK-NEXT: .byte 167
	+// CHECK-NEXT: .byte 208
	+pushfq
	+popfq
	+rep
	+.byte 15
	+.byte 167
	+.byte 208
	+
	+// CHECK: lock
	+// CHECK: cmpxchgl
	+ cmp $0, %edx
	+ je 1f
	+ lock
	+1: cmpxchgl %ecx,(%rdi)
	+
	+// CHECK: rep
	+// CHECK-NEXT: byte
	+rep
	+.byte 0xa4 # movsb
	+
	+// CHECK: lock
	+// This line has to be the last one in the file
	+lock
	Index: vendor/llvm/dist-release_60/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll (revision 328362)
	@@ -0,0 +1,46 @@
	+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	+target triple = "x86_64-scei-ps4"
	+
	+%struct.CFVS = type { %struct.Vec }
	+%struct.Vec = type { i8 }
	+%struct.S = type { i8 }
	+
	+define void @_ZN4CFVSD2Ev(%struct.CFVS* %this) unnamed_addr align 2 !dbg !8 {
	+entry:
	+ %this.addr = alloca %struct.CFVS*, align 8
	+ store %struct.CFVS* %this, %struct.CFVS** %this.addr, align 8
	+ %this1 = load %struct.CFVS*, %struct.CFVS** %this.addr, align 8
	+ %m_val = getelementptr inbounds %struct.CFVS, %struct.CFVS* %this1, i32 0, i32 0
	+ ret void
	+}
	+
	+declare dereferenceable(1) %struct.S* @_Z3Getv()
	+
	+!llvm.dbg.cu = !{!0}
	+!llvm.module.flags = !{!3, !4, !5, !6}
	+
	+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
	+!1 = !DIFile(filename: "bz188598-b.cpp", directory: "")
	+!2 = !{}
	+!3 = !{i32 2, !"Dwarf Version", i32 4}
	+!4 = !{i32 2, !"Debug Info Version", i32 3}
	+!5 = !{i32 1, !"wchar_size", i32 2}
	+!6 = !{i32 7, !"PIC Level", i32 2}
	+!8 = distinct !DISubprogram(name: "~CFVS", linkageName: "_ZN4CFVSD2Ev", scope: !9, file: !1, line: 2, type: !28, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !27, variables: !2)
	+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !10, line: 7, size: 8, elements: !11, identifier: "_ZTS4CFVS")
	+!10 = !DIFile(filename: "./bz188598.h", directory: "")
	+!11 = !{!12, !27}
	+!12 = !DIDerivedType(tag: DW_TAG_member, name: "m_val", scope: !9, file: !10, line: 9, baseType: !13, size: 8)
	+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Vec<&Get>", file: !10, line: 4, size: 8, elements: !14, templateParams: !19, identifier: "_ZTS3VecIXadL_Z3GetvEEE")
	+!14 = !{!35}
	+!19 = !{!20}
	+!20 = !DITemplateValueParameter(name: "F", type: !21, value: %struct.S* ()* @_Z3Getv)
	+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64)
	+!22 = !DIDerivedType(tag: DW_TAG_typedef, name: "Func", file: !10, line: 2, baseType: !23)
	+!23 = !DISubroutineType(types: !24)
	+!24 = !{!35}
	+!27 = !DISubprogram(name: "~CFVS", scope: !9, file: !10, line: 8, type: !28, isLocal: false, isDefinition: false, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false)
	+!28 = !DISubroutineType(types: !29)
	+!29 = !{null, !30}
	+!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
	+!35 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
	Index: vendor/llvm/dist-release_60/test/ThinLTO/X86/dicompositetype-unique2.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/ThinLTO/X86/dicompositetype-unique2.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/ThinLTO/X86/dicompositetype-unique2.ll (revision 328362)
	@@ -0,0 +1,69 @@
	+; RUN: opt -module-summary -o %t1.bc %s
	+; RUN: opt -module-summary -o %t2.bc %S/Inputs/dicompositetype-unique2.ll
	+; RUN: llvm-lto --thinlto-action=run %t1.bc %t2.bc -thinlto-save-temps=%t3.
	+; RUN: llvm-dis %t3.0.3.imported.bc -o - | FileCheck %s
	+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t --save-temps \
	+; RUN: -r %t1.bc,_ZN1CD2Ev,pl \
	+; RUN: -r %t1.bc,_ZN4CFVSD2Ev,l \
	+; RUN: -r %t1.bc,_Z3Getv,l \
	+; RUN: -r %t2.bc,_ZN4CFVSD2Ev,pl \
	+; RUN: -r %t2.bc,_Z3Getv,l
	+; RUN: llvm-dis %t.1.3.import.bc -o - | FileCheck %s
	+
	+; Only llvm-lto2 adds the dso_local keyword, hence the {{.*}}
	+; CHECK: define available_externally{{.*}} void @_ZN4CFVSD2Ev
	+
	+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	+target triple = "x86_64-scei-ps4"
	+
	+%class.C = type <{ i32 (...)**, %class.A, %struct.CFVS, [6 x i8] }>
	+%class.A = type { %struct.Vec }
	+%struct.Vec = type { i8 }
	+%struct.CFVS = type { %struct.Vec }
	+%struct.S = type { i8 }
	+
	+define void @_ZN1CD2Ev(%class.C* %this) unnamed_addr align 2 !dbg !8 {
	+entry:
	+ %this.addr = alloca %class.C*, align 8
	+ %this1 = load %class.C*, %class.C** %this.addr, align 8
	+ %m = getelementptr inbounds %class.C, %class.C* %this1, i32 0, i32 2
	+ call void @_ZN4CFVSD2Ev(%struct.CFVS* %m), !dbg !50
	+ ret void
	+}
	+
	+declare void @_ZN4CFVSD2Ev(%struct.CFVS*) unnamed_addr
	+
	+declare dereferenceable(1) %struct.S* @_Z3Getv()
	+
	+!llvm.dbg.cu = !{!0}
	+!llvm.module.flags = !{!3, !4, !5, !6}
	+
	+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
	+!1 = !DIFile(filename: "bz188598-a.cpp", directory: ".")
	+!2 = !{}
	+!3 = !{i32 2, !"Dwarf Version", i32 4}
	+!4 = !{i32 2, !"Debug Info Version", i32 3}
	+!5 = !{i32 1, !"wchar_size", i32 2}
	+!6 = !{i32 7, !"PIC Level", i32 2}
	+!8 = distinct !DISubprogram(name: "~C", linkageName: "_ZN1CD2Ev", scope: !9, file: !1, line: 9, type: !47, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !46, variables: !2)
	+!9 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C", file: !1, line: 5, size: 128, elements: !10, vtableHolder: !9, identifier: "_ZTS1C")
	+!10 = !{!38, !46}
	+!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Vec<&Get>", file: !16, line: 4, size: 8, elements: !17, templateParams: !22, identifier: "_ZTS3VecIXadL_Z3GetvEEE")
	+!16 = !DIFile(filename: "./bz188598.h", directory: ".")
	+!17 = !{!55}
	+!22 = !{!23}
	+!23 = !DITemplateValueParameter(name: "F", type: !24, value: %struct.S* ()* @_Z3Getv)
	+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !25, size: 64)
	+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Func", file: !16, line: 2, baseType: !26)
	+!26 = !DISubroutineType(types: !27)
	+!27 = !{!55}
	+!38 = !DIDerivedType(tag: DW_TAG_member, name: "m", scope: !9, file: !1, line: 7, baseType: !39, size: 8, offset: 72)
	+!39 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !16, line: 7, size: 8, elements: !40, identifier: "_ZTS4CFVS")
	+!40 = !{!41}
	+!41 = !DIDerivedType(tag: DW_TAG_member, name: "m_val", scope: !39, file: !16, line: 9, baseType: !15, size: 8)
	+!46 = !DISubprogram(name: "~C", scope: !9, file: !1, line: 6, type: !47, isLocal: false, isDefinition: false, scopeLine: 6, containingType: !9, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: false)
	+!47 = !DISubroutineType(types: !48)
	+!48 = !{!55}
	+!50 = !DILocation(line: 9, scope: !51)
	+!51 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9)
	+!55 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
	Index: vendor/llvm/dist-release_60/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll (revision 328362)
	@@ -0,0 +1,19 @@
	+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK
	+target datalayout =
	+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
	+target triple = "x86_64-unknown-linux-gnu"
	+
	+; Select when both offset and scale reg are present.
	+define i64 @test1(i1 %c, i64* %b, i64 %scale) {
	+; CHECK-LABEL: @test1
	+entry:
	+; CHECK-LABEL: entry:
	+ %g = getelementptr inbounds i64, i64* %b, i64 %scale
	+ %g1 = getelementptr inbounds i64, i64* %g, i64 8
	+ %g2 = getelementptr inbounds i64, i64* %g, i64 16
	+ %s = select i1 %c, i64* %g1, i64* %g2
	+; CHECK-NOT: sunkaddr
	+ %v = load i64 , i64* %s, align 8
	+ ret i64 %v
	+}
	+
	Index: vendor/llvm/dist-release_60/test/Transforms/GVNHoist/pr35222-hoist-load.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/GVNHoist/pr35222-hoist-load.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/Transforms/GVNHoist/pr35222-hoist-load.ll (revision 328362)
	@@ -1,25 +1,70 @@
	; RUN: opt -S -gvn-hoist < %s | FileCheck %s
	+; CHECK-LABEL: build_tree
	; CHECK: load
	; CHECK: load
	; Check that the load is not hoisted because the call can potentially
	; modify the global

	target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"

	@heap = external global i32, align 4

	define i32 @build_tree() unnamed_addr {
	entry:
	br label %do.body

	do.body: ; preds = %do.body, %entry
	%tmp9 = load i32, i32* @heap, align 4
	%cmp = call i1 @pqdownheap(i32 %tmp9)
	br i1 %cmp, label %do.body, label %do.end

	do.end: ; preds = %do.body
	%tmp20 = load i32, i32* @heap, align 4
	ret i32 %tmp20
	}

	declare i1 @pqdownheap(i32)
	+
	+@i = external hidden unnamed_addr global i32, align 4
	+@j = external hidden unnamed_addr global [573 x i32], align 4
	+@v = external global i1
	+
	+; CHECK-LABEL: test
	+; CHECK-LABEL: do.end
	+; CHECK: load
	+; Check that the load is not hoisted because the call can potentially
	+; modify the global
	+
	+define i32 @test() {
	+entry:
	+ br label %for.cond
	+
	+for.cond:
	+ %a3 = load volatile i1, i1* @v
	+ br i1 %a3, label %for.body, label %while.end
	+
	+for.body:
	+ br label %if.then
	+
	+if.then:
	+ %tmp4 = load i32, i32* @i, align 4
	+ br label %for.cond
	+
	+while.end:
	+ br label %do.body
	+
	+do.body:
	+ %tmp9 = load i32, i32* getelementptr inbounds ([573 x i32], [573 x i32]* @j,
	+i32 0, i32 1), align 4
	+ %tmp10 = load i32, i32* @i, align 4
	+ call void @fn()
	+ %a1 = load volatile i1, i1* @v
	+ br i1 %a1, label %do.body, label %do.end
	+
	+do.end:
	+ %tmp20 = load i32, i32* getelementptr inbounds ([573 x i32], [573 x i32]* @j,
	+i32 0, i32 1), align 4
	+ ret i32 %tmp20
	+}
	+
	+declare void @fn()
	Index: vendor/llvm/dist-release_60/test/Transforms/JumpThreading/ddt-crash3.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/JumpThreading/ddt-crash3.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/JumpThreading/ddt-crash3.ll (revision 328362)
	@@ -0,0 +1,43 @@
	+; RUN: opt < %s -jump-threading -disable-output -verify-dom-info
	+
	+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	+target triple = "x86_64-unknown-linux-gnu"
	+
	+@global = external local_unnamed_addr global i64, align 8
	+@global.1 = external local_unnamed_addr global i64, align 8
	+@global.2 = external local_unnamed_addr global i64, align 8
	+
	+; Function Attrs: norecurse noreturn nounwind uwtable
	+define void @hoge() local_unnamed_addr #0 {
	+bb:
	+ br label %bb1
	+
	+bb1: ; preds = %bb26, %bb
	+ %tmp = load i64, i64* @global, align 8, !tbaa !1
	+ %tmp2 = icmp eq i64 %tmp, 0
	+ br i1 %tmp2, label %bb27, label %bb3
	+
	+bb3: ; preds = %bb1
	+ %tmp4 = load i64, i64* @global.1, align 8, !tbaa !1
	+ %tmp5 = icmp eq i64 %tmp4, 0
	+ br i1 %tmp5, label %bb23, label %bb23
	+
	+bb23: ; preds = %bb3, %bb3
	+ br label %bb26
	+
	+bb26: ; preds = %bb27, %bb23
	+ br label %bb1
	+
	+bb27: ; preds = %bb1
	+ br label %bb26
	+}
	+
	+attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
	+
	+!llvm.ident = !{!0}
	+
	+!0 = !{!"clang version 7.0.0 "}
	+!1 = !{!2, !2, i64 0}
	+!2 = !{!"long", !3, i64 0}
	+!3 = !{!"omnipotent char", !4, i64 0}
	+!4 = !{!"Simple C/C++ TBAA"}
	Index: vendor/llvm/dist-release_60/test/Transforms/JumpThreading/ddt-crash4.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/JumpThreading/ddt-crash4.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/JumpThreading/ddt-crash4.ll (revision 328362)
	@@ -0,0 +1,75 @@
	+; RUN: opt < %s -jump-threading -disable-output -verify-dom-info
	+@global = external global i64, align 8
	+
	+define void @f() {
	+bb:
	+ br label %bb1
	+
	+bb1:
	+ %tmp = load i64, i64* @global, align 8
	+ %tmp2 = icmp eq i64 %tmp, 0
	+ br i1 %tmp2, label %bb27, label %bb3
	+
	+bb3:
	+ %tmp4 = load i64, i64* @global, align 8
	+ %tmp5 = icmp eq i64 %tmp4, 0
	+ br i1 %tmp5, label %bb6, label %bb7
	+
	+bb6:
	+ br label %bb7
	+
	+bb7:
	+ %tmp8 = phi i1 [ true, %bb3 ], [ undef, %bb6 ]
	+ %tmp9 = select i1 %tmp8, i64 %tmp4, i64 0
	+ br i1 false, label %bb10, label %bb23
	+
	+bb10:
	+ %tmp11 = load i64, i64* @global, align 8
	+ %tmp12 = icmp slt i64 %tmp11, 5
	+ br i1 %tmp12, label %bb13, label %bb17
	+
	+bb13:
	+ br label %bb14
	+
	+bb14:
	+ br i1 undef, label %bb15, label %bb16
	+
	+bb15:
	+ unreachable
	+
	+bb16:
	+ br label %bb10
	+
	+bb17:
	+ br label %bb18
	+
	+bb18:
	+ br i1 undef, label %bb22, label %bb13
	+
	+bb19:
	+ br i1 undef, label %bb20, label %bb21
	+
	+bb20:
	+ unreachable
	+
	+bb21:
	+ br label %bb18
	+
	+bb22:
	+ br label %bb23
	+
	+bb23:
	+ br i1 undef, label %bb24, label %bb13
	+
	+bb24:
	+ br i1 undef, label %bb26, label %bb25
	+
	+bb25:
	+ br label %bb19
	+
	+bb26:
	+ br label %bb1
	+
	+bb27:
	+ br label %bb24
	+}
	Index: vendor/llvm/dist-release_60/test/Transforms/LoopVectorize/pr35773.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/LoopVectorize/pr35773.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/LoopVectorize/pr35773.ll (revision 328362)
	@@ -0,0 +1,53 @@
	+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
	+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	+@a = common local_unnamed_addr global i32 0, align 4
	+@b = common local_unnamed_addr global i8 0, align 1
	+
	+; Function Attrs: norecurse nounwind uwtable
	+define void @doit1() local_unnamed_addr{
	+entry:
	+ br label %for.body
	+
	+for.body:
	+ %main.iv = phi i32 [ 0, %entry ], [ %inc, %for.body ]
	+
	+ %i8.iv = phi i8 [ 0, %entry ], [ %i8.add, %for.body ]
	+ %i32.iv = phi i32 [ 0, %entry ], [ %i32.add, %for.body ]
	+
	+ %trunc.to.be.converted.to.new.iv = trunc i32 %i32.iv to i8
	+ %i8.add = add i8 %i8.iv, %trunc.to.be.converted.to.new.iv
	+
	+ %noop.conv.under.pse = and i32 %i32.iv, 255
	+ %i32.add = add nuw nsw i32 %noop.conv.under.pse, 9
	+
	+ %inc = add i32 %main.iv, 1
	+ %tobool = icmp eq i32 %inc, 16
	+ br i1 %tobool, label %for.cond.for.end_crit_edge, label %for.body
	+
	+; CHECK-LABEL: @doit1(
	+; CHECK: vector.body:
	+; CHECK-NEXT: [[MAIN_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[MAIN_IV_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
	+; CHECK-NEXT: [[I8_IV:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[I8_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
	+; CHECK-NEXT: [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
	+; CHECK-NEXT: [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]
	+
	+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[MAIN_IV]], i32 0
	+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
	+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
	+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0
	+
	+; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
	+
	+; CHECK-NEXT: [[MAIN_IV_NEXT]] = add i32 [[MAIN_IV]], 4
	+; CHECK-NEXT: [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], <i32 36, i32 36, i32 36, i32 36>
	+; CHECK-NEXT: [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], <i8 36, i8 36, i8 36, i8 36>
	+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
	+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
	+
	+for.cond.for.end_crit_edge:
	+ store i8 %i8.add, i8* @b, align 1
	+ br label %for.end
	+
	+for.end:
	+ ret void
	+}
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35628_1.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35628_1.ll (revision 328362)
	@@ -0,0 +1,74 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
	+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
	+
	+define void @mainTest(i32* %ptr) #0 {
	+; CHECK-LABEL: @mainTest(
	+; CHECK-NEXT: entry:
	+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
	+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
	+; CHECK: loop:
	+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA5:%.*]], [[LOOP]] ]
	+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1
	+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2
	+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3
	+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
	+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
	+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
	+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
	+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
	+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
	+; CHECK-NEXT: [[TMP9:%.*]] = add i32 1, undef
	+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], undef
	+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], undef
	+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], undef
	+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], undef
	+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP6]] to i64
	+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], undef
	+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
	+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[RDX_SHUF]]
	+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
	+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
	+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
	+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP16]], 1
	+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]]
	+; CHECK-NEXT: [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], [[TMP6]]
	+; CHECK-NEXT: [[OP_EXTRA5]] = add i32 [[OP_EXTRA4]], [[TMP5]]
	+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], undef
	+; CHECK-NEXT: br label [[LOOP]]
	+; CHECK: bail_out:
	+; CHECK-NEXT: ret void
	+;
	+entry:
	+ %cmp = icmp eq i32* %ptr, null
	+ br i1 %cmp, label %loop, label %bail_out
	+
	+loop:
	+ %dummy_phi = phi i32 [ 1, %entry ], [ %18, %loop ]
	+ %0 = load i32, i32 * %ptr , align 4
	+ %1 = mul i32 %0, %0
	+ %2 = add i32 1, %1
	+ %3 = getelementptr inbounds i32, i32 * %ptr, i64 1
	+ %4 = load i32, i32 * %3 , align 4
	+ %5 = mul i32 %4, %4
	+ %6 = add i32 %2, %4
	+ %7 = add i32 %6, %5
	+ %8 = getelementptr inbounds i32, i32 *%ptr, i64 2
	+ %9 = load i32, i32 * %8 , align 4
	+ %10 = mul i32 %9, %9
	+ %11 = add i32 %7, %9
	+ %12 = add i32 %11, %10
	+ %13 = sext i32 %9 to i64
	+ %14 = getelementptr inbounds i32, i32 *%ptr, i64 3
	+ %15 = load i32, i32 * %14 , align 4
	+ %16 = mul i32 %15, %15
	+ %17 = add i32 %12, %15
	+ %18 = add i32 %17, %16
	+ br label %loop
	+
	+bail_out:
	+ ret void
	+}
	+
	+attributes #0 = { "target-cpu"="westmere" }
	+
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35628_2.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35628_2.ll (revision 328362)
	@@ -0,0 +1,64 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s
	+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
	+
	+define void @test() #0 {
	+; CHECK-LABEL: @test(
	+; CHECK-NEXT: entry:
	+; CHECK-NEXT: br label [[LOOP:%.*]]
	+; CHECK: loop:
	+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
	+; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP6:%.*]], [[LOOP]] ]
	+; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
	+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 0
	+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
	+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
	+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
	+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> <i64 3, i64 2, i64 1, i64 0>, [[TMP4]]
	+; CHECK-NEXT: [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
	+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
	+; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
	+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP5]]
	+; CHECK-NEXT: [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], <i64 32, i64 32, i64 32, i64 32>
	+; CHECK-NEXT: [[SUM1:%.*]] = add i64 undef, undef
	+; CHECK-NEXT: [[SUM2:%.*]] = add i64 [[SUM1]], undef
	+; CHECK-NEXT: [[ZSUM:%.*]] = add i64 [[SUM2]], 0
	+; CHECK-NEXT: [[JOIN:%.*]] = add i64 undef, [[ZSUM]]
	+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
	+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[RDX_SHUF]]
	+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
	+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
	+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
	+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0
	+; CHECK-NEXT: [[OP_EXTRA3]] = add i64 [[OP_EXTRA]], [[TMP6]]
	+; CHECK-NEXT: [[LAST:%.*]] = add i64 [[JOIN]], undef
	+; CHECK-NEXT: br label [[LOOP]]
	+;
	+entry:
	+ br label %loop
	+
	+loop:
	+ %dummy_phi = phi i64 [ 1, %entry ], [ %last, %loop ]
	+ %0 = phi i64 [ 2, %entry ], [ %fork, %loop ]
	+ %inc1 = add i64 %0, 1
	+ %inc2 = add i64 %0, 2
	+ %inc11 = add i64 1, %inc1
	+ %exact1 = ashr exact i64 %inc11, 32
	+ %inc3 = add i64 %0, 3
	+ %dummy_add = add i16 0, 0
	+ %inc12 = add i64 1, %inc2
	+ %exact2 = ashr exact i64 %inc12, 32
	+ %dummy_shl = shl i64 %inc3, 32
	+ %inc13 = add i64 1, %inc3
	+ %exact3 = ashr exact i64 %inc13, 32
	+ %fork = add i64 %0, 0
	+ %sum1 = add i64 %exact3, %exact2
	+ %sum2 = add i64 %sum1, %exact1
	+ %zsum = add i64 %sum2, 0
	+ %sext22 = add i64 1, %fork
	+ %exact4 = ashr exact i64 %sext22, 32
	+ %join = add i64 %fork, %zsum
	+ %last = add i64 %join, %exact4
	+ br label %loop
	+}
	+
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35777.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35777.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35777.ll (revision 328362)
	@@ -0,0 +1,48 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	+; RUN: opt < %s -verify -slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s
	+
	+@global = local_unnamed_addr global [6 x double] zeroinitializer, align 16
	+
	+define { i64, i64 } @patatino(double %arg) {
	+; CHECK-LABEL: @patatino(
	+; CHECK-NEXT: bb:
	+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16
	+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
	+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
	+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
	+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
	+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
	+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
	+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
	+; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
	+; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
	+; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32>
	+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
	+; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
	+; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0
	+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
	+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
	+; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1
	+; CHECK-NEXT: ret { i64, i64 } [[TMP17]]
	+;
	+bb:
	+ %tmp = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16
	+ %tmp1 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16
	+ %tmp2 = fmul double %tmp1, %arg
	+ %tmp3 = fadd double %tmp, %tmp2
	+ %tmp4 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16
	+ %tmp5 = fadd double %tmp4, %tmp3
	+ %tmp6 = fptosi double %tmp5 to i32
	+ %tmp7 = sext i32 %tmp6 to i64
	+ %tmp8 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8
	+ %tmp9 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8
	+ %tmp10 = fmul double %tmp9, %arg
	+ %tmp11 = fadd double %tmp8, %tmp10
	+ %tmp12 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8
	+ %tmp13 = fadd double %tmp12, %tmp11
	+ %tmp14 = fptosi double %tmp13 to i32
	+ %tmp15 = sext i32 %tmp14 to i64
	+ %tmp16 = insertvalue { i64, i64 } undef, i64 %tmp7, 0
	+ %tmp17 = insertvalue { i64, i64 } %tmp16, i64 %tmp15, 1
	+ ret { i64, i64 } %tmp17
	+}
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35865.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35865.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/PR35865.ll (revision 328362)
	@@ -0,0 +1,27 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
	+
	+define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
	+; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
	+; CHECK-NEXT: entry:
	+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
	+; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
	+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
	+; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
	+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
	+; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
	+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
	+; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
	+; CHECK-NEXT: ret void
	+;
	+entry:
	+ %0 = extractelement <16 x half> undef, i32 4
	+ %conv.i.4.i = fpext half %0 to float
	+ %1 = bitcast float %conv.i.4.i to i32
	+ %vecins.i.4.i = insertelement <8 x i32> undef, i32 %1, i32 4
	+ %2 = extractelement <16 x half> undef, i32 5
	+ %conv.i.5.i = fpext half %2 to float
	+ %3 = bitcast float %conv.i.5.i to i32
	+ %vecins.i.5.i = insertelement <8 x i32> %vecins.i.4.i, i32 %3, i32 5
	+ ret void
	+}
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll (revision 328362)
	@@ -1,754 +1,754 @@
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
	; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s --check-prefix=ZEROTHRESH
	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"

	target triple = "x86_64-apple-macosx10.8.0"

	define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select(
	-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
	-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
	+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
	+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
	; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
	; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
	; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
	; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
	; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
	; CHECK-NEXT: ret <4 x float> [[RD]]
	;
	; ZEROTHRESH-LABEL: @simple_select(
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
	-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
	+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
	+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
	; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
	; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
	; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
	; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%c2 = extractelement <4 x i32> %c, i32 2
	%c3 = extractelement <4 x i32> %c, i32 3
	%a0 = extractelement <4 x float> %a, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%a2 = extractelement <4 x float> %a, i32 2
	%a3 = extractelement <4 x float> %a, i32 3
	%b0 = extractelement <4 x float> %b, i32 0
	%b1 = extractelement <4 x float> %b, i32 1
	%b2 = extractelement <4 x float> %b, i32 2
	%b3 = extractelement <4 x float> %b, i32 3
	%cmp0 = icmp ne i32 %c0, 0
	%cmp1 = icmp ne i32 %c1, 0
	%cmp2 = icmp ne i32 %c2, 0
	%cmp3 = icmp ne i32 %c3, 0
	%s0 = select i1 %cmp0, float %a0, float %b0
	%s1 = select i1 %cmp1, float %a1, float %b1
	%s2 = select i1 %cmp2, float %a2, float %b2
	%s3 = select i1 %cmp3, float %a3, float %b3
	%ra = insertelement <4 x float> undef, float %s0, i32 0
	%rb = insertelement <4 x float> %ra, float %s1, i32 1
	%rc = insertelement <4 x float> %rb, float %s2, i32 2
	%rd = insertelement <4 x float> %rc, float %s3, i32 3
	ret <4 x float> %rd
	}

	declare void @llvm.assume(i1) nounwind

	; This entire tree is ephemeral, don't vectorize any of it.
	define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select_eph(
	-; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
	-; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
	-; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
	-; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
	-; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
	-; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
	-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
	-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
	-; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
	-; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
	-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
	-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
	+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
	+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
	+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
	+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
	+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
	+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
	+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
	+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
	+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
	+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
	; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
	; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
	; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
	; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
	; CHECK-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
	; CHECK-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
	; CHECK-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
	; CHECK-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
	; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
	; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
	; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2
	; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
	; CHECK-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
	; CHECK-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
	; CHECK-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
	; CHECK-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
	; CHECK-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
	; CHECK-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
	; CHECK-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
	; CHECK-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
	; CHECK-NEXT: call void @llvm.assume(i1 [[QI]])
	; CHECK-NEXT: ret <4 x float> undef
	;
	; ZEROTHRESH-LABEL: @simple_select_eph(
	-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
	-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
	-; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
	-; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
	-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
	-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
	-; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
	-; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
	-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
	-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
	-; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
	-; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
	+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
	+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
	+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
	+; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
	+; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
	+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
	+; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
	+; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
	; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
	; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
	; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
	; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
	; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
	; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
	; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
	; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
	; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2
	; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
	; ZEROTHRESH-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
	; ZEROTHRESH-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
	; ZEROTHRESH-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
	; ZEROTHRESH-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
	; ZEROTHRESH-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
	; ZEROTHRESH-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
	; ZEROTHRESH-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
	; ZEROTHRESH-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
	; ZEROTHRESH-NEXT: call void @llvm.assume(i1 [[QI]])
	; ZEROTHRESH-NEXT: ret <4 x float> undef
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%c2 = extractelement <4 x i32> %c, i32 2
	%c3 = extractelement <4 x i32> %c, i32 3
	%a0 = extractelement <4 x float> %a, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%a2 = extractelement <4 x float> %a, i32 2
	%a3 = extractelement <4 x float> %a, i32 3
	%b0 = extractelement <4 x float> %b, i32 0
	%b1 = extractelement <4 x float> %b, i32 1
	%b2 = extractelement <4 x float> %b, i32 2
	%b3 = extractelement <4 x float> %b, i32 3
	%cmp0 = icmp ne i32 %c0, 0
	%cmp1 = icmp ne i32 %c1, 0
	%cmp2 = icmp ne i32 %c2, 0
	%cmp3 = icmp ne i32 %c3, 0
	%s0 = select i1 %cmp0, float %a0, float %b0
	%s1 = select i1 %cmp1, float %a1, float %b1
	%s2 = select i1 %cmp2, float %a2, float %b2
	%s3 = select i1 %cmp3, float %a3, float %b3
	%ra = insertelement <4 x float> undef, float %s0, i32 0
	%rb = insertelement <4 x float> %ra, float %s1, i32 1
	%rc = insertelement <4 x float> %rb, float %s2, i32 2
	%rd = insertelement <4 x float> %rc, float %s3, i32 3
	%q0 = extractelement <4 x float> %rd, i32 0
	%q1 = extractelement <4 x float> %rd, i32 1
	%q2 = extractelement <4 x float> %rd, i32 2
	%q3 = extractelement <4 x float> %rd, i32 3
	%q4 = fadd float %q0, %q1
	%q5 = fadd float %q2, %q3
	%q6 = fadd float %q4, %q5
	%qi = fcmp olt float %q6, %q5
	call void @llvm.assume(i1 %qi)
	ret <4 x float> undef
	}

	; Insert in an order different from the vector indices to make sure it
	; doesn't matter
	define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select_insert_out_of_order(
	-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
	-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
	+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
	+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
	; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
	; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
	; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
	; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
	; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
	; CHECK-NEXT: ret <4 x float> [[RD]]
	;
	; ZEROTHRESH-LABEL: @simple_select_insert_out_of_order(
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
	-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
	+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
	+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
	; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
	; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
	; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
	; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%c2 = extractelement <4 x i32> %c, i32 2
	%c3 = extractelement <4 x i32> %c, i32 3
	%a0 = extractelement <4 x float> %a, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%a2 = extractelement <4 x float> %a, i32 2
	%a3 = extractelement <4 x float> %a, i32 3
	%b0 = extractelement <4 x float> %b, i32 0
	%b1 = extractelement <4 x float> %b, i32 1
	%b2 = extractelement <4 x float> %b, i32 2
	%b3 = extractelement <4 x float> %b, i32 3
	%cmp0 = icmp ne i32 %c0, 0
	%cmp1 = icmp ne i32 %c1, 0
	%cmp2 = icmp ne i32 %c2, 0
	%cmp3 = icmp ne i32 %c3, 0
	%s0 = select i1 %cmp0, float %a0, float %b0
	%s1 = select i1 %cmp1, float %a1, float %b1
	%s2 = select i1 %cmp2, float %a2, float %b2
	%s3 = select i1 %cmp3, float %a3, float %b3
	%ra = insertelement <4 x float> undef, float %s0, i32 2
	%rb = insertelement <4 x float> %ra, float %s1, i32 1
	%rc = insertelement <4 x float> %rb, float %s2, i32 0
	%rd = insertelement <4 x float> %rc, float %s3, i32 3
	ret <4 x float> %rd
	}

	declare void @v4f32_user(<4 x float>) #0
	declare void @f32_user(float) #0

	; Multiple users of the final constructed vector
	define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select_users(
	-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
	-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
	+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
	+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
	; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
	; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
	; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
	; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
	; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
	; CHECK-NEXT: call void @v4f32_user(<4 x float> [[RD]]) #0
	; CHECK-NEXT: ret <4 x float> [[RD]]
	;
	; ZEROTHRESH-LABEL: @simple_select_users(
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
	-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
	+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
	+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
	; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
	; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
	; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
	; ZEROTHRESH-NEXT: call void @v4f32_user(<4 x float> [[RD]]) #0
	; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%c2 = extractelement <4 x i32> %c, i32 2
	%c3 = extractelement <4 x i32> %c, i32 3
	%a0 = extractelement <4 x float> %a, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%a2 = extractelement <4 x float> %a, i32 2
	%a3 = extractelement <4 x float> %a, i32 3
	%b0 = extractelement <4 x float> %b, i32 0
	%b1 = extractelement <4 x float> %b, i32 1
	%b2 = extractelement <4 x float> %b, i32 2
	%b3 = extractelement <4 x float> %b, i32 3
	%cmp0 = icmp ne i32 %c0, 0
	%cmp1 = icmp ne i32 %c1, 0
	%cmp2 = icmp ne i32 %c2, 0
	%cmp3 = icmp ne i32 %c3, 0
	%s0 = select i1 %cmp0, float %a0, float %b0
	%s1 = select i1 %cmp1, float %a1, float %b1
	%s2 = select i1 %cmp2, float %a2, float %b2
	%s3 = select i1 %cmp3, float %a3, float %b3
	%ra = insertelement <4 x float> undef, float %s0, i32 0
	%rb = insertelement <4 x float> %ra, float %s1, i32 1
	%rc = insertelement <4 x float> %rb, float %s2, i32 2
	%rd = insertelement <4 x float> %rc, float %s3, i32 3
	call void @v4f32_user(<4 x float> %rd) #0
	ret <4 x float> %rd
	}

	; Unused insertelement
	define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select_no_users(
	-; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
	-; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
	-; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
	-; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
	-; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
	-; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
	-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
	-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
	-; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
	-; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
	-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
	-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
	+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
	+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
	+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
	+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
	+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
	+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
	+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
	+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
	+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
	+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
	; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
	; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
	; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
	; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
	; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
	; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
	; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
	; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
	; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
	; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
	; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
	; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
	; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
	; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
	; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
	; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
	; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
	; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
	; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
	; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
	; CHECK-NEXT: ret <4 x float> [[RD]]
	;
	; ZEROTHRESH-LABEL: @simple_select_no_users(
	-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
	-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
	-; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
	-; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
	-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
	-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
	-; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
	-; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
	-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
	-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
	-; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
	-; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
	+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
	+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
	+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
	+; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
	+; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
	+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
	+; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
	+; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
	; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
	; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
	; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
	; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
	; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
	; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
	; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
	; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
	; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2
	; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
	; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%c2 = extractelement <4 x i32> %c, i32 2
	%c3 = extractelement <4 x i32> %c, i32 3
	%a0 = extractelement <4 x float> %a, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%a2 = extractelement <4 x float> %a, i32 2
	%a3 = extractelement <4 x float> %a, i32 3
	%b0 = extractelement <4 x float> %b, i32 0
	%b1 = extractelement <4 x float> %b, i32 1
	%b2 = extractelement <4 x float> %b, i32 2
	%b3 = extractelement <4 x float> %b, i32 3
	%cmp0 = icmp ne i32 %c0, 0
	%cmp1 = icmp ne i32 %c1, 0
	%cmp2 = icmp ne i32 %c2, 0
	%cmp3 = icmp ne i32 %c3, 0
	%s0 = select i1 %cmp0, float %a0, float %b0
	%s1 = select i1 %cmp1, float %a1, float %b1
	%s2 = select i1 %cmp2, float %a2, float %b2
	%s3 = select i1 %cmp3, float %a3, float %b3
	%ra = insertelement <4 x float> undef, float %s0, i32 0
	%rb = insertelement <4 x float> %ra, float %s1, i32 1
	%rc = insertelement <4 x float> undef, float %s2, i32 2
	%rd = insertelement <4 x float> %rc, float %s3, i32 3
	ret <4 x float> %rd
	}

	; Make sure infinite loop doesn't happen which I ran into when trying
	; to do this backwards this backwards
	define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
	; CHECK-LABEL: @reconstruct(
	-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> %c, i32 0
	-; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
	-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> %c, i32 1
	-; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP2]], i32 1
	-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> %c, i32 2
	-; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP3]], i32 2
	-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> %c, i32 3
	-; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP4]], i32 3
	+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
	+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[C]], i32 2
	+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[C]], i32 0
	+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
	+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP3]], i32 1
	+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP2]], i32 2
	+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP1]], i32 3
	; CHECK-NEXT: ret <4 x i32> [[RD]]
	;
	; ZEROTHRESH-LABEL: @reconstruct(
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> %c, i32 0
	-; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
	-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> %c, i32 1
	-; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP2]], i32 1
	-; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> %c, i32 2
	-; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP3]], i32 2
	-; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> %c, i32 3
	-; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP4]], i32 3
	+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
	+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
	+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0
	+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
	+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
	+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
	; ZEROTHRESH-NEXT: ret <4 x i32> [[RD]]
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%c2 = extractelement <4 x i32> %c, i32 2
	%c3 = extractelement <4 x i32> %c, i32 3
	%ra = insertelement <4 x i32> undef, i32 %c0, i32 0
	%rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
	%rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
	%rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
	ret <4 x i32> %rd
	}

	define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select_v2(
	-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> %c, zeroinitializer
	-; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> %a, <2 x float> %b
	+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
	+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
	; CHECK-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
	; CHECK-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
	; CHECK-NEXT: ret <2 x float> [[RB]]
	;
	; ZEROTHRESH-LABEL: @simple_select_v2(
	-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> %c, i32 0
	-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> %c, i32 1
	-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> %a, i32 0
	-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> %a, i32 1
	-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> %b, i32 0
	-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> %b, i32 1
	+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1
	+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1
	+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1
	; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
	; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
	; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
	; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1
	; ZEROTHRESH-NEXT: ret <2 x float> [[RB]]
	;
	%c0 = extractelement <2 x i32> %c, i32 0
	%c1 = extractelement <2 x i32> %c, i32 1
	%a0 = extractelement <2 x float> %a, i32 0
	%a1 = extractelement <2 x float> %a, i32 1
	%b0 = extractelement <2 x float> %b, i32 0
	%b1 = extractelement <2 x float> %b, i32 1
	%cmp0 = icmp ne i32 %c0, 0
	%cmp1 = icmp ne i32 %c1, 0
	%s0 = select i1 %cmp0, float %a0, float %b0
	%s1 = select i1 %cmp1, float %a1, float %b1
	%ra = insertelement <2 x float> undef, float %s0, i32 0
	%rb = insertelement <2 x float> %ra, float %s1, i32 1
	ret <2 x float> %rb
	}

	; Make sure when we construct partial vectors, we don't keep
	; re-visiting the insertelement chains starting with undef
	; (low cost threshold needed to force this to happen)
	define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
	; CHECK-LABEL: @simple_select_partial_vector(
	-; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
	-; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
	-; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
	-; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
	-; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
	-; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
	+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
	+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
	+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
	+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
	; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
	; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
	; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
	; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
	; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
	; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 0
	; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
	; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP10]], i32 1
	; CHECK-NEXT: ret <4 x float> [[RB]]
	;
	; ZEROTHRESH-LABEL: @simple_select_partial_vector(
	-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
	-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
	-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
	-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
	-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
	-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
	+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
	+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
	+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
	; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
	; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
	; ZEROTHRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
	; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
	; ZEROTHRESH-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
	; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
	; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 0
	; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
	; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP10]], i32 1
	; ZEROTHRESH-NEXT: ret <4 x float> [[RB]]
	;
	%c0 = extractelement <4 x i32> %c, i32 0
	%c1 = extractelement <4 x i32> %c, i32 1
	%a0 = extractelement <4 x float> %a, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%b0 = extractelement <4 x float> %b, i32 0
	%b1 = extractelement <4 x float> %b, i32 1
	%1 = insertelement <2 x i32> undef, i32 %c0, i32 0
	%2 = insertelement <2 x i32> %1, i32 %c1, i32 1
	%3 = icmp ne <2 x i32> %2, zeroinitializer
	%4 = insertelement <2 x float> undef, float %a0, i32 0
	%5 = insertelement <2 x float> %4, float %a1, i32 1
	%6 = insertelement <2 x float> undef, float %b0, i32 0
	%7 = insertelement <2 x float> %6, float %b1, i32 1
	%8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
	%9 = extractelement <2 x float> %8, i32 0
	%ra = insertelement <4 x float> undef, float %9, i32 0
	%10 = extractelement <2 x float> %8, i32 1
	%rb = insertelement <4 x float> %ra, float %10, i32 1
	ret <4 x float> %rb
	}

	; Make sure that vectorization happens even if insertelements operations
	; must be rescheduled. The case here is from compiling Julia.
	define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
	; CHECK-LABEL: @reschedule_extract(
	-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
	+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
	; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
	; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
	; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
	; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
	; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
	; CHECK-NEXT: ret <4 x float> [[V3]]
	;
	; ZEROTHRESH-LABEL: @reschedule_extract(
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
	+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
	; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
	; ZEROTHRESH-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
	; ZEROTHRESH-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
	; ZEROTHRESH-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
	; ZEROTHRESH-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
	; ZEROTHRESH-NEXT: ret <4 x float> [[V3]]
	;
	%a0 = extractelement <4 x float> %a, i32 0
	%b0 = extractelement <4 x float> %b, i32 0
	%c0 = fadd float %a0, %b0
	%v0 = insertelement <4 x float> undef, float %c0, i32 0
	%a1 = extractelement <4 x float> %a, i32 1
	%b1 = extractelement <4 x float> %b, i32 1
	%c1 = fadd float %a1, %b1
	%v1 = insertelement <4 x float> %v0, float %c1, i32 1
	%a2 = extractelement <4 x float> %a, i32 2
	%b2 = extractelement <4 x float> %b, i32 2
	%c2 = fadd float %a2, %b2
	%v2 = insertelement <4 x float> %v1, float %c2, i32 2
	%a3 = extractelement <4 x float> %a, i32 3
	%b3 = extractelement <4 x float> %b, i32 3
	%c3 = fadd float %a3, %b3
	%v3 = insertelement <4 x float> %v2, float %c3, i32 3
	ret <4 x float> %v3
	}

	; Check that cost model for vectorization takes credit for
	; instructions that are erased.
	define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
	; CHECK-LABEL: @take_credit(
	-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
	+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
	; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
	; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
	; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
	; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
	; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
	; CHECK-NEXT: ret <4 x float> [[V3]]
	;
	; ZEROTHRESH-LABEL: @take_credit(
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
	+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
	; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
	; ZEROTHRESH-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
	; ZEROTHRESH-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
	; ZEROTHRESH-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
	; ZEROTHRESH-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
	; ZEROTHRESH-NEXT: ret <4 x float> [[V3]]
	;
	%a0 = extractelement <4 x float> %a, i32 0
	%b0 = extractelement <4 x float> %b, i32 0
	%c0 = fadd float %a0, %b0
	%a1 = extractelement <4 x float> %a, i32 1
	%b1 = extractelement <4 x float> %b, i32 1
	%c1 = fadd float %a1, %b1
	%a2 = extractelement <4 x float> %a, i32 2
	%b2 = extractelement <4 x float> %b, i32 2
	%c2 = fadd float %a2, %b2
	%a3 = extractelement <4 x float> %a, i32 3
	%b3 = extractelement <4 x float> %b, i32 3
	%c3 = fadd float %a3, %b3
	%v0 = insertelement <4 x float> undef, float %c0, i32 0
	%v1 = insertelement <4 x float> %v0, float %c1, i32 1
	%v2 = insertelement <4 x float> %v1, float %c2, i32 2
	%v3 = insertelement <4 x float> %v2, float %c3, i32 3
	ret <4 x float> %v3
	}

	; Make sure we handle multiple trees that feed one build vector correctly.
	define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
	; CHECK-LABEL: @multi_tree(
	; CHECK-NEXT: entry:
	-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
	-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
	-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
	-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
	+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
	+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
	+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
	+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
	; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
	; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
	; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
	; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
	; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
	; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
	; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
	; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
	; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
	; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
	; CHECK-NEXT: ret <4 x double> [[I4]]
	;
	; ZEROTHRESH-LABEL: @multi_tree(
	; ZEROTHRESH-NEXT: entry:
	-; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
	-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
	-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
	-; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
	+; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
	+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
	+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
	+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
	; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
	; ZEROTHRESH-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
	; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
	; ZEROTHRESH-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
	; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
	; ZEROTHRESH-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
	; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
	; ZEROTHRESH-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
	; ZEROTHRESH-NEXT: ret <4 x double> [[I4]]
	;
	entry:
	%t0 = fadd double %w , 0.000000e+00
	%t1 = fadd double %x , 1.000000e+00
	%t2 = fadd double %y , 2.000000e+00
	%t3 = fadd double %z , 3.000000e+00
	%t4 = fmul double %t0, 1.000000e+00
	%i1 = insertelement <4 x double> undef, double %t4, i32 3
	%t5 = fmul double %t1, 1.000000e+00
	%i2 = insertelement <4 x double> %i1, double %t5, i32 2
	%t6 = fmul double %t2, 1.000000e+00
	%i3 = insertelement <4 x double> %i2, double %t6, i32 1
	%t7 = fmul double %t3, 1.000000e+00
	%i4 = insertelement <4 x double> %i3, double %t7, i32 0
	ret <4 x double> %i4
	}

	define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
	; CHECK-LABEL: @_vadd256(
	; CHECK-NEXT: entry:
	-; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
	+; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
	; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
	; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
	; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
	; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
	; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
	; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
	; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
	; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
	; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
	; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
	; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
	; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
	; CHECK-NEXT: ret <8 x float> [[VECINIT7_I]]
	;
	; ZEROTHRESH-LABEL: @_vadd256(
	; ZEROTHRESH-NEXT: entry:
	-; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
	+; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
	; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
	; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
	; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
	; ZEROTHRESH-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
	; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
	; ZEROTHRESH-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
	; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
	; ZEROTHRESH-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
	; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
	; ZEROTHRESH-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
	; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
	; ZEROTHRESH-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
	; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
	; ZEROTHRESH-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
	; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
	; ZEROTHRESH-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
	; ZEROTHRESH-NEXT: ret <8 x float> [[VECINIT7_I]]
	;
	entry:
	%vecext = extractelement <8 x float> %a, i32 0
	%vecext1 = extractelement <8 x float> %b, i32 0
	%add = fadd float %vecext, %vecext1
	%vecext2 = extractelement <8 x float> %a, i32 1
	%vecext3 = extractelement <8 x float> %b, i32 1
	%add4 = fadd float %vecext2, %vecext3
	%vecext5 = extractelement <8 x float> %a, i32 2
	%vecext6 = extractelement <8 x float> %b, i32 2
	%add7 = fadd float %vecext5, %vecext6
	%vecext8 = extractelement <8 x float> %a, i32 3
	%vecext9 = extractelement <8 x float> %b, i32 3
	%add10 = fadd float %vecext8, %vecext9
	%vecext11 = extractelement <8 x float> %a, i32 4
	%vecext12 = extractelement <8 x float> %b, i32 4
	%add13 = fadd float %vecext11, %vecext12
	%vecext14 = extractelement <8 x float> %a, i32 5
	%vecext15 = extractelement <8 x float> %b, i32 5
	%add16 = fadd float %vecext14, %vecext15
	%vecext17 = extractelement <8 x float> %a, i32 6
	%vecext18 = extractelement <8 x float> %b, i32 6
	%add19 = fadd float %vecext17, %vecext18
	%vecext20 = extractelement <8 x float> %a, i32 7
	%vecext21 = extractelement <8 x float> %b, i32 7
	%add22 = fadd float %vecext20, %vecext21
	%vecinit.i = insertelement <8 x float> undef, float %add, i32 0
	%vecinit1.i = insertelement <8 x float> %vecinit.i, float %add4, i32 1
	%vecinit2.i = insertelement <8 x float> %vecinit1.i, float %add7, i32 2
	%vecinit3.i = insertelement <8 x float> %vecinit2.i, float %add10, i32 3
	%vecinit4.i = insertelement <8 x float> %vecinit3.i, float %add13, i32 4
	%vecinit5.i = insertelement <8 x float> %vecinit4.i, float %add16, i32 5
	%vecinit6.i = insertelement <8 x float> %vecinit5.i, float %add19, i32 6
	%vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
	ret <8 x float> %vecinit7.i
	}

	attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/insertvalue.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/insertvalue.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/insertvalue.ll (revision 328362)
	@@ -1,189 +1,307 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s

	-; CHECK-LABEL: julia_2xdouble
	-; CHECK: load <2 x double>
	-; CHECK: load <2 x double>
	-; CHECK: fmul <2 x double>
	-; CHECK: fadd <2 x double>
	define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
	+; CHECK-LABEL: @julia_2xdouble(
	+; CHECK-NEXT: top:
	+; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
	+; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
	+; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1
	+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
	+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
	+; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
	+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
	+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
	+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
	+; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
	+; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
	+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
	+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
	+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
	+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
	+; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
	+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
	+; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
	+; CHECK-NEXT: store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4
	+; CHECK-NEXT: ret void
	+;
	top:
	%px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
	%x0 = load double, double* %px0, align 4
	%py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
	%y0 = load double, double* %py0, align 4
	%m0 = fmul double %x0, %y0
	%px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
	%x1 = load double, double* %px1, align 4
	%py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
	%y1 = load double, double* %py1, align 4
	%m1 = fmul double %x1, %y1
	%pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
	%z0 = load double, double* %pz0, align 4
	%a0 = fadd double %m0, %z0
	%i0 = insertvalue [2 x double] undef, double %a0, 0
	%pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
	%z1 = load double, double* %pz1, align 4
	%a1 = fadd double %m1, %z1
	%i1 = insertvalue [2 x double] %i0, double %a1, 1
	store [2 x double] %i1, [2 x double]* %0, align 4
	ret void
	}

	-; CHECK-LABEL: julia_4xfloat
	-; CHECK: load <4 x float>
	-; CHECK: load <4 x float>
	-; CHECK: fmul <4 x float>
	-; CHECK: fadd <4 x float>
	define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
	+; CHECK-LABEL: @julia_4xfloat(
	+; CHECK-NEXT: top:
	+; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2:%.*]], i64 0, i64 0
	+; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3:%.*]], i64 0, i64 0
	+; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 1
	+; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 1
	+; CHECK-NEXT: [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2
	+; CHECK-NEXT: [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2
	+; CHECK-NEXT: [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3
	+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
	+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
	+; CHECK-NEXT: [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
	+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
	+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
	+; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
	+; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
	+; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
	+; CHECK-NEXT: [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
	+; CHECK-NEXT: [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
	+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
	+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
	+; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
	+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
	+; CHECK-NEXT: [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
	+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
	+; CHECK-NEXT: [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
	+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
	+; CHECK-NEXT: [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
	+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
	+; CHECK-NEXT: [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
	+; CHECK-NEXT: store [4 x float] [[I3]], [4 x float]* [[TMP0:%.*]], align 4
	+; CHECK-NEXT: ret void
	+;
	top:
	%px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
	%x0 = load float, float* %px0, align 4
	%py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
	%y0 = load float, float* %py0, align 4
	%m0 = fmul float %x0, %y0
	%px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
	%x1 = load float, float* %px1, align 4
	%py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
	%y1 = load float, float* %py1, align 4
	%m1 = fmul float %x1, %y1
	%px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
	%x2 = load float, float* %px2, align 4
	%py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
	%y2 = load float, float* %py2, align 4
	%m2 = fmul float %x2, %y2
	%px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
	%x3 = load float, float* %px3, align 4
	%py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
	%y3 = load float, float* %py3, align 4
	%m3 = fmul float %x3, %y3
	%pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
	%z0 = load float, float* %pz0, align 4
	%a0 = fadd float %m0, %z0
	%i0 = insertvalue [4 x float] undef, float %a0, 0
	%pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
	%z1 = load float, float* %pz1, align 4
	%a1 = fadd float %m1, %z1
	%i1 = insertvalue [4 x float] %i0, float %a1, 1
	%pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
	%z2 = load float, float* %pz2, align 4
	%a2 = fadd float %m2, %z2
	%i2 = insertvalue [4 x float] %i1, float %a2, 2
	%pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
	%z3 = load float, float* %pz3, align 4
	%a3 = fadd float %m3, %z3
	%i3 = insertvalue [4 x float] %i2, float %a3, 3
	store [4 x float] %i3, [4 x float]* %0, align 4
	ret void
	}

	-; CHECK-LABEL: julia_load_array_of_float
	-; CHECK: fsub <4 x float>
	define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
	+; CHECK-LABEL: @julia_load_array_of_float(
	+; CHECK-NEXT: top:
	+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x float]* [[A:%.*]] to <4 x float>*
	+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
	+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x float], [4 x float]* [[A]], align 4
	+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x float]* [[B:%.*]] to <4 x float>*
	+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
	+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x float], [4 x float]* [[B]], align 4
	+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
	+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
	+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
	+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
	+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
	+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
	+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
	+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
	+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
	+; CHECK-NEXT: store [4 x float] [[C_ARR3]], [4 x float]* [[C:%.*]], align 4
	+; CHECK-NEXT: ret void
	+;
	top:
	%a_arr = load [4 x float], [4 x float]* %a, align 4
	%a0 = extractvalue [4 x float] %a_arr, 0
	%a2 = extractvalue [4 x float] %a_arr, 2
	%a1 = extractvalue [4 x float] %a_arr, 1
	%b_arr = load [4 x float], [4 x float]* %b, align 4
	%b0 = extractvalue [4 x float] %b_arr, 0
	%b2 = extractvalue [4 x float] %b_arr, 2
	%b1 = extractvalue [4 x float] %b_arr, 1
	%a3 = extractvalue [4 x float] %a_arr, 3
	%c1 = fsub float %a1, %b1
	%b3 = extractvalue [4 x float] %b_arr, 3
	%c0 = fsub float %a0, %b0
	%c2 = fsub float %a2, %b2
	%c_arr0 = insertvalue [4 x float] undef, float %c0, 0
	%c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
	%c3 = fsub float %a3, %b3
	%c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
	%c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
	store [4 x float] %c_arr3, [4 x float]* %c, align 4
	ret void
	}

	-; CHECK-LABEL: julia_load_array_of_i32
	-; CHECK: load <4 x i32>
	-; CHECK: load <4 x i32>
	-; CHECK: sub <4 x i32>
	define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
	+; CHECK-LABEL: @julia_load_array_of_i32(
	+; CHECK-NEXT: top:
	+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x i32]* [[A:%.*]] to <4 x i32>*
	+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
	+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i32], [4 x i32]* [[A]], align 4
	+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x i32]* [[B:%.*]] to <4 x i32>*
	+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
	+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i32], [4 x i32]* [[B]], align 4
	+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
	+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
	+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
	+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
	+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
	+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
	+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
	+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
	+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
	+; CHECK-NEXT: store [4 x i32] [[C_ARR3]], [4 x i32]* [[C:%.*]], align 4
	+; CHECK-NEXT: ret void
	+;
	top:
	%a_arr = load [4 x i32], [4 x i32]* %a, align 4
	%a0 = extractvalue [4 x i32] %a_arr, 0
	%a2 = extractvalue [4 x i32] %a_arr, 2
	%a1 = extractvalue [4 x i32] %a_arr, 1
	%b_arr = load [4 x i32], [4 x i32]* %b, align 4
	%b0 = extractvalue [4 x i32] %b_arr, 0
	%b2 = extractvalue [4 x i32] %b_arr, 2
	%b1 = extractvalue [4 x i32] %b_arr, 1
	%a3 = extractvalue [4 x i32] %a_arr, 3
	%c1 = sub i32 %a1, %b1
	%b3 = extractvalue [4 x i32] %b_arr, 3
	%c0 = sub i32 %a0, %b0
	%c2 = sub i32 %a2, %b2
	%c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
	%c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
	%c3 = sub i32 %a3, %b3
	%c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
	%c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
	store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
	ret void
	}

	; Almost identical to previous test, but for type that should NOT be vectorized.
	;
	-; CHECK-LABEL: julia_load_array_of_i16
	-; CHECK-NOT: i2>
	define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
	+; CHECK-LABEL: @julia_load_array_of_i16(
	+; CHECK-NEXT: top:
	+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4
	+; CHECK-NEXT: [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
	+; CHECK-NEXT: [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
	+; CHECK-NEXT: [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
	+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4
	+; CHECK-NEXT: [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
	+; CHECK-NEXT: [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
	+; CHECK-NEXT: [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
	+; CHECK-NEXT: [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
	+; CHECK-NEXT: [[C1:%.*]] = sub i16 [[A1]], [[B1]]
	+; CHECK-NEXT: [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
	+; CHECK-NEXT: [[C0:%.*]] = sub i16 [[A0]], [[B0]]
	+; CHECK-NEXT: [[C2:%.*]] = sub i16 [[A2]], [[B2]]
	+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
	+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
	+; CHECK-NEXT: [[C3:%.*]] = sub i16 [[A3]], [[B3]]
	+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
	+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
	+; CHECK-NEXT: store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4
	+; CHECK-NEXT: ret void
	+;
	top:
	%a_arr = load [4 x i16], [4 x i16]* %a, align 4
	%a0 = extractvalue [4 x i16] %a_arr, 0
	%a2 = extractvalue [4 x i16] %a_arr, 2
	%a1 = extractvalue [4 x i16] %a_arr, 1
	%b_arr = load [4 x i16], [4 x i16]* %b, align 4
	%b0 = extractvalue [4 x i16] %b_arr, 0
	%b2 = extractvalue [4 x i16] %b_arr, 2
	%b1 = extractvalue [4 x i16] %b_arr, 1
	%a3 = extractvalue [4 x i16] %a_arr, 3
	%c1 = sub i16 %a1, %b1
	%b3 = extractvalue [4 x i16] %b_arr, 3
	%c0 = sub i16 %a0, %b0
	%c2 = sub i16 %a2, %b2
	%c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
	%c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
	%c3 = sub i16 %a3, %b3
	%c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
	%c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
	store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
	ret void
	}

	%pseudovec = type { float, float, float, float }

	-; CHECK-LABEL: julia_load_struct_of_float
	-; CHECK: load <4 x float>
	-; CHECK: load <4 x float>
	-; CHECK: fsub <4 x float>
	define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
	+; CHECK-LABEL: @julia_load_struct_of_float(
	+; CHECK-NEXT: top:
	+; CHECK-NEXT: [[TMP0:%.*]] = bitcast %pseudovec* [[A:%.*]] to <4 x float>*
	+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
	+; CHECK-NEXT: [[A_STRUCT:%.*]] = load [[PSEUDOVEC:%.*]], %pseudovec* [[A]], align 4
	+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %pseudovec* [[B:%.*]] to <4 x float>*
	+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
	+; CHECK-NEXT: [[B_STRUCT:%.*]] = load [[PSEUDOVEC]], %pseudovec* [[B]], align 4
	+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
	+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
	+; CHECK-NEXT: [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC]] undef, float [[TMP5]], 0
	+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
	+; CHECK-NEXT: [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct0, float [[TMP6]], 1
	+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
	+; CHECK-NEXT: [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct1, float [[TMP7]], 2
	+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
	+; CHECK-NEXT: [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct2, float [[TMP8]], 3
	+; CHECK-NEXT: store [[PSEUDOVEC]] %c_struct3, %pseudovec* [[C:%.*]], align 4
	+; CHECK-NEXT: ret void
	+;
	top:
	%a_struct = load %pseudovec, %pseudovec* %a, align 4
	%a0 = extractvalue %pseudovec %a_struct, 0
	%a1 = extractvalue %pseudovec %a_struct, 1
	%b_struct = load %pseudovec, %pseudovec* %b, align 4
	%a2 = extractvalue %pseudovec %a_struct, 2
	%b0 = extractvalue %pseudovec %b_struct, 0
	%a3 = extractvalue %pseudovec %a_struct, 3
	%c0 = fsub float %a0, %b0
	%b1 = extractvalue %pseudovec %b_struct, 1
	%b2 = extractvalue %pseudovec %b_struct, 2
	%c1 = fsub float %a1, %b1
	%c_struct0 = insertvalue %pseudovec undef, float %c0, 0
	%b3 = extractvalue %pseudovec %b_struct, 3
	%c3 = fsub float %a3, %b3
	%c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
	%c2 = fsub float %a2, %b2
	%c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
	%c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
	store %pseudovec %c_struct3, %pseudovec* %c, align 4
	ret void
	}
	Index: vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/value-bug.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/value-bug.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/Transforms/SLPVectorizer/X86/value-bug.ll (revision 328362)
	@@ -1,80 +1,110 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	-target triple = "x86_64-grtev3-linux-gnu"

	; We used to crash on this example because we were building a constant
	; expression during vectorization and the vectorizer expects instructions
	; as elements of the vectorized tree.
	-; CHECK-LABEL: @test
	; PR19621

	define void @test() {
	+; CHECK-LABEL: @test(
	+; CHECK-NEXT: bb279:
	+; CHECK-NEXT: br label [[BB283:%.*]]
	+; CHECK: bb283:
	+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ]
	+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP13:%.*]], [[EXIT]] ]
	+; CHECK-NEXT: br label [[BB284:%.*]]
	+; CHECK: bb284:
	+; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
	+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
	+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
	+; CHECK-NEXT: br label [[BB21_I:%.*]]
	+; CHECK: bb21.i:
	+; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
	+; CHECK: bb22.i:
	+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
	+; CHECK-NEXT: br label [[BB32_I:%.*]]
	+; CHECK: bb32.i:
	+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
	+; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]]
	+; CHECK: exit:
	+; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
	+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP7]]
	+; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]]
	+; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> undef, [[TMP9]]
	+; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float>
	+; CHECK-NEXT: [[TMP317:%.*]] = fptrunc double undef to float
	+; CHECK-NEXT: [[TMP319:%.*]] = fptrunc double undef to float
	+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP317]], i32 0
	+; CHECK-NEXT: [[TMP13]] = insertelement <2 x float> [[TMP12]], float [[TMP319]], i32 1
	+; CHECK-NEXT: br label [[BB283]]
	+;
	bb279:
	br label %bb283

	bb283:
	%Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ]
	%Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ]
	%Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ]
	%Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ]
	br label %bb284

	bb284:
	%tmp7.i = fpext float %Av.sroa.3.0 to double
	%tmp8.i = fsub double %tmp7.i, undef
	%tmp9.i = fsub double %tmp8.i, undef
	%tmp17.i = fpext float %Av.sroa.8.0 to double
	%tmp19.i = fsub double %tmp17.i, undef
	%tmp20.i = fsub double %tmp19.i, undef
	br label %bb21.i

	bb21.i:
	br i1 undef, label %bb22.i, label %exit

	bb22.i:
	%tmp24.i = fadd double undef, %tmp9.i
	%tmp26.i = fadd double undef, %tmp20.i
	br label %bb32.i

	bb32.i:
	%xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
	%ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
	br i1 undef, label %bb32.i, label %bb21.i

	exit:
	%tmp303 = fpext float %Av.sroa.0.0 to double
	%tmp304 = fmul double %tmp303, undef
	%tmp305 = fadd double undef, %tmp304
	%tmp306 = fadd double %tmp305, undef
	%tmp307 = fptrunc double %tmp306 to float
	%tmp311 = fpext float %Av.sroa.5.0 to double
	%tmp312 = fmul double %tmp311, 0.000000e+00
	%tmp313 = fadd double undef, %tmp312
	%tmp314 = fadd double %tmp313, undef
	%tmp315 = fptrunc double %tmp314 to float
	%tmp317 = fptrunc double undef to float
	%tmp319 = fptrunc double undef to float
	br label %bb283
	}

	; Make sure that we probably handle constant folded vectorized trees. The
	; vectorizer starts at the type (%t2, %t3) and wil constant fold the tree.
	; The code that handles insertelement instructions must handle this.
	define <4 x double> @constant_folding() {
	+; CHECK-LABEL: @constant_folding(
	+; CHECK-NEXT: entry:
	+; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double 1.000000e+00, i32 1
	+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
	+; CHECK-NEXT: ret <4 x double> [[I2]]
	+;
	entry:
	%t0 = fadd double 1.000000e+00 , 0.000000e+00
	%t1 = fadd double 1.000000e+00 , 1.000000e+00
	%t2 = fmul double %t0, 1.000000e+00
	%i1 = insertelement <4 x double> undef, double %t2, i32 1
	%t3 = fmul double %t1, 1.000000e+00
	%i2 = insertelement <4 x double> %i1, double %t3, i32 0
	ret <4 x double> %i2
	}
	-
	-; CHECK-LABEL: @constant_folding
	-; CHECK: %[[V0:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 0
	-; CHECK: %[[V1:.+]] = insertelement <4 x double> undef, double %[[V0]], i32 1
	-; CHECK: %[[V2:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 1
	-; CHECK: %[[V3:.+]] = insertelement <4 x double> %[[V1]], double %[[V2]], i32 0
	-; CHECK: ret <4 x double> %[[V3]]
	Index: vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll (revision 328362)
	@@ -0,0 +1,77 @@
	+; XFAIL: *
	+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s
	+
	+; FIXME: Merge into backedge-id-bug
	+; Variant which has an issue with region construction
	+
	+define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
	+entry:
	+ %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
	+ %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
	+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
	+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
	+ %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
	+ br label %LOOP.HEADER
	+
	+LOOP.HEADER:
	+ %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
	+ call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
	+ %tmp12 = zext i32 %i to i64
	+ %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
	+ %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
	+ %tmp15 = extractelement <4 x i32> %tmp14, i64 0
	+ %tmp16 = and i32 %tmp15, 65535
	+ %tmp17 = icmp eq i32 %tmp16, 1
	+ br i1 %tmp17, label %bb18, label %bb62
	+
	+bb18:
	+ %tmp19 = extractelement <2 x i32> %tmp, i64 0
	+ %tmp22 = lshr i32 %tmp19, 16
	+ %tmp24 = urem i32 %tmp22, 52
	+ %tmp25 = mul nuw nsw i32 %tmp24, 52
	+ br label %INNER_LOOP
	+
	+INNER_LOOP:
	+ %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
	+ call void asm sideeffect "; inner loop body", ""() #0
	+ %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
	+ %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
	+ br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
	+
	+INNER_LOOP_BREAK:
	+ %tmp59 = extractelement <4 x i32> %tmp14, i64 2
	+ call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
	+ br label %END_ELSE_BLOCK
	+
	+bb62:
	+ %load13 = icmp ult i32 %tmp16, 271
	+ ;br i1 %load13, label %bb64, label %INCREMENT_I
	+ ; branching directly to the return avoids the bug
	+ br i1 %load13, label %RETURN, label %INCREMENT_I
	+
	+
	+bb64:
	+ call void asm sideeffect "s_nop 42", "~{memory}"() #0
	+ br label %RETURN
	+
	+INCREMENT_I:
	+ %inc.i = add i32 %i, 1
	+ call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
	+ br label %END_ELSE_BLOCK
	+
	+END_ELSE_BLOCK:
	+ %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
	+ call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
	+ %cmp.end.else.block = icmp eq i32 %i.final, -1
	+ br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
	+
	+RETURN:
	+ call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
	+ store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
	+ ret void
	+}
	+
	+declare i32 @llvm.amdgcn.workitem.id.x() #1
	+
	+attributes #0 = { convergent nounwind }
	+attributes #1 = { convergent nounwind readnone }
	Index: vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll (revision 328362)
	@@ -0,0 +1,163 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s
	+
	+; StructurizeCFG::orderNodes used an arbitrary and nonsensical sorting
	+; function which broke the basic backedge identification algorithm. It
	+; would use RPO order, but then do a weird partial sort by the loop
	+; depth assuming blocks are sorted by loop. However a block can appear
	+; in between blocks of a loop that is not part of a loop, breaking the
	+; assumption of the sort.
	+;
	+; The collectInfos must be done in RPO order. The actual
	+; structurization order I think is less important, but unless the loop
	+; headers are identified in RPO order, it finds the wrong set of back
	+; edges.
	+
	+define amdgpu_kernel void @loop_backedge_misidentified(i32 addrspace(1)* %arg0) #0 {
	+; CHECK-LABEL: @loop_backedge_misidentified(
	+; CHECK-NEXT: entry:
	+; CHECK-NEXT: [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
	+; CHECK-NEXT: [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
	+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
	+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
	+; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
	+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
	+; CHECK: LOOP.HEADER:
	+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[FLOW4:%.*]] ]
	+; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b
	+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64
	+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
	+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
	+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
	+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
	+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
	+; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TMP17]], true
	+; CHECK-NEXT: br i1 [[TMP0]], label [[BB62:%.*]], label [[FLOW:%.*]]
	+; CHECK: Flow2:
	+; CHECK-NEXT: br label [[FLOW]]
	+; CHECK: bb18:
	+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
	+; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
	+; CHECK-NEXT: [[TMP24:%.*]] = urem i32 [[TMP22]], 52
	+; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
	+; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
	+; CHECK: Flow3:
	+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
	+; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
	+; CHECK-NEXT: br i1 [[TMP2]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW4]]
	+; CHECK: INNER_LOOP:
	+; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
	+; CHECK-NEXT: call void asm sideeffect "
	+; CHECK-NEXT: [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
	+; CHECK-NEXT: [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
	+; CHECK-NEXT: br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
	+; CHECK: INNER_LOOP_BREAK:
	+; CHECK-NEXT: [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
	+; CHECK-NEXT: call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
	+; CHECK-NEXT: br label [[FLOW3:%.*]]
	+; CHECK: bb62:
	+; CHECK-NEXT: [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
	+; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[LOAD13]], true
	+; CHECK-NEXT: br i1 [[TMP3]], label [[INCREMENT_I:%.*]], label [[FLOW1:%.*]]
	+; CHECK: Flow1:
	+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I]] ], [ undef, [[BB62]] ]
	+; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
	+; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ]
	+; CHECK-NEXT: br i1 [[TMP6]], label [[BB64:%.*]], label [[FLOW2:%.*]]
	+; CHECK: bb64:
	+; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #0
	+; CHECK-NEXT: br label [[FLOW2]]
	+; CHECK: Flow:
	+; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP_HEADER]] ]
	+; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ false, [[LOOP_HEADER]] ]
	+; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[LOOP_HEADER]] ]
	+; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW3]]
	+; CHECK: INCREMENT_I:
	+; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1
	+; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336
	+; CHECK-NEXT: br label [[FLOW1]]
	+; CHECK: END_ELSE_BLOCK:
	+; CHECK-NEXT: [[I_FINAL:%.*]] = phi i32 [ [[TMP1]], [[FLOW3]] ]
	+; CHECK-NEXT: call void asm sideeffect "s_nop 0x1337
	+; CHECK-NEXT: [[CMP_END_ELSE_BLOCK:%.*]] = icmp eq i32 [[I_FINAL]], -1
	+; CHECK-NEXT: br label [[FLOW4]]
	+; CHECK: Flow4:
	+; CHECK-NEXT: [[TMP10]] = phi i32 [ [[I_FINAL]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW3]] ]
	+; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW3]] ]
	+; CHECK-NEXT: br i1 [[TMP11]], label [[RETURN:%.*]], label [[LOOP_HEADER]]
	+; CHECK: RETURN:
	+; CHECK-NEXT: call void asm sideeffect "s_nop 0x99
	+; CHECK-NEXT: store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
	+; CHECK-NEXT: ret void
	+;
	+entry:
	+ %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
	+ %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
	+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
	+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
	+ %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
	+ br label %LOOP.HEADER
	+
	+LOOP.HEADER:
	+ %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
	+ call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
	+ %tmp12 = zext i32 %i to i64
	+ %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
	+ %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
	+ %tmp15 = extractelement <4 x i32> %tmp14, i64 0
	+ %tmp16 = and i32 %tmp15, 65535
	+ %tmp17 = icmp eq i32 %tmp16, 1
	+ br i1 %tmp17, label %bb18, label %bb62
	+
	+bb18:
	+ %tmp19 = extractelement <2 x i32> %tmp, i64 0
	+ %tmp22 = lshr i32 %tmp19, 16
	+ %tmp24 = urem i32 %tmp22, 52
	+ %tmp25 = mul nuw nsw i32 %tmp24, 52
	+ br label %INNER_LOOP
	+
	+INNER_LOOP:
	+ %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
	+ call void asm sideeffect "; inner loop body", ""() #0
	+ %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
	+ %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
	+ br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
	+
	+INNER_LOOP_BREAK:
	+ %tmp59 = extractelement <4 x i32> %tmp14, i64 2
	+ call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
	+ br label %END_ELSE_BLOCK
	+
	+bb62:
	+ %load13 = icmp ult i32 %tmp16, 271
	+ br i1 %load13, label %bb64, label %INCREMENT_I
	+
	+bb64:
	+ call void asm sideeffect "s_nop 42", "~{memory}"() #0
	+ br label %RETURN
	+
	+INCREMENT_I:
	+ %inc.i = add i32 %i, 1
	+ call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
	+ br label %END_ELSE_BLOCK
	+
	+END_ELSE_BLOCK:
	+ %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
	+ call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
	+ %cmp.end.else.block = icmp eq i32 %i.final, -1
	+ br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
	+
	+RETURN:
	+ call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
	+ store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
	+ ret void
	+}
	+
	+; The same function, except break to return block goes directly to the
	+; return, which managed to hide the bug.
	+; FIXME: Merge variant from backedge-id-bug-xfail
	+
	+declare i32 @llvm.amdgcn.workitem.id.x() #1
	+
	+attributes #0 = { convergent nounwind }
	+attributes #1 = { convergent nounwind readnone }
	Index: vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg (nonexistent)
	+++ vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg (revision 328362)
	@@ -0,0 +1,2 @@
	+if not 'AMDGPU' in config.root.targets:
	+ config.unsupported = True
	Index: vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/nested-loop-order.ll
	===================================================================
	--- vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/nested-loop-order.ll (revision 328361)
	+++ vendor/llvm/dist-release_60/test/Transforms/StructurizeCFG/nested-loop-order.ll (revision 328362)
	@@ -1,68 +1,97 @@
	+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -structurizecfg %s -o - | FileCheck %s

	define void @main(float addrspace(1)* %out) {
	-
	-; CHECK: main_body:
	-; CHECK: br label %LOOP.outer
	+; CHECK-LABEL: @main(
	+; CHECK-NEXT: main_body:
	+; CHECK-NEXT: br label [[LOOP_OUTER:%.*]]
	+; CHECK: LOOP.outer:
	+; CHECK-NEXT: [[TEMP8_0_PH:%.*]] = phi float [ 0.000000e+00, [[MAIN_BODY:%.*]] ], [ [[TMP13:%.*]], [[FLOW3:%.*]] ]
	+; CHECK-NEXT: [[TEMP4_0_PH:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP12:%.*]], [[FLOW3]] ]
	+; CHECK-NEXT: br label [[LOOP:%.*]]
	+; CHECK: LOOP:
	+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP12]], [[FLOW:%.*]] ]
	+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ undef, [[LOOP_OUTER]] ], [ [[TMP13]], [[FLOW]] ]
	+; CHECK-NEXT: [[TEMP4_0:%.*]] = phi i32 [ [[TEMP4_0_PH]], [[LOOP_OUTER]] ], [ [[TMP15:%.*]], [[FLOW]] ]
	+; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TEMP4_0]], 1
	+; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], 3
	+; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP22]], true
	+; CHECK-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
	+; CHECK: Flow2:
	+; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TEMP8_0_PH]], [[IF29:%.*]] ], [ [[TMP9:%.*]], [[FLOW1:%.*]] ]
	+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP20]], [[IF29]] ], [ undef, [[FLOW1]] ]
	+; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP32:%.*]], [[IF29]] ], [ true, [[FLOW1]] ]
	+; CHECK-NEXT: br label [[FLOW]]
	+; CHECK: Flow3:
	+; CHECK-NEXT: br i1 [[TMP16:%.*]], label [[ENDLOOP:%.*]], label [[LOOP_OUTER]]
	+; CHECK: ENDLOOP:
	+; CHECK-NEXT: [[TEMP8_1:%.*]] = phi float [ [[TMP14:%.*]], [[FLOW3]] ]
	+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3
	+; CHECK-NEXT: [[DOT45:%.*]] = select i1 [[TMP23]], float 0.000000e+00, float 1.000000e+00
	+; CHECK-NEXT: store float [[DOT45]], float addrspace(1)* [[OUT:%.*]]
	+; CHECK-NEXT: ret void
	+; CHECK: ENDIF:
	+; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP20]], 1
	+; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP31]], true
	+; CHECK-NEXT: br i1 [[TMP6]], label [[ENDIF28:%.*]], label [[FLOW1]]
	+; CHECK: Flow1:
	+; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP20]], [[ENDIF28]] ], [ [[TMP0]], [[ENDIF]] ]
	+; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP35:%.*]], [[ENDIF28]] ], [ [[TMP1]], [[ENDIF]] ]
	+; CHECK-NEXT: [[TMP9]] = phi float [ [[TMP35]], [[ENDIF28]] ], [ [[TEMP8_0_PH]], [[ENDIF]] ]
	+; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ [[TMP36:%.*]], [[ENDIF28]] ], [ true, [[ENDIF]] ]
	+; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ false, [[ENDIF28]] ], [ true, [[ENDIF]] ]
	+; CHECK-NEXT: br i1 [[TMP11]], label [[IF29]], label [[FLOW2:%.*]]
	+; CHECK: IF29:
	+; CHECK-NEXT: [[TMP32]] = icmp sgt i32 [[TMP20]], 2
	+; CHECK-NEXT: br label [[FLOW2]]
	+; CHECK: Flow:
	+; CHECK-NEXT: [[TMP12]] = phi i32 [ [[TMP7]], [[FLOW2]] ], [ [[TMP0]], [[LOOP]] ]
	+; CHECK-NEXT: [[TMP13]] = phi float [ [[TMP8]], [[FLOW2]] ], [ [[TMP1]], [[LOOP]] ]
	+; CHECK-NEXT: [[TMP14]] = phi float [ [[TMP3]], [[FLOW2]] ], [ [[TEMP8_0_PH]], [[LOOP]] ]
	+; CHECK-NEXT: [[TMP15]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP]] ]
	+; CHECK-NEXT: [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW2]] ], [ true, [[LOOP]] ]
	+; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ true, [[LOOP]] ]
	+; CHECK-NEXT: br i1 [[TMP17]], label [[FLOW3]], label [[LOOP]]
	+; CHECK: ENDIF28:
	+; CHECK-NEXT: [[TMP35]] = fadd float [[TEMP8_0_PH]], 1.000000e+00
	+; CHECK-NEXT: [[TMP36]] = icmp sgt i32 [[TMP20]], 2
	+; CHECK-NEXT: br label [[FLOW1]]
	+;
	main_body:
	br label %LOOP.outer

	-; CHECK: LOOP.outer:
	-; CHECK: br label %LOOP
	LOOP.outer: ; preds = %ENDIF28, %main_body
	%temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
	%temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
	br label %LOOP

	-; CHECK: LOOP:
	-; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
	LOOP: ; preds = %IF29, %LOOP.outer
	%temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
	%tmp20 = add i32 %temp4.0, 1
	%tmp22 = icmp sgt i32 %tmp20, 3
	br i1 %tmp22, label %ENDLOOP, label %ENDIF

	-; CHECK: Flow3
	-; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
	-
	-; CHECK: ENDLOOP:
	-; CHECK: ret void
	ENDLOOP: ; preds = %ENDIF28, %IF29, %LOOP
	%temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
	%tmp23 = icmp eq i32 %tmp20, 3
	%.45 = select i1 %tmp23, float 0.000000e+00, float 1.000000e+00
	store float %.45, float addrspace(1)* %out
	ret void

	-; CHECK: ENDIF:
	-; CHECK: br i1 %tmp31, label %IF29, label %Flow1
	ENDIF: ; preds = %LOOP
	%tmp31 = icmp sgt i32 %tmp20, 1
	br i1 %tmp31, label %IF29, label %ENDIF28

	-; CHECK: Flow:
	-; CHECK: br i1 %{{[0-9]+}}, label %Flow2, label %LOOP
	-
	-; CHECK: IF29:
	-; CHECK: br label %Flow1
	IF29: ; preds = %ENDIF
	%tmp32 = icmp sgt i32 %tmp20, 2
	br i1 %tmp32, label %ENDLOOP, label %LOOP

	-; CHECK: Flow1:
	-; CHECK: br label %Flow
	-
	-; CHECK: Flow2:
	-; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
	-
	-; CHECK: ENDIF28:
	-; CHECK: br label %Flow3
	ENDIF28: ; preds = %ENDIF
	%tmp35 = fadd float %temp8.0.ph, 1.0
	%tmp36 = icmp sgt i32 %tmp20, 2
	br i1 %tmp36, label %ENDLOOP, label %LOOP.outer
	}

	attributes #0 = { "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
	attributes #1 = { nounwind readnone }
	attributes #2 = { readnone }
	Index: vendor/llvm/dist-release_60/test/tools/llvm-readobj/macho-needed-libs.test
	===================================================================
	--- vendor/llvm/dist-release_60/test/tools/llvm-readobj/macho-needed-libs.test (nonexistent)
	+++ vendor/llvm/dist-release_60/test/tools/llvm-readobj/macho-needed-libs.test (revision 328362)
	@@ -0,0 +1,26 @@
	+# RUN: yaml2obj %s -o %t.o
	+# RUN: llvm-readobj -needed-libs %t.o | FileCheck %s
	+
	+# CHECK: NeededLibraries [
	+# CHECK-NEXT: /usr/lib/libSystem.B.dylib
	+# CHECK-NEXT: ]
	+
	+!mach-o
	+FileHeader:
	+ magic: 0xFEEDFACF
	+ cputype: 0x01000007
	+ cpusubtype: 0x00000003
	+ filetype: 0x00000001
	+ ncmds: 1
	+ sizeofcmds: 56
	+ flags: 0x00002000
	+ reserved: 0x00000000
	+LoadCommands:
	+ - cmd: LC_LOAD_DYLIB
	+ cmdsize: 56
	+ dylib:
	+ name: 24
	+ timestamp: 2
	+ current_version: 81985536
	+ compatibility_version: 65536
	+ PayloadString: /usr/lib/libSystem.B.dylib
	Index: vendor/llvm/dist-release_60/tools/llvm-readobj/MachODumper.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/tools/llvm-readobj/MachODumper.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/tools/llvm-readobj/MachODumper.cpp (revision 328362)
	@@ -1,844 +1,874 @@
	//===-- MachODump.cpp - Object file dumping utility for llvm --------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the MachO-specific dumper for llvm-readobj.
	//
	//===----------------------------------------------------------------------===//

	#include "Error.h"
	#include "ObjDumper.h"
	#include "StackMapPrinter.h"
	#include "llvm-readobj.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/Object/MachO.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ScopedPrinter.h"

	using namespace llvm;
	using namespace object;

	namespace {

	class MachODumper : public ObjDumper {
	public:
	MachODumper(const MachOObjectFile *Obj, ScopedPrinter &Writer)
	: ObjDumper(Writer), Obj(Obj) {}

	void printFileHeaders() override;
	void printSections() override;
	void printRelocations() override;
	void printSymbols() override;
	void printDynamicSymbols() override;
	void printUnwindInfo() override;
	void printStackMap() const override;

	+ void printNeededLibraries() override;
	+
	// MachO-specific.
	void printMachODataInCode() override;
	void printMachOVersionMin() override;
	void printMachODysymtab() override;
	void printMachOSegment() override;
	void printMachOIndirectSymbols() override;
	void printMachOLinkerOptions () override;

	private:
	template<class MachHeader>
	void printFileHeaders(const MachHeader &Header);

	void printSymbol(const SymbolRef &Symbol);

	void printRelocation(const RelocationRef &Reloc);

	void printRelocation(const MachOObjectFile *Obj, const RelocationRef &Reloc);

	void printSections(const MachOObjectFile *Obj);

	const MachOObjectFile *Obj;
	};

	} // namespace


	namespace llvm {

	std::error_code createMachODumper(const object::ObjectFile *Obj,
	ScopedPrinter &Writer,
	std::unique_ptr<ObjDumper> &Result) {
	const MachOObjectFile *MachOObj = dyn_cast<MachOObjectFile>(Obj);
	if (!MachOObj)
	return readobj_error::unsupported_obj_file_format;

	Result.reset(new MachODumper(MachOObj, Writer));
	return readobj_error::success;
	}

	} // namespace llvm

	static const EnumEntry<uint32_t> MachOMagics[] = {
	{ "Magic", MachO::MH_MAGIC },
	{ "Cigam", MachO::MH_CIGAM },
	{ "Magic64", MachO::MH_MAGIC_64 },
	{ "Cigam64", MachO::MH_CIGAM_64 },
	{ "FatMagic", MachO::FAT_MAGIC },
	{ "FatCigam", MachO::FAT_CIGAM },
	};

	static const EnumEntry<uint32_t> MachOHeaderFileTypes[] = {
	{ "Relocatable", MachO::MH_OBJECT },
	{ "Executable", MachO::MH_EXECUTE },
	{ "FixedVMLibrary", MachO::MH_FVMLIB },
	{ "Core", MachO::MH_CORE },
	{ "PreloadedExecutable", MachO::MH_PRELOAD },
	{ "DynamicLibrary", MachO::MH_DYLIB },
	{ "DynamicLinker", MachO::MH_DYLINKER },
	{ "Bundle", MachO::MH_BUNDLE },
	{ "DynamicLibraryStub", MachO::MH_DYLIB_STUB },
	{ "DWARFSymbol", MachO::MH_DSYM },
	{ "KextBundle", MachO::MH_KEXT_BUNDLE },
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuTypes[] = {
	{ "Any" , static_cast<uint32_t>(MachO::CPU_TYPE_ANY) },
	{ "X86" , MachO::CPU_TYPE_X86 },
	{ "X86-64" , MachO::CPU_TYPE_X86_64 },
	{ "Mc98000" , MachO::CPU_TYPE_MC98000 },
	{ "Arm" , MachO::CPU_TYPE_ARM },
	{ "Arm64" , MachO::CPU_TYPE_ARM64 },
	{ "Sparc" , MachO::CPU_TYPE_SPARC },
	{ "PowerPC" , MachO::CPU_TYPE_POWERPC },
	{ "PowerPC64" , MachO::CPU_TYPE_POWERPC64 },
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX86[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_I386_ALL),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_386),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_486),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_486SX),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_586),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTPRO),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTII_M3),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTII_M5),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_CELERON),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_CELERON_MOBILE),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_3),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_3_M),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_3_XEON),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_M),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_4),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_4_M),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ITANIUM),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ITANIUM_2),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_XEON),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_XEON_MP),
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX64[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_64_ALL),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_ARCH1),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_64_H),
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_ALL),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V4T),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V6),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V5),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V5TEJ),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_XSCALE),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7S),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7K),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V6M),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7M),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7EM),
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM64[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM64_ALL),
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesSPARC[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_SPARC_ALL),
	};

	static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesPPC[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_ALL),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_601),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_602),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_603),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_603e),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_603ev),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_604),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_604e),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_620),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_750),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_7400),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_7450),
	LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_970),
	};

	static const EnumEntry<uint32_t> MachOHeaderFlags[] = {
	LLVM_READOBJ_ENUM_ENT(MachO, MH_NOUNDEFS),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_INCRLINK),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_DYLDLINK),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_BINDATLOAD),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_PREBOUND),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_SPLIT_SEGS),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_LAZY_INIT),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_TWOLEVEL),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_FORCE_FLAT),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_NOMULTIDEFS),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_NOFIXPREBINDING),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_PREBINDABLE),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_ALLMODSBOUND),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_SUBSECTIONS_VIA_SYMBOLS),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_CANONICAL),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_WEAK_DEFINES),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_BINDS_TO_WEAK),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_ALLOW_STACK_EXECUTION),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_ROOT_SAFE),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_SETUID_SAFE),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_NO_REEXPORTED_DYLIBS),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_PIE),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_DEAD_STRIPPABLE_DYLIB),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_HAS_TLV_DESCRIPTORS),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_NO_HEAP_EXECUTION),
	LLVM_READOBJ_ENUM_ENT(MachO, MH_APP_EXTENSION_SAFE),
	};

	static const EnumEntry<unsigned> MachOSectionAttributes[] = {
	{ "LocReloc" , 1 << 0 /*S_ATTR_LOC_RELOC */ },
	{ "ExtReloc" , 1 << 1 /*S_ATTR_EXT_RELOC */ },
	{ "SomeInstructions" , 1 << 2 /*S_ATTR_SOME_INSTRUCTIONS */ },
	{ "Debug" , 1 << 17 /*S_ATTR_DEBUG */ },
	{ "SelfModifyingCode", 1 << 18 /*S_ATTR_SELF_MODIFYING_CODE*/ },
	{ "LiveSupport" , 1 << 19 /*S_ATTR_LIVE_SUPPORT */ },
	{ "NoDeadStrip" , 1 << 20 /*S_ATTR_NO_DEAD_STRIP */ },
	{ "StripStaticSyms" , 1 << 21 /*S_ATTR_STRIP_STATIC_SYMS */ },
	{ "NoTOC" , 1 << 22 /*S_ATTR_NO_TOC */ },
	{ "PureInstructions" , 1 << 23 /*S_ATTR_PURE_INSTRUCTIONS */ },
	};

	static const EnumEntry<unsigned> MachOSymbolRefTypes[] = {
	{ "UndefinedNonLazy", 0 },
	{ "ReferenceFlagUndefinedLazy", 1 },
	{ "ReferenceFlagDefined", 2 },
	{ "ReferenceFlagPrivateDefined", 3 },
	{ "ReferenceFlagPrivateUndefinedNonLazy", 4 },
	{ "ReferenceFlagPrivateUndefinedLazy", 5 }
	};

	static const EnumEntry<unsigned> MachOSymbolFlags[] = {
	{ "ReferencedDynamically", 0x10 },
	{ "NoDeadStrip", 0x20 },
	{ "WeakRef", 0x40 },
	{ "WeakDef", 0x80 },
	{ "AltEntry", 0x200 },
	};

	static const EnumEntry<unsigned> MachOSymbolTypes[] = {
	{ "Undef", 0x0 },
	{ "Abs", 0x2 },
	{ "Indirect", 0xA },
	{ "PreboundUndef", 0xC },
	{ "Section", 0xE }
	};

	namespace {
	struct MachOSection {
	ArrayRef<char> Name;
	ArrayRef<char> SegmentName;
	uint64_t Address;
	uint64_t Size;
	uint32_t Offset;
	uint32_t Alignment;
	uint32_t RelocationTableOffset;
	uint32_t NumRelocationTableEntries;
	uint32_t Flags;
	uint32_t Reserved1;
	uint32_t Reserved2;
	uint32_t Reserved3;
	};

	struct MachOSegment {
	std::string CmdName;
	std::string SegName;
	uint64_t cmdsize;
	uint64_t vmaddr;
	uint64_t vmsize;
	uint64_t fileoff;
	uint64_t filesize;
	uint32_t maxprot;
	uint32_t initprot;
	uint32_t nsects;
	uint32_t flags;
	};

	struct MachOSymbol {
	uint32_t StringIndex;
	uint8_t Type;
	uint8_t SectionIndex;
	uint16_t Flags;
	uint64_t Value;
	};
	}

	static std::string getMask(uint32_t prot)
	{
	// TODO (davide): This always assumes prot is valid.
	// Catch mistakes and report if needed.
	std::string Prot;
	Prot = "";
	Prot += (prot & MachO::VM_PROT_READ) ? "r" : "-";
	Prot += (prot & MachO::VM_PROT_WRITE) ? "w" : "-";
	Prot += (prot & MachO::VM_PROT_EXECUTE) ? "x" : "-";
	return Prot;
	}

	static void getSection(const MachOObjectFile *Obj,
	DataRefImpl Sec,
	MachOSection &Section) {
	if (!Obj->is64Bit()) {
	MachO::section Sect = Obj->getSection(Sec);
	Section.Address = Sect.addr;
	Section.Size = Sect.size;
	Section.Offset = Sect.offset;
	Section.Alignment = Sect.align;
	Section.RelocationTableOffset = Sect.reloff;
	Section.NumRelocationTableEntries = Sect.nreloc;
	Section.Flags = Sect.flags;
	Section.Reserved1 = Sect.reserved1;
	Section.Reserved2 = Sect.reserved2;
	return;
	}
	MachO::section_64 Sect = Obj->getSection64(Sec);
	Section.Address = Sect.addr;
	Section.Size = Sect.size;
	Section.Offset = Sect.offset;
	Section.Alignment = Sect.align;
	Section.RelocationTableOffset = Sect.reloff;
	Section.NumRelocationTableEntries = Sect.nreloc;
	Section.Flags = Sect.flags;
	Section.Reserved1 = Sect.reserved1;
	Section.Reserved2 = Sect.reserved2;
	Section.Reserved3 = Sect.reserved3;
	}

	static void getSegment(const MachOObjectFile *Obj,
	const MachOObjectFile::LoadCommandInfo &L,
	MachOSegment &Segment) {
	if (!Obj->is64Bit()) {
	MachO::segment_command SC = Obj->getSegmentLoadCommand(L);
	Segment.CmdName = "LC_SEGMENT";
	Segment.SegName = SC.segname;
	Segment.cmdsize = SC.cmdsize;
	Segment.vmaddr = SC.vmaddr;
	Segment.vmsize = SC.vmsize;
	Segment.fileoff = SC.fileoff;
	Segment.filesize = SC.filesize;
	Segment.maxprot = SC.maxprot;
	Segment.initprot = SC.initprot;
	Segment.nsects = SC.nsects;
	Segment.flags = SC.flags;
	return;
	}
	MachO::segment_command_64 SC = Obj->getSegment64LoadCommand(L);
	Segment.CmdName = "LC_SEGMENT_64";
	Segment.SegName = SC.segname;
	Segment.cmdsize = SC.cmdsize;
	Segment.vmaddr = SC.vmaddr;
	Segment.vmsize = SC.vmsize;
	Segment.fileoff = SC.fileoff;
	Segment.filesize = SC.filesize;
	Segment.maxprot = SC.maxprot;
	Segment.initprot = SC.initprot;
	Segment.nsects = SC.nsects;
	Segment.flags = SC.flags;
	}

	static void getSymbol(const MachOObjectFile *Obj,
	DataRefImpl DRI,
	MachOSymbol &Symbol) {
	if (!Obj->is64Bit()) {
	MachO::nlist Entry = Obj->getSymbolTableEntry(DRI);
	Symbol.StringIndex = Entry.n_strx;
	Symbol.Type = Entry.n_type;
	Symbol.SectionIndex = Entry.n_sect;
	Symbol.Flags = Entry.n_desc;
	Symbol.Value = Entry.n_value;
	return;
	}
	MachO::nlist_64 Entry = Obj->getSymbol64TableEntry(DRI);
	Symbol.StringIndex = Entry.n_strx;
	Symbol.Type = Entry.n_type;
	Symbol.SectionIndex = Entry.n_sect;
	Symbol.Flags = Entry.n_desc;
	Symbol.Value = Entry.n_value;
	}

	void MachODumper::printFileHeaders() {
	DictScope H(W, "MachHeader");
	if (!Obj->is64Bit()) {
	printFileHeaders(Obj->getHeader());
	} else {
	printFileHeaders(Obj->getHeader64());
	W.printHex("Reserved", Obj->getHeader64().reserved);
	}
	}

	template<class MachHeader>
	void MachODumper::printFileHeaders(const MachHeader &Header) {
	W.printEnum("Magic", Header.magic, makeArrayRef(MachOMagics));
	W.printEnum("CpuType", Header.cputype, makeArrayRef(MachOHeaderCpuTypes));
	uint32_t subtype = Header.cpusubtype & ~MachO::CPU_SUBTYPE_MASK;
	switch (Header.cputype) {
	case MachO::CPU_TYPE_X86:
	W.printEnum("CpuSubType", subtype, makeArrayRef(MachOHeaderCpuSubtypesX86));
	break;
	case MachO::CPU_TYPE_X86_64:
	W.printEnum("CpuSubType", subtype, makeArrayRef(MachOHeaderCpuSubtypesX64));
	break;
	case MachO::CPU_TYPE_ARM:
	W.printEnum("CpuSubType", subtype, makeArrayRef(MachOHeaderCpuSubtypesARM));
	break;
	case MachO::CPU_TYPE_POWERPC:
	W.printEnum("CpuSubType", subtype, makeArrayRef(MachOHeaderCpuSubtypesPPC));
	break;
	case MachO::CPU_TYPE_SPARC:
	W.printEnum("CpuSubType", subtype, makeArrayRef(MachOHeaderCpuSubtypesSPARC));
	break;
	case MachO::CPU_TYPE_ARM64:
	W.printEnum("CpuSubType", subtype, makeArrayRef(MachOHeaderCpuSubtypesARM64));
	break;
	case MachO::CPU_TYPE_POWERPC64:
	default:
	W.printHex("CpuSubtype", subtype);
	}
	W.printEnum("FileType", Header.filetype, makeArrayRef(MachOHeaderFileTypes));
	W.printNumber("NumOfLoadCommands", Header.ncmds);
	W.printNumber("SizeOfLoadCommands", Header.sizeofcmds);
	W.printFlags("Flags", Header.flags, makeArrayRef(MachOHeaderFlags));
	}

	void MachODumper::printSections() {
	return printSections(Obj);
	}

	void MachODumper::printSections(const MachOObjectFile *Obj) {
	ListScope Group(W, "Sections");

	int SectionIndex = -1;
	for (const SectionRef &Section : Obj->sections()) {
	++SectionIndex;

	MachOSection MOSection;
	getSection(Obj, Section.getRawDataRefImpl(), MOSection);
	DataRefImpl DR = Section.getRawDataRefImpl();

	StringRef Name;
	error(Section.getName(Name));

	ArrayRef<char> RawName = Obj->getSectionRawName(DR);
	StringRef SegmentName = Obj->getSectionFinalSegmentName(DR);
	ArrayRef<char> RawSegmentName = Obj->getSectionRawFinalSegmentName(DR);

	DictScope SectionD(W, "Section");
	W.printNumber("Index", SectionIndex);
	W.printBinary("Name", Name, RawName);
	W.printBinary("Segment", SegmentName, RawSegmentName);
	W.printHex("Address", MOSection.Address);
	W.printHex("Size", MOSection.Size);
	W.printNumber("Offset", MOSection.Offset);
	W.printNumber("Alignment", MOSection.Alignment);
	W.printHex("RelocationOffset", MOSection.RelocationTableOffset);
	W.printNumber("RelocationCount", MOSection.NumRelocationTableEntries);
	W.printEnum("Type", MOSection.Flags & 0xFF,
	makeArrayRef(MachOSectionAttributes));
	W.printFlags("Attributes", MOSection.Flags >> 8,
	makeArrayRef(MachOSectionAttributes));
	W.printHex("Reserved1", MOSection.Reserved1);
	W.printHex("Reserved2", MOSection.Reserved2);
	if (Obj->is64Bit())
	W.printHex("Reserved3", MOSection.Reserved3);

	if (opts::SectionRelocations) {
	ListScope D(W, "Relocations");
	for (const RelocationRef &Reloc : Section.relocations())
	printRelocation(Reloc);
	}

	if (opts::SectionSymbols) {
	ListScope D(W, "Symbols");
	for (const SymbolRef &Symbol : Obj->symbols()) {
	if (!Section.containsSymbol(Symbol))
	continue;

	printSymbol(Symbol);
	}
	}

	if (opts::SectionData) {
	bool IsBSS = Section.isBSS();
	if (!IsBSS) {
	StringRef Data;
	error(Section.getContents(Data));

	W.printBinaryBlock("SectionData", Data);
	}
	}
	}
	}

	void MachODumper::printRelocations() {
	ListScope D(W, "Relocations");

	std::error_code EC;
	for (const SectionRef &Section : Obj->sections()) {
	StringRef Name;
	error(Section.getName(Name));

	bool PrintedGroup = false;
	for (const RelocationRef &Reloc : Section.relocations()) {
	if (!PrintedGroup) {
	W.startLine() << "Section " << Name << " {\n";
	W.indent();
	PrintedGroup = true;
	}

	printRelocation(Reloc);
	}

	if (PrintedGroup) {
	W.unindent();
	W.startLine() << "}\n";
	}
	}
	}

	void MachODumper::printRelocation(const RelocationRef &Reloc) {
	return printRelocation(Obj, Reloc);
	}

	void MachODumper::printRelocation(const MachOObjectFile *Obj,
	const RelocationRef &Reloc) {
	uint64_t Offset = Reloc.getOffset();
	SmallString<32> RelocName;
	Reloc.getTypeName(RelocName);

	DataRefImpl DR = Reloc.getRawDataRefImpl();
	MachO::any_relocation_info RE = Obj->getRelocation(DR);
	bool IsScattered = Obj->isRelocationScattered(RE);
	bool IsExtern = !IsScattered && Obj->getPlainRelocationExternal(RE);

	StringRef TargetName;
	if (IsExtern) {
	symbol_iterator Symbol = Reloc.getSymbol();
	if (Symbol != Obj->symbol_end()) {
	Expected<StringRef> TargetNameOrErr = Symbol->getName();
	if (!TargetNameOrErr)
	error(errorToErrorCode(TargetNameOrErr.takeError()));
	TargetName = *TargetNameOrErr;
	}
	} else if (!IsScattered) {
	section_iterator SecI = Obj->getRelocationSection(DR);
	if (SecI != Obj->section_end()) {
	error(SecI->getName(TargetName));
	}
	}
	if (TargetName.empty())
	TargetName = "-";

	if (opts::ExpandRelocs) {
	DictScope Group(W, "Relocation");
	W.printHex("Offset", Offset);
	W.printNumber("PCRel", Obj->getAnyRelocationPCRel(RE));
	W.printNumber("Length", Obj->getAnyRelocationLength(RE));
	W.printNumber("Type", RelocName, Obj->getAnyRelocationType(RE));
	if (IsScattered) {
	W.printHex("Value", Obj->getScatteredRelocationValue(RE));
	} else {
	const char *Kind = IsExtern ? "Symbol" : "Section";
	W.printNumber(Kind, TargetName, Obj->getPlainRelocationSymbolNum(RE));
	}
	} else {
	SmallString<32> SymbolNameOrOffset("0x");
	if (IsScattered) {
	// Scattered relocations don't really have an associated symbol for some
	// reason, even if one exists in the symtab at the correct address.
	SymbolNameOrOffset += utohexstr(Obj->getScatteredRelocationValue(RE));
	} else {
	SymbolNameOrOffset = TargetName;
	}

	raw_ostream& OS = W.startLine();
	OS << W.hex(Offset)
	<< " " << Obj->getAnyRelocationPCRel(RE)
	<< " " << Obj->getAnyRelocationLength(RE);
	if (IsScattered)
	OS << " n/a";
	else
	OS << " " << Obj->getPlainRelocationExternal(RE);
	OS << " " << RelocName
	<< " " << IsScattered
	<< " " << SymbolNameOrOffset
	<< "\n";
	}
	}

	void MachODumper::printSymbols() {
	ListScope Group(W, "Symbols");

	for (const SymbolRef &Symbol : Obj->symbols()) {
	printSymbol(Symbol);
	}
	}

	void MachODumper::printDynamicSymbols() {
	ListScope Group(W, "DynamicSymbols");
	}

	void MachODumper::printSymbol(const SymbolRef &Symbol) {
	StringRef SymbolName;
	Expected<StringRef> SymbolNameOrErr = Symbol.getName();
	if (!SymbolNameOrErr) {
	// TODO: Actually report errors helpfully.
	consumeError(SymbolNameOrErr.takeError());
	} else
	SymbolName = *SymbolNameOrErr;

	MachOSymbol MOSymbol;
	getSymbol(Obj, Symbol.getRawDataRefImpl(), MOSymbol);

	StringRef SectionName = "";
	Expected<section_iterator> SecIOrErr = Symbol.getSection();
	error(errorToErrorCode(SecIOrErr.takeError()));
	section_iterator SecI = *SecIOrErr;
	if (SecI != Obj->section_end())
	error(SecI->getName(SectionName));

	DictScope D(W, "Symbol");
	W.printNumber("Name", SymbolName, MOSymbol.StringIndex);
	if (MOSymbol.Type & MachO::N_STAB) {
	W.printHex("Type", "SymDebugTable", MOSymbol.Type);
	} else {
	if (MOSymbol.Type & MachO::N_PEXT)
	W.startLine() << "PrivateExtern\n";
	if (MOSymbol.Type & MachO::N_EXT)
	W.startLine() << "Extern\n";
	W.printEnum("Type", uint8_t(MOSymbol.Type & MachO::N_TYPE),
	makeArrayRef(MachOSymbolTypes));
	}
	W.printHex("Section", SectionName, MOSymbol.SectionIndex);
	W.printEnum("RefType", static_cast<uint16_t>(MOSymbol.Flags & 0xF),
	makeArrayRef(MachOSymbolRefTypes));
	W.printFlags("Flags", static_cast<uint16_t>(MOSymbol.Flags & ~0xF),
	makeArrayRef(MachOSymbolFlags));
	W.printHex("Value", MOSymbol.Value);
	}

	void MachODumper::printUnwindInfo() {
	W.startLine() << "UnwindInfo not implemented.\n";
	}

	void MachODumper::printStackMap() const {
	object::SectionRef StackMapSection;
	for (auto Sec : Obj->sections()) {
	StringRef Name;
	Sec.getName(Name);
	if (Name == "__llvm_stackmaps") {
	StackMapSection = Sec;
	break;
	}
	}

	if (StackMapSection == object::SectionRef())
	return;

	StringRef StackMapContents;
	StackMapSection.getContents(StackMapContents);
	ArrayRef<uint8_t> StackMapContentsArray(
	reinterpret_cast<const uint8_t*>(StackMapContents.data()),
	StackMapContents.size());

	if (Obj->isLittleEndian())
	prettyPrintStackMap(
	llvm::outs(),
	StackMapV2Parser<support::little>(StackMapContentsArray));
	else
	prettyPrintStackMap(llvm::outs(),
	StackMapV2Parser<support::big>(StackMapContentsArray));
	+}
	+
	+void MachODumper::printNeededLibraries() {
	+ ListScope D(W, "NeededLibraries");
	+
	+ using LibsTy = std::vector<StringRef>;
	+ LibsTy Libs;
	+
	+ for (const auto &Command : Obj->load_commands()) {
	+ if (Command.C.cmd == MachO::LC_LOAD_DYLIB ||
	+ Command.C.cmd == MachO::LC_ID_DYLIB ||
	+ Command.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
	+ Command.C.cmd == MachO::LC_REEXPORT_DYLIB ||
	+ Command.C.cmd == MachO::LC_LAZY_LOAD_DYLIB ||
	+ Command.C.cmd == MachO::LC_LOAD_UPWARD_DYLIB) {
	+ MachO::dylib_command Dl = Obj->getDylibIDLoadCommand(Command);
	+ if (Dl.dylib.name < Dl.cmdsize) {
	+ auto *P = static_cast<const char*>(Command.Ptr) + Dl.dylib.name;
	+ Libs.push_back(P);
	+ }
	+ }
	+ }
	+
	+ std::stable_sort(Libs.begin(), Libs.end());
	+
	+ for (const auto &L : Libs) {
	+ outs() << " " << L << "\n";
	+ }
	}

	void MachODumper::printMachODataInCode() {
	for (const auto &Load : Obj->load_commands()) {
	if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
	MachO::linkedit_data_command LLC = Obj->getLinkeditDataLoadCommand(Load);
	DictScope Group(W, "DataInCode");
	W.printNumber("Data offset", LLC.dataoff);
	W.printNumber("Data size", LLC.datasize);
	ListScope D(W, "Data entries");
	unsigned NumRegions = LLC.datasize / sizeof(MachO::data_in_code_entry);
	for (unsigned i = 0; i < NumRegions; ++i) {
	MachO::data_in_code_entry DICE = Obj->getDataInCodeTableEntry(
	LLC.dataoff, i);
	DictScope Group(W, "Entry");
	W.printNumber("Index", i);
	W.printNumber("Offset", DICE.offset);
	W.printNumber("Length", DICE.length);
	W.printNumber("Kind", DICE.kind);
	}
	}
	}
	}

	void MachODumper::printMachOVersionMin() {
	for (const auto &Load : Obj->load_commands()) {
	StringRef Cmd;
	switch (Load.C.cmd) {
	case MachO::LC_VERSION_MIN_MACOSX:
	Cmd = "LC_VERSION_MIN_MACOSX";
	break;
	case MachO::LC_VERSION_MIN_IPHONEOS:
	Cmd = "LC_VERSION_MIN_IPHONEOS";
	break;
	case MachO::LC_VERSION_MIN_TVOS:
	Cmd = "LC_VERSION_MIN_TVOS";
	break;
	case MachO::LC_VERSION_MIN_WATCHOS:
	Cmd = "LC_VERSION_MIN_WATCHOS";
	break;
	case MachO::LC_BUILD_VERSION:
	Cmd = "LC_BUILD_VERSION";
	break;
	default:
	continue;
	}

	DictScope Group(W, "MinVersion");
	// Handle LC_BUILD_VERSION.
	if (Load.C.cmd == MachO::LC_BUILD_VERSION) {
	MachO::build_version_command BVC = Obj->getBuildVersionLoadCommand(Load);
	W.printString("Cmd", Cmd);
	W.printNumber("Size", BVC.cmdsize);
	W.printString("Platform",
	MachOObjectFile::getBuildPlatform(BVC.platform));
	W.printString("Version", MachOObjectFile::getVersionString(BVC.minos));
	if (BVC.sdk)
	W.printString("SDK", MachOObjectFile::getVersionString(BVC.sdk));
	else
	W.printString("SDK", StringRef("n/a"));
	continue;
	}

	MachO::version_min_command VMC = Obj->getVersionMinLoadCommand(Load);
	W.printString("Cmd", Cmd);
	W.printNumber("Size", VMC.cmdsize);
	SmallString<32> Version;
	Version = utostr(MachOObjectFile::getVersionMinMajor(VMC, false)) + "." +
	utostr(MachOObjectFile::getVersionMinMinor(VMC, false));
	uint32_t Update = MachOObjectFile::getVersionMinUpdate(VMC, false);
	if (Update != 0)
	Version += "." + utostr(MachOObjectFile::getVersionMinUpdate(VMC, false));
	W.printString("Version", Version);
	SmallString<32> SDK;
	if (VMC.sdk == 0)
	SDK = "n/a";
	else {
	SDK = utostr(MachOObjectFile::getVersionMinMajor(VMC, true)) + "." +
	utostr(MachOObjectFile::getVersionMinMinor(VMC, true));
	uint32_t Update = MachOObjectFile::getVersionMinUpdate(VMC, true);
	if (Update != 0)
	SDK += "." + utostr(MachOObjectFile::getVersionMinUpdate(VMC, true));
	}
	W.printString("SDK", SDK);
	}
	}

	void MachODumper::printMachODysymtab() {
	for (const auto &Load : Obj->load_commands()) {
	if (Load.C.cmd == MachO::LC_DYSYMTAB) {
	MachO::dysymtab_command DLC = Obj->getDysymtabLoadCommand();
	DictScope Group(W, "Dysymtab");
	W.printNumber("ilocalsym", DLC.ilocalsym);
	W.printNumber("nlocalsym", DLC.nlocalsym);
	W.printNumber("iextdefsym", DLC.iextdefsym);
	W.printNumber("nextdefsym", DLC.nextdefsym);
	W.printNumber("iundefsym", DLC.iundefsym);
	W.printNumber("nundefsym", DLC.nundefsym);
	W.printNumber("tocoff", DLC.tocoff);
	W.printNumber("ntoc", DLC.ntoc);
	W.printNumber("modtaboff", DLC.modtaboff);
	W.printNumber("nmodtab", DLC.nmodtab);
	W.printNumber("extrefsymoff", DLC.extrefsymoff);
	W.printNumber("nextrefsyms", DLC.nextrefsyms);
	W.printNumber("indirectsymoff", DLC.indirectsymoff);
	W.printNumber("nindirectsyms", DLC.nindirectsyms);
	W.printNumber("extreloff", DLC.extreloff);
	W.printNumber("nextrel", DLC.nextrel);
	W.printNumber("locreloff", DLC.locreloff);
	W.printNumber("nlocrel", DLC.nlocrel);
	}
	}
	}

	void MachODumper::printMachOSegment() {
	for (const auto &Load : Obj->load_commands()) {
	if (Load.C.cmd == MachO::LC_SEGMENT || Load.C.cmd == MachO::LC_SEGMENT_64) {
	MachOSegment MOSegment;
	getSegment(Obj, Load, MOSegment);
	DictScope Group(W, "Segment");
	W.printString("Cmd", MOSegment.CmdName);
	W.printString("Name", MOSegment.SegName);
	W.printNumber("Size", MOSegment.cmdsize);
	W.printHex("vmaddr", MOSegment.vmaddr);
	W.printHex("vmsize", MOSegment.vmsize);
	W.printNumber("fileoff", MOSegment.fileoff);
	W.printNumber("filesize", MOSegment.filesize);
	W.printString("maxprot", getMask(MOSegment.maxprot));
	W.printString("initprot", getMask(MOSegment.initprot));
	W.printNumber("nsects", MOSegment.nsects);
	W.printHex("flags", MOSegment.flags);
	}
	}
	}

	void MachODumper::printMachOIndirectSymbols() {
	for (const auto &Load : Obj->load_commands()) {
	if (Load.C.cmd == MachO::LC_DYSYMTAB) {
	MachO::dysymtab_command DLC = Obj->getDysymtabLoadCommand();
	DictScope Group(W, "Indirect Symbols");
	W.printNumber("Number", DLC.nindirectsyms);
	ListScope D(W, "Symbols");
	for (unsigned i = 0; i < DLC.nindirectsyms; ++i) {
	DictScope Group(W, "Entry");
	W.printNumber("Entry Index", i);
	W.printHex("Symbol Index", Obj->getIndirectSymbolTableEntry(DLC, i));
	}
	}
	}
	}

	void MachODumper::printMachOLinkerOptions() {
	for (const auto &Load : Obj->load_commands()) {
	if (Load.C.cmd == MachO::LC_LINKER_OPTION) {
	MachO::linker_option_command LOLC = Obj->getLinkerOptionLoadCommand(Load);
	DictScope Group(W, "Linker Options");
	W.printNumber("Size", LOLC.cmdsize);
	ListScope D(W, "Strings");
	uint64_t DataSize = LOLC.cmdsize - sizeof(MachO::linker_option_command);
	const char *P = Load.Ptr + sizeof(MachO::linker_option_command);
	StringRef Data(P, DataSize);
	for (unsigned i = 0; i < LOLC.count; ++i) {
	std::pair<StringRef,StringRef> Split = Data.split('\0');
	W.printString("Value", Split.first);
	Data = Split.second;
	}
	}
	}
	}
	Index: vendor/llvm/dist-release_60/unittests/IR/DominatorTreeBatchUpdatesTest.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/unittests/IR/DominatorTreeBatchUpdatesTest.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/unittests/IR/DominatorTreeBatchUpdatesTest.cpp (revision 328362)
	@@ -1,260 +1,355 @@
	//===- llvm/unittests/IR/DominatorTreeBatchUpdatesTest.cpp ----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include <random>
	#include "CFGBuilder.h"
	#include "gtest/gtest.h"
	#include "llvm/Analysis/PostDominators.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/Support/GenericDomTreeConstruction.h"

	#define DEBUG_TYPE "batch-update-tests"

	using namespace llvm;

	namespace {
	// Short names for the two CFGBuilder action kinds.
	const auto CFGInsert = CFGBuilder::ActionKind::Insert;
	const auto CFGDelete = CFGBuilder::ActionKind::Delete;

	// Post-dominator tree that is calculated for the function on construction.
	struct PostDomTree : PostDomTreeBase<BasicBlock> {
	  PostDomTree(Function &F) { recalculate(F); }
	};

	using DomUpdate = DominatorTree::UpdateType;
	static_assert(
	    std::is_same<DomUpdate, PostDomTree::UpdateType>::value,
	    "Trees differing only in IsPostDom should have the same update types");
	using DomSNCA = DomTreeBuilder::SemiNCAInfo<DomTreeBuilder::BBDomTree>;
	using PostDomSNCA = DomTreeBuilder::SemiNCAInfo<DomTreeBuilder::BBPostDomTree>;
	// Short names for the two dominator-tree update kinds.
	const auto Insert = DominatorTree::Insert;
	const auto Delete = DominatorTree::Delete;

	// Translates CFGBuilder updates into dominator-tree updates, resolving the
	// block names of each edge through the builder B.
	std::vector<DomUpdate> ToDomUpdates(CFGBuilder &B,
	                                    std::vector<CFGBuilder::Update> &In) {
	  std::vector<DomUpdate> Converted;
	  Converted.reserve(In.size());

	  for (const auto &Step : In) {
	    const auto Kind = (Step.Action == CFGInsert) ? Insert : Delete;
	    BasicBlock *Source = B.getOrAddBlock(Step.Edge.From);
	    BasicBlock *Target = B.getOrAddBlock(Step.Edge.To);
	    Converted.push_back({Kind, Source, Target});
	  }
	  return Converted;
	}
	} // namespace

	// Legalizing a redundant update sequence for a dominator tree must leave
	// exactly three updates.
	TEST(DominatorTreeBatchUpdates, LegalizeDomUpdates) {
	  CFGHolder Holder;
	  CFGBuilder Builder(Holder.F, {{"A", "B"}}, {});

	  BasicBlock *BlockA = Builder.getOrAddBlock("A");
	  BasicBlock *BlockB = Builder.getOrAddBlock("B");
	  BasicBlock *BlockC = Builder.getOrAddBlock("C");
	  BasicBlock *BlockD = Builder.getOrAddBlock("D");

	  std::vector<DomUpdate> Updates = {{Insert, BlockB, BlockC},
	                                    {Insert, BlockC, BlockD},
	                                    {Delete, BlockB, BlockC},
	                                    {Insert, BlockB, BlockC},
	                                    {Insert, BlockB, BlockD},
	                                    {Delete, BlockC, BlockD},
	                                    {Delete, BlockA, BlockB}};
	  SmallVector<DomUpdate, 4> Legalized;
	  DomSNCA::LegalizeUpdates(Updates, Legalized);
	  DEBUG({
	    dbgs() << "Legalized updates:\t";
	    for (auto &U : Legalized)
	      dbgs() << U << ", ";
	    dbgs() << "\n";
	  });

	  const auto IsLegalized = [&Legalized](const DomUpdate &Wanted) {
	    return llvm::find(Legalized, Wanted) != Legalized.end();
	  };
	  EXPECT_EQ(Legalized.size(), 3UL);
	  EXPECT_TRUE(IsLegalized({Insert, BlockB, BlockC}));
	  EXPECT_TRUE(IsLegalized({Insert, BlockB, BlockD}));
	  EXPECT_TRUE(IsLegalized({Delete, BlockA, BlockB}));
	}

	// Same sequence as LegalizeDomUpdates, legalized for a post-dominator tree:
	// the three surviving updates are expected with their edges reversed.
	TEST(DominatorTreeBatchUpdates, LegalizePostDomUpdates) {
	  CFGHolder Holder;
	  CFGBuilder Builder(Holder.F, {{"A", "B"}}, {});

	  BasicBlock *BlockA = Builder.getOrAddBlock("A");
	  BasicBlock *BlockB = Builder.getOrAddBlock("B");
	  BasicBlock *BlockC = Builder.getOrAddBlock("C");
	  BasicBlock *BlockD = Builder.getOrAddBlock("D");

	  std::vector<DomUpdate> Updates = {{Insert, BlockB, BlockC},
	                                    {Insert, BlockC, BlockD},
	                                    {Delete, BlockB, BlockC},
	                                    {Insert, BlockB, BlockC},
	                                    {Insert, BlockB, BlockD},
	                                    {Delete, BlockC, BlockD},
	                                    {Delete, BlockA, BlockB}};
	  SmallVector<DomUpdate, 4> Legalized;
	  PostDomSNCA::LegalizeUpdates(Updates, Legalized);
	  DEBUG({
	    dbgs() << "Legalized postdom updates:\t";
	    for (auto &U : Legalized)
	      dbgs() << U << ", ";
	    dbgs() << "\n";
	  });

	  const auto IsLegalized = [&Legalized](const DomUpdate &Wanted) {
	    return llvm::find(Legalized, Wanted) != Legalized.end();
	  };
	  EXPECT_EQ(Legalized.size(), 3UL);
	  EXPECT_TRUE(IsLegalized({Insert, BlockC, BlockB}));
	  EXPECT_TRUE(IsLegalized({Insert, BlockD, BlockB}));
	  EXPECT_TRUE(IsLegalized({Delete, BlockB, BlockA}));
	}

	// Applies a single edge insertion (B -> C) as a batch update and checks that
	// both the dominator tree and the post-dominator tree verify afterwards.
	TEST(DominatorTreeBatchUpdates, SingleInsertion) {
	  CFGHolder Holder;
	  CFGBuilder Builder(Holder.F, {{"A", "B"}}, {{CFGInsert, {"B", "C"}}});

	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  // Check the freshly built post-dominator tree, not DT a second time.
	  EXPECT_TRUE(PDT.verify());

	  BasicBlock *B = Builder.getOrAddBlock("B");
	  BasicBlock *C = Builder.getOrAddBlock("C");
	  std::vector<DomUpdate> Updates = {{Insert, B, C}};

	  // The CFG itself must change before the trees are told about it.
	  ASSERT_TRUE(Builder.applyUpdate());

	  DT.applyUpdates(Updates);
	  EXPECT_TRUE(DT.verify());
	  PDT.applyUpdates(Updates);
	  EXPECT_TRUE(PDT.verify());
	}

	// Applies a single edge deletion (B -> C) as a batch update and checks that
	// both the dominator tree and the post-dominator tree verify afterwards.
	TEST(DominatorTreeBatchUpdates, SingleDeletion) {
	  CFGHolder Holder;
	  CFGBuilder Builder(Holder.F, {{"A", "B"}, {"B", "C"}},
	                     {{CFGDelete, {"B", "C"}}});

	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  // Check the freshly built post-dominator tree, not DT a second time.
	  EXPECT_TRUE(PDT.verify());

	  BasicBlock *B = Builder.getOrAddBlock("B");
	  BasicBlock *C = Builder.getOrAddBlock("C");
	  std::vector<DomUpdate> Updates = {{Delete, B, C}};

	  // The CFG itself must change before the trees are told about it.
	  ASSERT_TRUE(Builder.applyUpdate());

	  DT.applyUpdates(Updates);
	  EXPECT_TRUE(DT.verify());
	  PDT.applyUpdates(Updates);
	  EXPECT_TRUE(PDT.verify());
	}

	// Inserts four edges into a one-arc CFG and applies them to both trees as a
	// single batch.
	TEST(DominatorTreeBatchUpdates, FewInsertion) {
	  std::vector<CFGBuilder::Update> CFGUpdates = {
	      {CFGInsert, {"B", "C"}},
	      {CFGInsert, {"C", "B"}},
	      {CFGInsert, {"C", "D"}},
	      {CFGInsert, {"D", "E"}}};

	  CFGHolder Holder;
	  CFGBuilder Builder(Holder.F, {{"A", "B"}}, CFGUpdates);

	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  BasicBlock *BlockB = Builder.getOrAddBlock("B");
	  BasicBlock *BlockC = Builder.getOrAddBlock("C");
	  BasicBlock *BlockD = Builder.getOrAddBlock("D");
	  BasicBlock *BlockE = Builder.getOrAddBlock("E");

	  std::vector<DomUpdate> Updates = {{Insert, BlockB, BlockC},
	                                    {Insert, BlockC, BlockB},
	                                    {Insert, BlockC, BlockD},
	                                    {Insert, BlockD, BlockE}};

	  // Drain every pending CFG update before touching the trees.
	  while (Builder.applyUpdate()) {
	  }

	  DT.applyUpdates(Updates);
	  EXPECT_TRUE(DT.verify());
	  PDT.applyUpdates(Updates);
	  EXPECT_TRUE(PDT.verify());
	}

	// Deletes four edges from a five-arc CFG and applies them to both trees as a
	// single batch.
	TEST(DominatorTreeBatchUpdates, FewDeletions) {
	  std::vector<CFGBuilder::Update> CFGUpdates = {
	      {CFGDelete, {"B", "C"}},
	      {CFGDelete, {"C", "B"}},
	      {CFGDelete, {"B", "D"}},
	      {CFGDelete, {"D", "E"}}};

	  CFGHolder Holder;
	  CFGBuilder Builder(
	      Holder.F,
	      {{"A", "B"}, {"B", "C"}, {"B", "D"}, {"D", "E"}, {"C", "B"}},
	      CFGUpdates);

	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  // The tree updates are derived before the CFG is modified.
	  auto TreeUpdates = ToDomUpdates(Builder, CFGUpdates);

	  while (Builder.applyUpdate()) {
	  }

	  DT.applyUpdates(TreeUpdates);
	  EXPECT_TRUE(DT.verify());
	  PDT.applyUpdates(TreeUpdates);
	  EXPECT_TRUE(PDT.verify());
	}

	// Applies a fixed mix of insertions and deletions to a twelve-block CFG as
	// one batch.
	TEST(DominatorTreeBatchUpdates, InsertDelete) {
	  std::vector<CFGBuilder::Arc> Arcs = {
	      {"1", "2"}, {"2", "3"},  {"3", "4"},  {"4", "5"},
	      {"5", "6"}, {"5", "7"},  {"3", "8"},  {"8", "9"},
	      {"9", "10"}, {"8", "11"}, {"11", "12"}};

	  std::vector<CFGBuilder::Update> Updates = {
	      {CFGInsert, {"2", "4"}},   {CFGInsert, {"12", "10"}},
	      {CFGInsert, {"10", "9"}},  {CFGInsert, {"7", "6"}},
	      {CFGInsert, {"7", "5"}},   {CFGDelete, {"3", "8"}},
	      {CFGInsert, {"10", "7"}},  {CFGInsert, {"2", "8"}},
	      {CFGDelete, {"3", "4"}},   {CFGDelete, {"8", "9"}},
	      {CFGDelete, {"11", "12"}}};

	  CFGHolder Holder;
	  CFGBuilder Graph(Holder.F, Arcs, Updates);
	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  while (Graph.applyUpdate()) {
	  }

	  auto TreeUpdates = ToDomUpdates(Graph, Updates);
	  DT.applyUpdates(TreeUpdates);
	  EXPECT_TRUE(DT.verify());
	  PDT.applyUpdates(TreeUpdates);
	  EXPECT_TRUE(PDT.verify());
	}

	// Repeats the InsertDelete scenario sixteen times, each round with the update
	// list reshuffled by a generator seeded with 0.
	TEST(DominatorTreeBatchUpdates, InsertDeleteExhaustive) {
	  std::vector<CFGBuilder::Arc> Arcs = {
	      {"1", "2"}, {"2", "3"},  {"3", "4"},  {"4", "5"},
	      {"5", "6"}, {"5", "7"},  {"3", "8"},  {"8", "9"},
	      {"9", "10"}, {"8", "11"}, {"11", "12"}};

	  std::vector<CFGBuilder::Update> Updates = {
	      {CFGInsert, {"2", "4"}},   {CFGInsert, {"12", "10"}},
	      {CFGInsert, {"10", "9"}},  {CFGInsert, {"7", "6"}},
	      {CFGInsert, {"7", "5"}},   {CFGDelete, {"3", "8"}},
	      {CFGInsert, {"10", "7"}},  {CFGInsert, {"2", "8"}},
	      {CFGDelete, {"3", "4"}},   {CFGDelete, {"8", "9"}},
	      {CFGDelete, {"11", "12"}}};

	  std::mt19937 Rng(0);
	  for (unsigned Round = 0; Round != 16; ++Round) {
	    // Each round continues from the previous round's ordering.
	    std::shuffle(Updates.begin(), Updates.end(), Rng);

	    CFGHolder Holder;
	    CFGBuilder Graph(Holder.F, Arcs, Updates);
	    DominatorTree DT(*Holder.F);
	    EXPECT_TRUE(DT.verify());
	    PostDomTree PDT(*Holder.F);
	    EXPECT_TRUE(PDT.verify());

	    while (Graph.applyUpdate()) {
	    }

	    auto TreeUpdates = ToDomUpdates(Graph, Updates);
	    DT.applyUpdates(TreeUpdates);
	    EXPECT_TRUE(DT.verify());
	    PDT.applyUpdates(TreeUpdates);
	    EXPECT_TRUE(PDT.verify());
	  }
	}
	+
	+// The tests below use odd flowgraphs, usually generated from csmith cases,
	+// which are difficult for post-dominator trees.
	+TEST(DominatorTreeBatchUpdates, InfiniteLoop) { // batch update on a CFG in which every listed block has a successor
	+ std::vector<CFGBuilder::Arc> Arcs = {
	+ {"1", "2"},
	+ {"2", "3"},
	+ {"3", "6"}, {"3", "5"},
	+ {"4", "5"},
	+ {"5", "2"},
	+ {"6", "3"}, {"6", "4"}};
	+
	+ // SplitBlock on 3 -> 5: the edge is rerouted through a new block N
	+ // (insert N -> 5 and 3 -> N, then delete 3 -> 5).
	+ std::vector<CFGBuilder::Update> Updates = {
	+ {CFGInsert, {"N", "5"}}, {CFGInsert, {"3", "N"}}, {CFGDelete, {"3", "5"}}};
	+
	+ CFGHolder Holder;
	+ CFGBuilder B(Holder.F, Arcs, Updates);
	+ DominatorTree DT(*Holder.F); // both trees are built before any update is applied
	+ EXPECT_TRUE(DT.verify());
	+ PostDomTree PDT(*Holder.F);
	+ EXPECT_TRUE(PDT.verify());
	+
	+ while (B.applyUpdate()) // apply every pending CFG update
	+ ;
	+
	+ auto DomUpdates = ToDomUpdates(B, Updates); // the same edits expressed as tree updates
	+ DT.applyUpdates(DomUpdates);
	+ EXPECT_TRUE(DT.verify());
	+ PDT.applyUpdates(DomUpdates);
	+ EXPECT_TRUE(PDT.verify());
	+}
	+
	+TEST(DominatorTreeBatchUpdates, DeadBlocks) { // batch update on a CFG where no arc leads into block 5
	+ std::vector<CFGBuilder::Arc> Arcs = {
	+ {"1", "2"},
	+ {"2", "3"},
	+ {"3", "4"}, {"3", "7"},
	+ {"4", "4"},
	+ {"5", "6"}, {"5", "7"},
	+ {"6", "7"},
	+ {"7", "2"}, {"7", "8"}};
	+
	+ // Delete every arc leaving blocks 5 and 6 (the original note said
	+ // "dead 5 and 7"), plus SplitBlock on 7 -> 8 through a new block N.
	+ std::vector<CFGBuilder::Update> Updates = {
	+ {CFGDelete, {"6", "7"}}, {CFGDelete, {"5", "7"}}, {CFGDelete, {"5", "6"}},
	+ {CFGInsert, {"N", "8"}}, {CFGInsert, {"7", "N"}}, {CFGDelete, {"7", "8"}}};
	+
	+ CFGHolder Holder;
	+ CFGBuilder B(Holder.F, Arcs, Updates);
	+ DominatorTree DT(*Holder.F); // both trees are built before any update is applied
	+ EXPECT_TRUE(DT.verify());
	+ PostDomTree PDT(*Holder.F);
	+ EXPECT_TRUE(PDT.verify());
	+
	+ while (B.applyUpdate()) // apply every pending CFG update
	+ ;
	+
	+ auto DomUpdates = ToDomUpdates(B, Updates); // the same edits expressed as tree updates
	+ DT.applyUpdates(DomUpdates);
	+ EXPECT_TRUE(DT.verify());
	+ PDT.applyUpdates(DomUpdates);
	+ EXPECT_TRUE(PDT.verify());
	+}
	+
	+TEST(DominatorTreeBatchUpdates, InfiniteLoop2) { // second CFG in which every listed block has a successor
	+ std::vector<CFGBuilder::Arc> Arcs = {
	+ {"1", "2"},
	+ {"2", "6"}, {"2", "3"},
	+ {"3", "4"},
	+ {"4", "5"}, {"4", "6"},
	+ {"5", "4"},
	+ {"6", "2"}};
	+
	+ // SplitBlock on 4 -> 6: the edge is rerouted through a new block N
	+ // (insert N -> 6 and 4 -> N, then delete 4 -> 6).
	+ std::vector<CFGBuilder::Update> Updates = {
	+ {CFGInsert, {"N", "6"}}, {CFGInsert, {"4", "N"}}, {CFGDelete, {"4", "6"}}};
	+
	+ CFGHolder Holder;
	+ CFGBuilder B(Holder.F, Arcs, Updates);
	+ DominatorTree DT(*Holder.F); // both trees are built before any update is applied
	+ EXPECT_TRUE(DT.verify());
	+ PostDomTree PDT(*Holder.F);
	+ EXPECT_TRUE(PDT.verify());
	+
	+ while (B.applyUpdate()) // apply every pending CFG update
	+ ;
	+
	+ auto DomUpdates = ToDomUpdates(B, Updates); // the same edits expressed as tree updates
	+ DT.applyUpdates(DomUpdates);
	+ EXPECT_TRUE(DT.verify());
	+ PDT.applyUpdates(DomUpdates);
	+ EXPECT_TRUE(PDT.verify());
	+}
	Index: vendor/llvm/dist-release_60/unittests/IR/DominatorTreeTest.cpp
	===================================================================
	--- vendor/llvm/dist-release_60/unittests/IR/DominatorTreeTest.cpp (revision 328361)
	+++ vendor/llvm/dist-release_60/unittests/IR/DominatorTreeTest.cpp (revision 328362)
	@@ -1,927 +1,952 @@
	//===- llvm/unittests/IR/DominatorTreeTest.cpp - Constants unit tests -----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include <random>
	#include "llvm/Analysis/PostDominators.h"
	#include "llvm/AsmParser/Parser.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/SourceMgr.h"
	#include "CFGBuilder.h"
	#include "gtest/gtest.h"

	using namespace llvm;

	/// Post-dominator tree that is calculated for the given function as soon as
	/// it is constructed.
	struct PostDomTree : PostDomTreeBase<BasicBlock> {
	  PostDomTree(Function &F) {
	    this->recalculate(F);
	  }
	};

	/// Build the dominator tree for the function and run the Test.
	static void runWithDomTree(
	Module &M, StringRef FuncName,
	function_ref<void(Function &F, DominatorTree DT, PostDomTree PDT)> Test) {
	auto *F = M.getFunction(FuncName);
	ASSERT_NE(F, nullptr) << "Could not find " << FuncName;
	// Compute the dominator tree for the function.
	DominatorTree DT(*F);
	PostDomTree PDT(*F);
	Test(*F, &DT, &PDT);
	}

	/// Parses ModuleStr as LLVM assembly inside Context and returns the module;
	/// a parse failure trips the assertion.
	static std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context,
	                                              StringRef ModuleStr) {
	  SMDiagnostic Diagnostic;
	  auto Parsed = parseAssemblyString(ModuleStr, Diagnostic, Context);
	  assert(Parsed && "Bad assembly?");
	  return Parsed;
	}

	// Checks dominance, post-dominance, descendants, DFS numbers and levels on a
	// function whose block bb3 has no predecessors, then reattaches bb3 and
	// finally installs a new entry block.
	TEST(DominatorTree, Unreachable) {
	  StringRef ModuleString =
	      "declare i32 @g()\n"
	      "define void @f(i32 %x) personality i32 ()* @g {\n"
	      "bb0:\n"
	      " %y1 = add i32 %x, 1\n"
	      " %y2 = add i32 %x, 1\n"
	      " %y3 = invoke i32 @g() to label %bb1 unwind label %bb2\n"
	      "bb1:\n"
	      " %y4 = add i32 %x, 1\n"
	      " br label %bb4\n"
	      "bb2:\n"
	      " %y5 = landingpad i32\n"
	      " cleanup\n"
	      " br label %bb4\n"
	      "bb3:\n"
	      " %y6 = add i32 %x, 1\n"
	      " %y7 = add i32 %x, 1\n"
	      " ret void\n"
	      "bb4:\n"
	      " %y8 = phi i32 [0, %bb2], [%y4, %bb1]\n"
	      " %y9 = phi i32 [0, %bb2], [%y4, %bb1]\n"
	      " ret void\n"
	      "}\n";

	  // Parse the module.
	  LLVMContext Context;
	  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);

	  runWithDomTree(
	      *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) {
	        Function::iterator FI = F.begin();

	        BasicBlock *BB0 = &*FI++;
	        BasicBlock::iterator BBI = BB0->begin();
	        Instruction *Y1 = &*BBI++;
	        Instruction *Y2 = &*BBI++;
	        Instruction *Y3 = &*BBI++;

	        BasicBlock *BB1 = &*FI++;
	        BBI = BB1->begin();
	        Instruction *Y4 = &*BBI++;

	        BasicBlock *BB2 = &*FI++;
	        BBI = BB2->begin();
	        Instruction *Y5 = &*BBI++;

	        BasicBlock *BB3 = &*FI++;
	        BBI = BB3->begin();
	        Instruction *Y6 = &*BBI++;
	        Instruction *Y7 = &*BBI++;

	        BasicBlock *BB4 = &*FI++;
	        BBI = BB4->begin();
	        Instruction *Y8 = &*BBI++;
	        Instruction *Y9 = &*BBI++;

	        // Reachability
	        EXPECT_TRUE(DT->isReachableFromEntry(BB0));
	        EXPECT_TRUE(DT->isReachableFromEntry(BB1));
	        EXPECT_TRUE(DT->isReachableFromEntry(BB2));
	        EXPECT_FALSE(DT->isReachableFromEntry(BB3));
	        EXPECT_TRUE(DT->isReachableFromEntry(BB4));

	        // BB dominance
	        EXPECT_TRUE(DT->dominates(BB0, BB0));
	        EXPECT_TRUE(DT->dominates(BB0, BB1));
	        EXPECT_TRUE(DT->dominates(BB0, BB2));
	        EXPECT_TRUE(DT->dominates(BB0, BB3));
	        EXPECT_TRUE(DT->dominates(BB0, BB4));

	        EXPECT_FALSE(DT->dominates(BB1, BB0));
	        EXPECT_TRUE(DT->dominates(BB1, BB1));
	        EXPECT_FALSE(DT->dominates(BB1, BB2));
	        EXPECT_TRUE(DT->dominates(BB1, BB3));
	        EXPECT_FALSE(DT->dominates(BB1, BB4));

	        EXPECT_FALSE(DT->dominates(BB2, BB0));
	        EXPECT_FALSE(DT->dominates(BB2, BB1));
	        EXPECT_TRUE(DT->dominates(BB2, BB2));
	        EXPECT_TRUE(DT->dominates(BB2, BB3));
	        EXPECT_FALSE(DT->dominates(BB2, BB4));

	        EXPECT_FALSE(DT->dominates(BB3, BB0));
	        EXPECT_FALSE(DT->dominates(BB3, BB1));
	        EXPECT_FALSE(DT->dominates(BB3, BB2));
	        EXPECT_TRUE(DT->dominates(BB3, BB3));
	        EXPECT_FALSE(DT->dominates(BB3, BB4));

	        // BB proper dominance
	        EXPECT_FALSE(DT->properlyDominates(BB0, BB0));
	        EXPECT_TRUE(DT->properlyDominates(BB0, BB1));
	        EXPECT_TRUE(DT->properlyDominates(BB0, BB2));
	        EXPECT_TRUE(DT->properlyDominates(BB0, BB3));

	        EXPECT_FALSE(DT->properlyDominates(BB1, BB0));
	        EXPECT_FALSE(DT->properlyDominates(BB1, BB1));
	        EXPECT_FALSE(DT->properlyDominates(BB1, BB2));
	        EXPECT_TRUE(DT->properlyDominates(BB1, BB3));

	        EXPECT_FALSE(DT->properlyDominates(BB2, BB0));
	        EXPECT_FALSE(DT->properlyDominates(BB2, BB1));
	        EXPECT_FALSE(DT->properlyDominates(BB2, BB2));
	        EXPECT_TRUE(DT->properlyDominates(BB2, BB3));

	        EXPECT_FALSE(DT->properlyDominates(BB3, BB0));
	        EXPECT_FALSE(DT->properlyDominates(BB3, BB1));
	        EXPECT_FALSE(DT->properlyDominates(BB3, BB2));
	        EXPECT_FALSE(DT->properlyDominates(BB3, BB3));

	        // Instruction dominance in the same reachable BB
	        EXPECT_FALSE(DT->dominates(Y1, Y1));
	        EXPECT_TRUE(DT->dominates(Y1, Y2));
	        EXPECT_FALSE(DT->dominates(Y2, Y1));
	        EXPECT_FALSE(DT->dominates(Y2, Y2));

	        // Instruction dominance in the same unreachable BB
	        EXPECT_TRUE(DT->dominates(Y6, Y6));
	        EXPECT_TRUE(DT->dominates(Y6, Y7));
	        EXPECT_TRUE(DT->dominates(Y7, Y6));
	        EXPECT_TRUE(DT->dominates(Y7, Y7));

	        // Invoke
	        EXPECT_TRUE(DT->dominates(Y3, Y4));
	        EXPECT_FALSE(DT->dominates(Y3, Y5));

	        // Phi
	        EXPECT_TRUE(DT->dominates(Y2, Y9));
	        EXPECT_FALSE(DT->dominates(Y3, Y9));
	        EXPECT_FALSE(DT->dominates(Y8, Y9));

	        // Anything dominates unreachable
	        EXPECT_TRUE(DT->dominates(Y1, Y6));
	        EXPECT_TRUE(DT->dominates(Y3, Y6));

	        // Unreachable doesn't dominate reachable
	        EXPECT_FALSE(DT->dominates(Y6, Y1));

	        // Instruction, BB dominance
	        EXPECT_FALSE(DT->dominates(Y1, BB0));
	        EXPECT_TRUE(DT->dominates(Y1, BB1));
	        EXPECT_TRUE(DT->dominates(Y1, BB2));
	        EXPECT_TRUE(DT->dominates(Y1, BB3));
	        EXPECT_TRUE(DT->dominates(Y1, BB4));

	        EXPECT_FALSE(DT->dominates(Y3, BB0));
	        EXPECT_TRUE(DT->dominates(Y3, BB1));
	        EXPECT_FALSE(DT->dominates(Y3, BB2));
	        EXPECT_TRUE(DT->dominates(Y3, BB3));
	        EXPECT_FALSE(DT->dominates(Y3, BB4));

	        EXPECT_TRUE(DT->dominates(Y6, BB3));

	        // Post dominance.
	        EXPECT_TRUE(PDT->dominates(BB0, BB0));
	        EXPECT_FALSE(PDT->dominates(BB1, BB0));
	        EXPECT_FALSE(PDT->dominates(BB2, BB0));
	        EXPECT_FALSE(PDT->dominates(BB3, BB0));
	        EXPECT_TRUE(PDT->dominates(BB4, BB1));

	        // Dominance descendants.
	        SmallVector<BasicBlock *, 8> DominatedBBs, PostDominatedBBs;

	        DT->getDescendants(BB0, DominatedBBs);
	        PDT->getDescendants(BB0, PostDominatedBBs);
	        EXPECT_EQ(DominatedBBs.size(), 4UL);
	        EXPECT_EQ(PostDominatedBBs.size(), 1UL);

	        // BB3 is unreachable. It should have no dominators nor postdominators.
	        // NOTE(review): both queries below go through DT; the second one
	        // presumably meant PDT -- confirm before changing, since that may
	        // alter the expected size.
	        DominatedBBs.clear();
	        PostDominatedBBs.clear();
	        DT->getDescendants(BB3, DominatedBBs);
	        DT->getDescendants(BB3, PostDominatedBBs);
	        EXPECT_EQ(DominatedBBs.size(), 0UL);
	        EXPECT_EQ(PostDominatedBBs.size(), 0UL);

	        // Check DFS Numbers before
	        DT->updateDFSNumbers();
	        EXPECT_EQ(DT->getNode(BB0)->getDFSNumIn(), 0UL);
	        EXPECT_EQ(DT->getNode(BB0)->getDFSNumOut(), 7UL);
	        EXPECT_EQ(DT->getNode(BB1)->getDFSNumIn(), 1UL);
	        EXPECT_EQ(DT->getNode(BB1)->getDFSNumOut(), 2UL);
	        EXPECT_EQ(DT->getNode(BB2)->getDFSNumIn(), 5UL);
	        EXPECT_EQ(DT->getNode(BB2)->getDFSNumOut(), 6UL);
	        EXPECT_EQ(DT->getNode(BB4)->getDFSNumIn(), 3UL);
	        EXPECT_EQ(DT->getNode(BB4)->getDFSNumOut(), 4UL);

	        // Check levels before
	        EXPECT_EQ(DT->getNode(BB0)->getLevel(), 0U);
	        EXPECT_EQ(DT->getNode(BB1)->getLevel(), 1U);
	        EXPECT_EQ(DT->getNode(BB2)->getLevel(), 1U);
	        EXPECT_EQ(DT->getNode(BB4)->getLevel(), 1U);

	        // Reattach block 3 to block 1 and recalculate
	        BB1->getTerminator()->eraseFromParent();
	        BranchInst::Create(BB4, BB3, ConstantInt::getTrue(F.getContext()), BB1);
	        DT->recalculate(F);

	        // Check DFS Numbers after
	        DT->updateDFSNumbers();
	        EXPECT_EQ(DT->getNode(BB0)->getDFSNumIn(), 0UL);
	        EXPECT_EQ(DT->getNode(BB0)->getDFSNumOut(), 9UL);
	        EXPECT_EQ(DT->getNode(BB1)->getDFSNumIn(), 1UL);
	        EXPECT_EQ(DT->getNode(BB1)->getDFSNumOut(), 4UL);
	        EXPECT_EQ(DT->getNode(BB2)->getDFSNumIn(), 7UL);
	        EXPECT_EQ(DT->getNode(BB2)->getDFSNumOut(), 8UL);
	        EXPECT_EQ(DT->getNode(BB3)->getDFSNumIn(), 2UL);
	        EXPECT_EQ(DT->getNode(BB3)->getDFSNumOut(), 3UL);
	        EXPECT_EQ(DT->getNode(BB4)->getDFSNumIn(), 5UL);
	        EXPECT_EQ(DT->getNode(BB4)->getDFSNumOut(), 6UL);

	        // Check levels after
	        EXPECT_EQ(DT->getNode(BB0)->getLevel(), 0U);
	        EXPECT_EQ(DT->getNode(BB1)->getLevel(), 1U);
	        EXPECT_EQ(DT->getNode(BB2)->getLevel(), 1U);
	        EXPECT_EQ(DT->getNode(BB3)->getLevel(), 2U);
	        EXPECT_EQ(DT->getNode(BB4)->getLevel(), 1U);

	        // Change root node
	        DT->verifyDomTree();
	        BasicBlock *NewEntry =
	            BasicBlock::Create(F.getContext(), "new_entry", &F, BB0);
	        BranchInst::Create(BB0, NewEntry);
	        EXPECT_EQ(F.begin()->getName(), NewEntry->getName());
	        EXPECT_TRUE(&F.getEntryBlock() == NewEntry);
	        DT->setNewRoot(NewEntry);
	        DT->verifyDomTree();
	      });
	}

	// Checks edge dominance when a switch reaches the same successor (bb1)
	// through two different case edges.
	TEST(DominatorTree, NonUniqueEdges) {
	  StringRef ModuleString =
	      "define i32 @f(i32 %i, i32 *%p) {\n"
	      "bb0:\n"
	      " store i32 %i, i32 *%p\n"
	      " switch i32 %i, label %bb2 [\n"
	      " i32 0, label %bb1\n"
	      " i32 1, label %bb1\n"
	      " ]\n"
	      " bb1:\n"
	      " ret i32 1\n"
	      " bb2:\n"
	      " ret i32 4\n"
	      "}\n";

	  // Parse the module.
	  LLVMContext Context;
	  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);

	  runWithDomTree(
	      *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) {
	        Function::iterator FI = F.begin();

	        BasicBlock *BB0 = &*FI++;
	        BasicBlock *BB1 = &*FI++;
	        BasicBlock *BB2 = &*FI++;

	        const TerminatorInst *TI = BB0->getTerminator();
	        assert(TI->getNumSuccessors() == 3 && "Switch has three successors");

	        BasicBlockEdge Edge_BB0_BB2(BB0, TI->getSuccessor(0));
	        assert(Edge_BB0_BB2.getEnd() == BB2 &&
	               "Default label is the 1st successor");

	        BasicBlockEdge Edge_BB0_BB1_a(BB0, TI->getSuccessor(1));
	        assert(Edge_BB0_BB1_a.getEnd() == BB1 && "BB1 is the 2nd successor");

	        BasicBlockEdge Edge_BB0_BB1_b(BB0, TI->getSuccessor(2));
	        assert(Edge_BB0_BB1_b.getEnd() == BB1 && "BB1 is the 3rd successor");

	        // The single edge into BB2 dominates it ...
	        EXPECT_TRUE(DT->dominates(Edge_BB0_BB2, BB2));
	        EXPECT_FALSE(DT->dominates(Edge_BB0_BB2, BB1));

	        // ... whereas neither of the two parallel edges dominates BB1.
	        EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_a, BB1));
	        EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_b, BB1));

	        EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_a, BB2));
	        EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_b, BB2));
	      });
	}

	// Verify that the PDT is correctly updated in case an edge removal results
	// in a new unreachable CFG node. Also make sure that the updated PDT is the
	// same as a freshly recalculated one.
	//
	// For the following input code and initial PDT:
	//
	//          CFG                   PDT
	//
	//           A                    Exit
	//           |                     |
	//          _B                     D
	//         / | \                   |
	//        ^  v  \                  B
	//        \ /    D                / \
	//         C      \              C   A
	//                 v
	//                 Exit
	//
	// we verify that CFG' and PDT-updated is obtained after removal of edge C -> B.
	//
	//          CFG'               PDT-updated
	//
	//           A                    Exit
	//           |                   / | \
	//           B                  C  B  D
	//           | \                   |
	//           v  \                  A
	//          /    D
	//         C      \
	//         |       \
	//    unreachable    Exit
	//
	// Both the blocks that end with ret and with unreachable become trivial
	// PostDomTree roots, as they have no successors.
	//
	// Replaces C's branch to B with `unreachable`, deletes the edge C -> B from
	// both trees and compares them against freshly computed ones.
	TEST(DominatorTree, DeletingEdgesIntroducesUnreachables) {
	  StringRef ModuleString =
	      "define void @f() {\n"
	      "A:\n"
	      " br label %B\n"
	      "B:\n"
	      " br i1 undef, label %D, label %C\n"
	      "C:\n"
	      " br label %B\n"
	      "D:\n"
	      " ret void\n"
	      "}\n";

	  // Parse the module.
	  LLVMContext Context;
	  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);

	  runWithDomTree(
	      *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) {
	        Function::iterator FI = F.begin();

	        // Skip block A; only B, C and D are inspected.
	        FI++;
	        BasicBlock *B = &*FI++;
	        BasicBlock *C = &*FI++;
	        BasicBlock *D = &*FI++;

	        ASSERT_TRUE(PDT->dominates(PDT->getNode(D), PDT->getNode(B)));
	        EXPECT_TRUE(DT->verify());
	        EXPECT_TRUE(PDT->verify());

	        // Turn C into a block that ends in `unreachable`.
	        C->getTerminator()->eraseFromParent();
	        new UnreachableInst(C->getContext(), C);

	        DT->deleteEdge(C, B);
	        PDT->deleteEdge(C, B);

	        EXPECT_TRUE(DT->verify());
	        EXPECT_TRUE(PDT->verify());

	        EXPECT_FALSE(PDT->dominates(PDT->getNode(D), PDT->getNode(B)));
	        EXPECT_NE(PDT->getNode(C), nullptr);

	        // The incrementally updated trees must equal recomputed ones.
	        DominatorTree NDT(F);
	        EXPECT_EQ(DT->compare(NDT), 0);

	        PostDomTree NPDT(F);
	        EXPECT_EQ(PDT->compare(NPDT), 0);
	      });
	}

	// Verify that the PDT is correctly updated in case an edge removal results
	// in an infinite loop. Also make sure that the updated PDT is the
	// same as a freshly recalculated one.
	//
	// Test case:
	//
	//          CFG                   PDT
	//
	//           A                    Exit
	//           |                     |
	//          _B                     D
	//         / | \                   |
	//        ^  v  \                  B
	//        \ /    D                / \
	//         C      \              C   A
	//        / \      v
	//       ^  v      Exit
	//        \_/
	//
	// After deleting the edge C->B, C is part of an infinite reverse-unreachable
	// loop:
	//
	//          CFG'                  PDT'
	//
	//           A                    Exit
	//           |                   / | \
	//           B                  C  B  D
	//           | \                   |
	//           v  \                  A
	//          /    D
	//         C      \
	//        / \      v
	//       ^  v      Exit
	//        \_/
	//
	// As C now becomes reverse-unreachable, it forms a new non-trivial root and
	// gets connected to the virtual exit.
	// D does not postdominate B anymore, because there are two forward paths from
	// B to the virtual exit:
	// - B -> C -> VirtualExit
	// - B -> D -> VirtualExit.
	//
	// Removes the switch case C -> B so that C only branches to itself, deletes
	// the edge from both trees and compares them against freshly computed ones.
	TEST(DominatorTree, DeletingEdgesIntroducesInfiniteLoop) {
	  StringRef ModuleString =
	      "define void @f() {\n"
	      "A:\n"
	      " br label %B\n"
	      "B:\n"
	      " br i1 undef, label %D, label %C\n"
	      "C:\n"
	      " switch i32 undef, label %C [\n"
	      " i32 0, label %B\n"
	      " ]\n"
	      "D:\n"
	      " ret void\n"
	      "}\n";

	  // Parse the module.
	  LLVMContext Context;
	  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);

	  runWithDomTree(
	      *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) {
	        Function::iterator FI = F.begin();

	        // Skip block A; only B, C and D are inspected.
	        FI++;
	        BasicBlock *B = &*FI++;
	        BasicBlock *C = &*FI++;
	        BasicBlock *D = &*FI++;

	        ASSERT_TRUE(PDT->dominates(PDT->getNode(D), PDT->getNode(B)));
	        EXPECT_TRUE(DT->verify());
	        EXPECT_TRUE(PDT->verify());

	        // Drop the only non-default case, i.e. the edge C -> B.
	        auto SwitchC = cast<SwitchInst>(C->getTerminator());
	        SwitchC->removeCase(SwitchC->case_begin());
	        DT->deleteEdge(C, B);
	        EXPECT_TRUE(DT->verify());
	        PDT->deleteEdge(C, B);
	        EXPECT_TRUE(PDT->verify());

	        EXPECT_FALSE(PDT->dominates(PDT->getNode(D), PDT->getNode(B)));
	        EXPECT_NE(PDT->getNode(C), nullptr);

	        // The incrementally updated trees must equal recomputed ones.
	        DominatorTree NDT(F);
	        EXPECT_EQ(DT->compare(NDT), 0);

	        PostDomTree NPDT(F);
	        EXPECT_EQ(PDT->compare(NPDT), 0);
	      });
	}

	// Verify that the PDT is correctly updated in case an edge removal results
	// in an infinite loop.
	//
	// Test case:
	//
	//          CFG                   PDT
	//
	//           A                    Exit
	//           |                   / | \
	//           B--               C2  B  D
	//           |  \              /   |
	//           v   \            C    A
	//          /     D
	//         C--C2   \
	//        / \  \    v
	//       ^  v  --Exit
	//        \_/
	//
	// After deleting the edge C->C2, C is part of an infinite reverse-unreachable
	// loop:
	//
	//          CFG'                  PDT'
	//
	//           A                    Exit
	//           |                   / | \
	//           B                  C  B  D
	//           | \                   |
	//           v  \                  A
	//          /    D
	//         C      \
	//        / \      v
	//       ^  v      Exit
	//        \_/
	//
	// In PDT, D does not post-dominate B. After the edge C -> C2 is removed,
	// C becomes a new nontrivial PDT root.
	//
	// Removes the switch case C -> C2, deletes that edge from both trees, erases
	// block C2 and compares the trees against freshly computed ones.
	TEST(DominatorTree, DeletingEdgesIntroducesInfiniteLoop2) {
	  StringRef ModuleString =
	      "define void @f() {\n"
	      "A:\n"
	      " br label %B\n"
	      "B:\n"
	      " br i1 undef, label %D, label %C\n"
	      "C:\n"
	      " switch i32 undef, label %C [\n"
	      " i32 0, label %C2\n"
	      " ]\n"
	      "C2:\n"
	      " ret void\n"
	      "D:\n"
	      " ret void\n"
	      "}\n";

	  // Parse the module.
	  LLVMContext Context;
	  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);

	  runWithDomTree(
	      *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) {
	        Function::iterator FI = F.begin();

	        // Skip block A; only B, C, C2 and D are inspected.
	        FI++;
	        BasicBlock *B = &*FI++;
	        BasicBlock *C = &*FI++;
	        BasicBlock *C2 = &*FI++;
	        BasicBlock *D = &*FI++;

	        EXPECT_TRUE(DT->verify());
	        EXPECT_TRUE(PDT->verify());

	        // Drop the only non-default case, i.e. the edge C -> C2.
	        auto SwitchC = cast<SwitchInst>(C->getTerminator());
	        SwitchC->removeCase(SwitchC->case_begin());
	        DT->deleteEdge(C, C2);
	        PDT->deleteEdge(C, C2);
	        C2->removeFromParent();

	        // DT is expected to have dropped C2 already; PDT still holds a node
	        // for it, which is erased before the block is freed.
	        EXPECT_EQ(DT->getNode(C2), nullptr);
	        PDT->eraseNode(C2);
	        delete C2;

	        EXPECT_TRUE(DT->verify());
	        EXPECT_TRUE(PDT->verify());

	        EXPECT_FALSE(PDT->dominates(PDT->getNode(D), PDT->getNode(B)));
	        EXPECT_NE(PDT->getNode(C), nullptr);

	        // The incrementally updated trees must equal recomputed ones.
	        DominatorTree NDT(F);
	        EXPECT_EQ(DT->compare(NDT), 0);

	        PostDomTree NPDT(F);
	        EXPECT_EQ(PDT->compare(NPDT), 0);
	      });
	}

	namespace {
	// Short names for the two CFGBuilder action kinds.
	const auto Insert = CFGBuilder::ActionKind::Insert;
	const auto Delete = CFGBuilder::ActionKind::Delete;

	// Strict ordering of updates: by action first, then by the edge's source,
	// then by the edge's destination.
	bool CompUpdates(const CFGBuilder::Update &A, const CFGBuilder::Update &B) {
	  if (A.Action < B.Action)
	    return true;
	  if (B.Action < A.Action)
	    return false;

	  if (A.Edge.From < B.Edge.From)
	    return true;
	  if (B.Edge.From < A.Edge.From)
	    return false;

	  return A.Edge.To < B.Edge.To;
	}
	} // namespace

	// Inserts four edges one at a time and verifies both trees after every
	// single insertion.
	TEST(DominatorTree, InsertReachable) {
	  CFGHolder Holder;
	  std::vector<CFGBuilder::Arc> Arcs = {
	      {"1", "2"}, {"2", "3"},  {"3", "4"},  {"4", "5"},
	      {"5", "6"}, {"5", "7"},  {"3", "8"},  {"8", "9"},
	      {"9", "10"}, {"8", "11"}, {"11", "12"}};

	  std::vector<CFGBuilder::Update> Updates = {
	      {Insert, {"12", "10"}},
	      {Insert, {"10", "9"}},
	      {Insert, {"7", "6"}},
	      {Insert, {"7", "5"}}};
	  CFGBuilder B(Holder.F, Arcs, Updates);
	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  for (Optional<CFGBuilder::Update> Step = B.applyUpdate(); Step;
	       Step = B.applyUpdate()) {
	    EXPECT_EQ(Step->Action, Insert);
	    BasicBlock *Source = B.getOrAddBlock(Step->Edge.From);
	    BasicBlock *Target = B.getOrAddBlock(Step->Edge.To);
	    DT.insertEdge(Source, Target);
	    EXPECT_TRUE(DT.verify());
	    PDT.insertEdge(Source, Target);
	    EXPECT_TRUE(PDT.verify());
	  }
	}

	// Inserts the single edge 10 -> 7 and verifies both trees afterwards.
	TEST(DominatorTree, InsertReachable2) {
	  CFGHolder Holder;
	  std::vector<CFGBuilder::Arc> Arcs = {
	      {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"},
	      {"7", "5"}, {"2", "8"}, {"8", "11"}, {"11", "12"}, {"12", "10"},
	      {"10", "9"}, {"9", "10"}};

	  std::vector<CFGBuilder::Update> Updates = {{Insert, {"10", "7"}}};
	  CFGBuilder B(Holder.F, Arcs, Updates);
	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  Optional<CFGBuilder::Update> LastUpdate = B.applyUpdate();
	  // Abort here if no update was applied: the Optional is dereferenced below.
	  ASSERT_TRUE(LastUpdate);

	  EXPECT_EQ(LastUpdate->Action, Insert);
	  BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	  BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	  DT.insertEdge(From, To);
	  EXPECT_TRUE(DT.verify());
	  PDT.insertEdge(From, To);
	  EXPECT_TRUE(PDT.verify());
	}

	// Starts from a CFG made of several disconnected pieces and joins them one
	// edge at a time, verifying both trees after each insertion.
	TEST(DominatorTree, InsertUnreachable) {
	  CFGHolder Holder;
	  std::vector<CFGBuilder::Arc> Arcs = {
	      {"1", "2"}, {"2", "3"}, {"3", "4"},  {"5", "6"},
	      {"5", "7"}, {"3", "8"}, {"9", "10"}, {"11", "12"}};

	  std::vector<CFGBuilder::Update> Updates = {
	      {Insert, {"4", "5"}},
	      {Insert, {"8", "9"}},
	      {Insert, {"10", "12"}},
	      {Insert, {"10", "11"}}};
	  CFGBuilder B(Holder.F, Arcs, Updates);
	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  for (Optional<CFGBuilder::Update> Step = B.applyUpdate(); Step;
	       Step = B.applyUpdate()) {
	    EXPECT_EQ(Step->Action, Insert);
	    BasicBlock *Source = B.getOrAddBlock(Step->Edge.From);
	    BasicBlock *Target = B.getOrAddBlock(Step->Edge.To);
	    DT.insertEdge(Source, Target);
	    EXPECT_TRUE(DT.verify());
	    PDT.insertEdge(Source, Target);
	    EXPECT_TRUE(PDT.verify());
	  }
	}

	// Inserts the edge 3 -> 5 into a straight-line CFG and checks that the
	// post-dominator tree then has two roots and contains block 5.
	TEST(DominatorTree, InsertFromUnreachable) {
	  CFGHolder Holder;
	  std::vector<CFGBuilder::Arc> Arcs = {{"1", "2"}, {"2", "3"}, {"3", "4"}};

	  std::vector<CFGBuilder::Update> Updates = {{Insert, {"3", "5"}}};
	  CFGBuilder B(Holder.F, Arcs, Updates);
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  Optional<CFGBuilder::Update> LastUpdate = B.applyUpdate();
	  // Abort here if no update was applied: the Optional is dereferenced below.
	  ASSERT_TRUE(LastUpdate);

	  EXPECT_EQ(LastUpdate->Action, Insert);
	  BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	  BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	  PDT.insertEdge(From, To);
	  EXPECT_TRUE(PDT.verify());
	  EXPECT_TRUE(PDT.getRoots().size() == 2);
	  EXPECT_NE(PDT.getNode(B.getOrAddBlock("5")), nullptr);
	}

	// Applies seven insertions (the 12 -> 10 edge is listed twice) one at a time
	// and verifies both trees after each.
	TEST(DominatorTree, InsertMixed) {
	  CFGHolder Holder;
	  std::vector<CFGBuilder::Arc> Arcs = {
	      {"1", "2"}, {"2", "3"},  {"3", "4"},  {"5", "6"},   {"5", "7"},
	      {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}};

	  std::vector<CFGBuilder::Update> Updates = {
	      {Insert, {"4", "5"}},
	      {Insert, {"2", "5"}},
	      {Insert, {"10", "9"}},
	      {Insert, {"12", "10"}},
	      {Insert, {"12", "10"}},
	      {Insert, {"7", "8"}},
	      {Insert, {"7", "5"}}};
	  CFGBuilder B(Holder.F, Arcs, Updates);
	  DominatorTree DT(*Holder.F);
	  EXPECT_TRUE(DT.verify());
	  PostDomTree PDT(*Holder.F);
	  EXPECT_TRUE(PDT.verify());

	  for (Optional<CFGBuilder::Update> Step = B.applyUpdate(); Step;
	       Step = B.applyUpdate()) {
	    EXPECT_EQ(Step->Action, Insert);
	    BasicBlock *Source = B.getOrAddBlock(Step->Edge.From);
	    BasicBlock *Target = B.getOrAddBlock(Step->Edge.To);
	    DT.insertEdge(Source, Target);
	    EXPECT_TRUE(DT.verify());
	    PDT.insertEdge(Source, Target);
	    EXPECT_TRUE(PDT.verify());
	  }
	}

	TEST(DominatorTree, InsertPermut) {
	std::vector<CFGBuilder::Arc> Arcs = {
	{"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"},
	{"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}};

	std::vector<CFGBuilder::Update> Updates = {{Insert, {"4", "5"}},
	{Insert, {"2", "5"}},
	{Insert, {"10", "9"}},
	{Insert, {"12", "10"}}};

	while (std::next_permutation(Updates.begin(), Updates.end(), CompUpdates)) {
	CFGHolder Holder;
	CFGBuilder B(Holder.F, Arcs, Updates);
	DominatorTree DT(*Holder.F);
	EXPECT_TRUE(DT.verify());
	PostDomTree PDT(*Holder.F);
	EXPECT_TRUE(PDT.verify());

	Optional<CFGBuilder::Update> LastUpdate;
	while ((LastUpdate = B.applyUpdate())) {
	EXPECT_EQ(LastUpdate->Action, Insert);
	BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	DT.insertEdge(From, To);
	EXPECT_TRUE(DT.verify());
	PDT.insertEdge(From, To);
	EXPECT_TRUE(PDT.verify());
	}
	}
	}

	TEST(DominatorTree, DeleteReachable) {
	CFGHolder Holder;
	std::vector<CFGBuilder::Arc> Arcs = {
	{"1", "2"}, {"2", "3"}, {"2", "4"}, {"3", "4"}, {"4", "5"}, {"5", "6"},
	{"5", "7"}, {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}};

	std::vector<CFGBuilder::Update> Updates = {
	{Delete, {"2", "4"}}, {Delete, {"7", "8"}}, {Delete, {"10", "2"}}};
	CFGBuilder B(Holder.F, Arcs, Updates);
	DominatorTree DT(*Holder.F);
	EXPECT_TRUE(DT.verify());
	PostDomTree PDT(*Holder.F);
	EXPECT_TRUE(PDT.verify());

	Optional<CFGBuilder::Update> LastUpdate;
	while ((LastUpdate = B.applyUpdate())) {
	EXPECT_EQ(LastUpdate->Action, Delete);
	BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	DT.deleteEdge(From, To);
	EXPECT_TRUE(DT.verify());
	PDT.deleteEdge(From, To);
	EXPECT_TRUE(PDT.verify());
	}
	}

	TEST(DominatorTree, DeleteUnreachable) {
	CFGHolder Holder;
	std::vector<CFGBuilder::Arc> Arcs = {
	{"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"},
	{"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}};

	std::vector<CFGBuilder::Update> Updates = {
	{Delete, {"8", "9"}}, {Delete, {"7", "8"}}, {Delete, {"3", "4"}}};
	CFGBuilder B(Holder.F, Arcs, Updates);
	DominatorTree DT(*Holder.F);
	EXPECT_TRUE(DT.verify());
	PostDomTree PDT(*Holder.F);
	EXPECT_TRUE(PDT.verify());

	Optional<CFGBuilder::Update> LastUpdate;
	while ((LastUpdate = B.applyUpdate())) {
	EXPECT_EQ(LastUpdate->Action, Delete);
	BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	DT.deleteEdge(From, To);
	EXPECT_TRUE(DT.verify());
	PDT.deleteEdge(From, To);
	EXPECT_TRUE(PDT.verify());
	}
	}

	TEST(DominatorTree, DeletionsInSubtrees) {
	CFGHolder Holder;
	std::vector<CFGBuilder::Arc> Arcs = {{"0", "1"}, {"1", "2"}, {"1", "3"},
	{"1", "6"}, {"3", "4"}, {"2", "5"},
	{"5", "2"}};

	// It is possible to perform multiple deletions and inform the
	// DominatorTree about them at the same time, if the all of the
	// deletions happen in different subtrees.
	std::vector<CFGBuilder::Update> Updates = {{Delete, {"1", "2"}},
	{Delete, {"1", "3"}}};
	CFGBuilder B(Holder.F, Arcs, Updates);
	DominatorTree DT(*Holder.F);
	EXPECT_TRUE(DT.verify());

	Optional<CFGBuilder::Update> LastUpdate;
	while ((LastUpdate = B.applyUpdate()))
	;

	DT.deleteEdge(B.getOrAddBlock("1"), B.getOrAddBlock("2"));
	DT.deleteEdge(B.getOrAddBlock("1"), B.getOrAddBlock("3"));

	EXPECT_TRUE(DT.verify());
	EXPECT_EQ(DT.getNode(B.getOrAddBlock("2")), nullptr);
	EXPECT_EQ(DT.getNode(B.getOrAddBlock("3")), nullptr);
	EXPECT_EQ(DT.getNode(B.getOrAddBlock("4")), nullptr);
	EXPECT_EQ(DT.getNode(B.getOrAddBlock("5")), nullptr);
	EXPECT_NE(DT.getNode(B.getOrAddBlock("6")), nullptr);
	}

	TEST(DominatorTree, InsertDelete) {
	std::vector<CFGBuilder::Arc> Arcs = {
	{"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"},
	{"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}};

	std::vector<CFGBuilder::Update> Updates = {
	{Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}},
	{Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}},
	{Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", "4"}},
	{Delete, {"8", "9"}}, {Delete, {"11", "12"}}};

	CFGHolder Holder;
	CFGBuilder B(Holder.F, Arcs, Updates);
	DominatorTree DT(*Holder.F);
	EXPECT_TRUE(DT.verify());
	PostDomTree PDT(*Holder.F);
	EXPECT_TRUE(PDT.verify());

	Optional<CFGBuilder::Update> LastUpdate;
	while ((LastUpdate = B.applyUpdate())) {
	BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	if (LastUpdate->Action == Insert) {
	DT.insertEdge(From, To);
	PDT.insertEdge(From, To);
	} else {
	DT.deleteEdge(From, To);
	PDT.deleteEdge(From, To);
	}

	EXPECT_TRUE(DT.verify());
	EXPECT_TRUE(PDT.verify());
	}
	}

	TEST(DominatorTree, InsertDeleteExhaustive) {
	std::vector<CFGBuilder::Arc> Arcs = {
	{"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"},
	{"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}};

	std::vector<CFGBuilder::Update> Updates = {
	{Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}},
	{Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}},
	{Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", "4"}},
	{Delete, {"8", "9"}}, {Delete, {"11", "12"}}};

	std::mt19937 Generator(0);
	for (unsigned i = 0; i < 16; ++i) {
	std::shuffle(Updates.begin(), Updates.end(), Generator);
	CFGHolder Holder;
	CFGBuilder B(Holder.F, Arcs, Updates);
	DominatorTree DT(*Holder.F);
	EXPECT_TRUE(DT.verify());
	PostDomTree PDT(*Holder.F);
	EXPECT_TRUE(PDT.verify());

	Optional<CFGBuilder::Update> LastUpdate;
	while ((LastUpdate = B.applyUpdate())) {
	BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From);
	BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To);
	if (LastUpdate->Action == Insert) {
	DT.insertEdge(From, To);
	PDT.insertEdge(From, To);
	} else {
	DT.deleteEdge(From, To);
	PDT.deleteEdge(From, To);
	}

	EXPECT_TRUE(DT.verify());
	EXPECT_TRUE(PDT.verify());
	}
	}
	}
	+
	+TEST(DominatorTree, InsertIntoIrreducible) {
	+ std::vector<CFGBuilder::Arc> Arcs = {
	+ {"0", "1"},
	+ {"1", "27"}, {"1", "7"},
	+ {"10", "18"},
	+ {"13", "10"},
	+ {"18", "13"}, {"18", "23"},
	+ {"23", "13"}, {"23", "24"},
	+ {"24", "1"}, {"24", "18"},
	+ {"27", "24"}};
	+
	+ CFGHolder Holder;
	+ CFGBuilder B(Holder.F, Arcs, {{Insert, {"7", "23"}}});
	+ DominatorTree DT(*Holder.F);
	+ EXPECT_TRUE(DT.verify());
	+
	+ B.applyUpdate();
	+ BasicBlock *From = B.getOrAddBlock("7");
	+ BasicBlock *To = B.getOrAddBlock("23");
	+ DT.insertEdge(From, To);
	+
	+ EXPECT_TRUE(DT.verify());
	+}
	+
	Index: vendor/llvm/dist-release_60/utils/release/test-release.sh
	===================================================================
	--- vendor/llvm/dist-release_60/utils/release/test-release.sh (revision 328361)
	+++ vendor/llvm/dist-release_60/utils/release/test-release.sh (revision 328362)
	@@ -1,601 +1,609 @@
	#!/usr/bin/env bash
	#===-- test-release.sh - Test the LLVM release candidates ------------------===#
	#
	# The LLVM Compiler Infrastructure
	#
	# This file is distributed under the University of Illinois Open Source
	# License.
	#
	#===------------------------------------------------------------------------===#
	#
	# Download, build, and test the release candidate for an LLVM release.
	#
	#===------------------------------------------------------------------------===#

	System=`uname -s`
	if [ "$System" = "FreeBSD" ]; then
	MAKE=gmake
	else
	MAKE=make
	fi

	# Base SVN URL for the sources.
	Base_url="http://llvm.org/svn/llvm-project"

	Release=""
	Release_no_dot=""
	RC=""
	Triple=""
	use_gzip="no"
	do_checkout="yes"
	do_debug="no"
	do_asserts="no"
	do_compare="yes"
	do_rt="yes"
	do_libs="yes"
	+do_libcxxabi="yes"
	do_libunwind="yes"
	do_test_suite="yes"
	do_openmp="yes"
	do_lld="yes"
	do_lldb="no"
	do_polly="yes"
	BuildDir="`pwd`"
	ExtraConfigureFlags=""
	ExportBranch=""

	function usage() {
	echo "usage: `basename $0` -release X.Y.Z -rc NUM [OPTIONS]"
	echo ""
	echo " -release X.Y.Z The release version to test."
	echo " -rc NUM The pre-release candidate number."
	echo " -final The final release candidate."
	echo " -triple TRIPLE The target triple for this machine."
	echo " -j NUM Number of compile jobs to run. [default: 3]"
	echo " -build-dir DIR Directory to perform testing in. [default: pwd]"
	echo " -no-checkout Don't checkout the sources from SVN."
	echo " -test-debug Test the debug build. [default: no]"
	echo " -test-asserts Test with asserts on. [default: no]"
	echo " -no-compare-files Don't test that phase 2 and 3 files are identical."
	echo " -use-gzip Use gzip instead of xz."
	echo " -configure-flags FLAGS Extra flags to pass to the configure step."
	echo " -svn-path DIR Use the specified DIR instead of a release."
	echo " For example -svn-path trunk or -svn-path branches/release_37"
	echo " -no-rt Disable check-out & build Compiler-RT"
	echo " -no-libs Disable check-out & build libcxx/libcxxabi/libunwind"
	+ echo " -no-libcxxabi Disable check-out & build libcxxabi"
	echo " -no-libunwind Disable check-out & build libunwind"
	echo " -no-test-suite Disable check-out & build test-suite"
	echo " -no-openmp Disable check-out & build libomp"
	echo " -no-lld Disable check-out & build lld"
	echo " -lldb Enable check-out & build lldb"
	echo " -no-lldb Disable check-out & build lldb (default)"
	echo " -no-polly Disable check-out & build Polly"
	}

	while [ $# -gt 0 ]; do
	case $1 in
	-release \| --release )
	shift
	Release="$1"
	Release_no_dot="`echo $1 \| sed -e 's,\.,,g'`"
	;;
	-rc \| --rc \| -RC \| --RC )
	shift
	RC="rc$1"
	;;
	-final \| --final )
	RC=final
	;;
	-svn-path \| --svn-path )
	shift
	Release="test"
	Release_no_dot="test"
	ExportBranch="$1"
	RC="`echo $ExportBranch \| sed -e 's,/,_,g'`"
	echo "WARNING: Using the branch $ExportBranch instead of a release tag"
	echo " This is intended to aid new packagers in trialing "
	echo " builds without requiring a tag to be created first"
	;;
	-triple \| --triple )
	shift
	Triple="$1"
	;;
	-configure-flags \| --configure-flags )
	shift
	ExtraConfigureFlags="$1"
	;;
	-j* )
	NumJobs="`echo $1 \| sed -e 's,-j$[0-9]*$,\1,g'`"
	if [ -z "$NumJobs" ]; then
	shift
	NumJobs="$1"
	fi
	;;
	-build-dir \| --build-dir \| -builddir \| --builddir )
	shift
	BuildDir="$1"
	;;
	-no-checkout \| --no-checkout )
	do_checkout="no"
	;;
	-test-debug \| --test-debug )
	do_debug="yes"
	;;
	-test-asserts \| --test-asserts )
	do_asserts="yes"
	;;
	-no-compare-files \| --no-compare-files )
	do_compare="no"
	;;
	-use-gzip \| --use-gzip )
	use_gzip="yes"
	;;
	-no-rt )
	do_rt="no"
	;;
	-no-libs )
	do_libs="no"
	;;
	+ -no-libcxxabi )
	+ do_libcxxabi="no"
	+ ;;
	-no-libunwind )
	do_libunwind="no"
	;;
	-no-test-suite )
	do_test_suite="no"
	;;
	-no-openmp )
	do_openmp="no"
	;;
	-no-lld )
	do_lld="no"
	;;
	-lldb )
	do_lldb="yes"
	;;
	-no-lldb )
	do_lldb="no"
	;;
	-no-polly )
	do_polly="no"
	;;
	-help \| --help \| -h \| --h \| -\? )
	usage
	exit 0
	;;
	* )
	echo "unknown option: $1"
	usage
	exit 1
	;;
	esac
	shift
	done

	# Check required arguments.
	if [ -z "$Release" ]; then
	echo "error: no release number specified"
	exit 1
	fi
	if [ -z "$RC" ]; then
	echo "error: no release candidate number specified"
	exit 1
	fi
	if [ -z "$ExportBranch" ]; then
	ExportBranch="tags/RELEASE_$Release_no_dot/$RC"
	fi
	if [ -z "$Triple" ]; then
	echo "error: no target triple specified"
	exit 1
	fi

	# Figure out how many make processes to run.
	if [ -z "$NumJobs" ]; then
	NumJobs=`sysctl -n hw.activecpu 2> /dev/null \|\| true`
	fi
	if [ -z "$NumJobs" ]; then
	NumJobs=`sysctl -n hw.ncpu 2> /dev/null \|\| true`
	fi
	if [ -z "$NumJobs" ]; then
	NumJobs=`grep -c processor /proc/cpuinfo 2> /dev/null \|\| true`
	fi
	if [ -z "$NumJobs" ]; then
	NumJobs=3
	fi

	# Projects list
	projects="llvm cfe clang-tools-extra"
	if [ $do_rt = "yes" ]; then
	projects="$projects compiler-rt"
	fi
	if [ $do_libs = "yes" ]; then
	- projects="$projects libcxx libcxxabi"
	+ projects="$projects libcxx"
	+ if [ $do_libcxxabi = "yes" ]; then
	+ projects="$projects libcxxabi"
	+ fi
	if [ $do_libunwind = "yes" ]; then
	projects="$projects libunwind"
	fi
	fi
	case $do_test_suite in
	yes\|export-only)
	projects="$projects test-suite"
	;;
	esac
	if [ $do_openmp = "yes" ]; then
	projects="$projects openmp"
	fi
	if [ $do_lld = "yes" ]; then
	projects="$projects lld"
	fi
	if [ $do_lldb = "yes" ]; then
	projects="$projects lldb"
	fi
	if [ $do_polly = "yes" ]; then
	projects="$projects polly"
	fi

	# Go to the build directory (may be different from CWD)
	BuildDir=$BuildDir/$RC
	mkdir -p $BuildDir
	cd $BuildDir

	# Location of log files.
	LogDir=$BuildDir/logs
	mkdir -p $LogDir

	# Final package name.
	Package=clang+llvm-$Release
	if [ $RC != "final" ]; then
	Package=$Package-$RC
	fi
	Package=$Package-$Triple

	# Errors to be highlighted at the end are written to this file.
	echo -n > $LogDir/deferred_errors.log

	function deferred_error() {
	Phase="$1"
	Flavor="$2"
	Msg="$3"
	echo "[${Flavor} Phase${Phase}] ${Msg}" \| tee -a $LogDir/deferred_errors.log
	}

	# Make sure that a required program is available
	function check_program_exists() {
	local program="$1"
	if ! type -P $program > /dev/null 2>&1 ; then
	echo "program '$1' not found !"
	exit 1
	fi
	}

	if [ "$System" != "Darwin" ]; then
	check_program_exists 'chrpath'
	check_program_exists 'file'
	check_program_exists 'objdump'
	fi

	# Make sure that the URLs are valid.
	function check_valid_urls() {
	for proj in $projects ; do
	echo "# Validating $proj SVN URL"

	if ! svn ls $Base_url/$proj/$ExportBranch > /dev/null 2>&1 ; then
	echo "$proj does not have a $ExportBranch branch/tag!"
	exit 1
	fi
	done
	}

	# Export sources to the build directory.
	function export_sources() {
	check_valid_urls

	for proj in $projects ; do
	case $proj in
	llvm)
	projsrc=$proj.src
	;;
	cfe)
	projsrc=llvm.src/tools/clang
	;;
	lld\|lldb\|polly)
	projsrc=llvm.src/tools/$proj
	;;
	clang-tools-extra)
	projsrc=llvm.src/tools/clang/tools/extra
	;;
	compiler-rt\|libcxx\|libcxxabi\|libunwind\|openmp)
	projsrc=llvm.src/projects/$proj
	;;
	test-suite)
	projsrc=$proj.src
	;;
	*)
	echo "error: unknown project $proj"
	exit 1
	;;
	esac

	if [ -d $projsrc ]; then
	echo "# Reusing $proj $Release-$RC sources in $projsrc"
	continue
	fi
	echo "# Exporting $proj $Release-$RC sources to $projsrc"
	if ! svn export -q $Base_url/$proj/$ExportBranch $projsrc ; then
	echo "error: failed to export $proj project"
	exit 1
	fi
	done

	cd $BuildDir
	}

	function configure_llvmCore() {
	Phase="$1"
	Flavor="$2"
	ObjDir="$3"

	case $Flavor in
	Release )
	BuildType="Release"
	Assertions="OFF"
	;;
	Release+Asserts )
	BuildType="Release"
	Assertions="ON"
	;;
	Debug )
	BuildType="Debug"
	Assertions="ON"
	;;
	* )
	echo "# Invalid flavor '$Flavor'"
	echo ""
	return
	;;
	esac

	echo "# Using C compiler: $c_compiler"
	echo "# Using C++ compiler: $cxx_compiler"

	cd $ObjDir
	echo "# Configuring llvm $Release-$RC $Flavor"

	echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \
	cmake -G "Unix Makefiles" \
	-DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \
	$ExtraConfigureFlags $BuildDir/llvm.src \
	2>&1 \| tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log
	env CC="$c_compiler" CXX="$cxx_compiler" \
	cmake -G "Unix Makefiles" \
	-DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \
	$ExtraConfigureFlags $BuildDir/llvm.src \
	2>&1 \| tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log

	cd $BuildDir
	}

	function build_llvmCore() {
	Phase="$1"
	Flavor="$2"
	ObjDir="$3"
	DestDir="$4"

	cd $ObjDir
	echo "# Compiling llvm $Release-$RC $Flavor"
	echo "# ${MAKE} -j $NumJobs VERBOSE=1"
	${MAKE} -j $NumJobs VERBOSE=1 \
	2>&1 \| tee $LogDir/llvm.make-Phase$Phase-$Flavor.log

	echo "# Installing llvm $Release-$RC $Flavor"
	echo "# ${MAKE} install"
	${MAKE} install \
	DESTDIR="${DestDir}" \
	2>&1 \| tee $LogDir/llvm.install-Phase$Phase-$Flavor.log
	cd $BuildDir
	}

	function test_llvmCore() {
	Phase="$1"
	Flavor="$2"
	ObjDir="$3"

	cd $ObjDir
	if ! ( ${MAKE} -j $NumJobs -k check-all \
	2>&1 \| tee $LogDir/llvm.check-Phase$Phase-$Flavor.log ) ; then
	deferred_error $Phase $Flavor "check-all failed"
	fi

	if [ $do_test_suite = 'yes' ]; then
	cd $TestSuiteBuildDir
	env CC="$c_compiler" CXX="$cxx_compiler" \
	cmake $TestSuiteSrcDir -DTEST_SUITE_LIT=$Lit
	if ! ( ${MAKE} -j $NumJobs -k check \
	2>&1 \| tee $LogDir/llvm.check-Phase$Phase-$Flavor.log ) ; then
	deferred_error $Phase $Flavor "test suite failed"
	fi
	fi
	cd $BuildDir
	}

	# Clean RPATH. Libtool adds the build directory to the search path, which is
	# not necessary --- and even harmful --- for the binary packages we release.
	function clean_RPATH() {
	if [ "$System" = "Darwin" ]; then
	return
	fi
	local InstallPath="$1"
	for Candidate in `find $InstallPath/{bin,lib} -type f`; do
	if file $Candidate \| grep ELF \| egrep 'executable\|shared object' > /dev/null 2>&1 ; then
	if rpath=`objdump -x $Candidate \| grep 'RPATH'` ; then
	rpath=`echo $rpath \| sed -e's/^ RPATH //'`
	if [ -n "$rpath" ]; then
	newrpath=`echo $rpath \| sed -e's/.$\$ORIGIN[^:]$.*/\1/'`
	chrpath -r $newrpath $Candidate 2>&1 > /dev/null 2>&1
	fi
	fi
	fi
	done
	}

	# Create a package of the release binaries.
	function package_release() {
	cwd=`pwd`
	cd $BuildDir/Phase3/Release
	mv llvmCore-$Release-$RC.install/usr/local $Package
	if [ "$use_gzip" = "yes" ]; then
	tar cfz $BuildDir/$Package.tar.gz $Package
	else
	tar cfJ $BuildDir/$Package.tar.xz $Package
	fi
	mv $Package llvmCore-$Release-$RC.install/usr/local
	cd $cwd
	}

	# Exit if any command fails
	# Note: pipefail is necessary for running build commands through
	# a pipe (i.e. it changes the output of ``false \| tee /dev/null ; echo $?``)
	set -e
	set -o pipefail

	if [ "$do_checkout" = "yes" ]; then
	export_sources
	fi

	# Setup the test-suite. Do this early so we can catch failures before
	# we do the full 3 stage build.
	if [ $do_test_suite = "yes" ]; then
	SandboxDir="$BuildDir/sandbox"
	Lit=$SandboxDir/bin/lit
	TestSuiteBuildDir="$BuildDir/test-suite-build"
	TestSuiteSrcDir="$BuildDir/test-suite.src"

	virtualenv $SandboxDir
	$SandboxDir/bin/python $BuildDir/llvm.src/utils/lit/setup.py install
	mkdir -p $TestSuiteBuildDir
	fi

	(
	Flavors="Release"
	if [ "$do_debug" = "yes" ]; then
	Flavors="Debug $Flavors"
	fi
	if [ "$do_asserts" = "yes" ]; then
	Flavors="$Flavors Release+Asserts"
	fi

	for Flavor in $Flavors ; do
	echo ""
	echo ""
	echo "********************************************************************************"
	echo " Release: $Release-$RC"
	echo " Build: $Flavor"
	echo " System Info: "
	echo " `uname -a`"
	echo "********************************************************************************"
	echo ""

	c_compiler="$CC"
	cxx_compiler="$CXX"
	llvmCore_phase1_objdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.obj
	llvmCore_phase1_destdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.install

	llvmCore_phase2_objdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.obj
	llvmCore_phase2_destdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.install

	llvmCore_phase3_objdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.obj
	llvmCore_phase3_destdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.install

	rm -rf $llvmCore_phase1_objdir
	rm -rf $llvmCore_phase1_destdir

	rm -rf $llvmCore_phase2_objdir
	rm -rf $llvmCore_phase2_destdir

	rm -rf $llvmCore_phase3_objdir
	rm -rf $llvmCore_phase3_destdir

	mkdir -p $llvmCore_phase1_objdir
	mkdir -p $llvmCore_phase1_destdir

	mkdir -p $llvmCore_phase2_objdir
	mkdir -p $llvmCore_phase2_destdir

	mkdir -p $llvmCore_phase3_objdir
	mkdir -p $llvmCore_phase3_destdir

	############################################################################
	# Phase 1: Build llvmCore and clang
	echo "# Phase 1: Building llvmCore"
	configure_llvmCore 1 $Flavor $llvmCore_phase1_objdir
	build_llvmCore 1 $Flavor \
	$llvmCore_phase1_objdir $llvmCore_phase1_destdir
	clean_RPATH $llvmCore_phase1_destdir/usr/local

	########################################################################
	# Phase 2: Build llvmCore with newly built clang from phase 1.
	c_compiler=$llvmCore_phase1_destdir/usr/local/bin/clang
	cxx_compiler=$llvmCore_phase1_destdir/usr/local/bin/clang++
	echo "# Phase 2: Building llvmCore"
	configure_llvmCore 2 $Flavor $llvmCore_phase2_objdir
	build_llvmCore 2 $Flavor \
	$llvmCore_phase2_objdir $llvmCore_phase2_destdir
	clean_RPATH $llvmCore_phase2_destdir/usr/local

	########################################################################
	# Phase 3: Build llvmCore with newly built clang from phase 2.
	c_compiler=$llvmCore_phase2_destdir/usr/local/bin/clang
	cxx_compiler=$llvmCore_phase2_destdir/usr/local/bin/clang++
	echo "# Phase 3: Building llvmCore"
	configure_llvmCore 3 $Flavor $llvmCore_phase3_objdir
	build_llvmCore 3 $Flavor \
	$llvmCore_phase3_objdir $llvmCore_phase3_destdir
	clean_RPATH $llvmCore_phase3_destdir/usr/local

	########################################################################
	# Testing: Test phase 3
	c_compiler=$llvmCore_phase3_destdir/usr/local/bin/clang
	cxx_compiler=$llvmCore_phase3_destdir/usr/local/bin/clang++
	echo "# Testing - built with clang"
	test_llvmCore 3 $Flavor $llvmCore_phase3_objdir

	########################################################################
	# Compare .o files between Phase2 and Phase3 and report which ones
	# differ.
	if [ "$do_compare" = "yes" ]; then
	echo
	echo "# Comparing Phase 2 and Phase 3 files"
	for p2 in `find $llvmCore_phase2_objdir -name '*.o'` ; do
	p3=`echo $p2 \| sed -e 's,Phase2,Phase3,'`
	# Substitute 'Phase2' for 'Phase3' in the Phase 2 object file in
	# case there are build paths in the debug info. On some systems,
	# sed adds a newline to the output, so pass $p3 through sed too.
	if ! cmp -s \
	<(env LC_CTYPE=C sed -e 's,Phase2,Phase3,g' -e 's,Phase1,Phase2,g' $p2) \
	<(env LC_CTYPE=C sed -e '' $p3) 16 16; then
	echo "file `basename $p2` differs between phase 2 and phase 3"
	fi
	done
	fi
	done

	) 2>&1 \| tee $LogDir/testing.$Release-$RC.log

	if [ "$use_gzip" = "yes" ]; then
	echo "# Packaging the release as $Package.tar.gz"
	else
	echo "# Packaging the release as $Package.tar.xz"
	fi
	package_release

	set +e

	# Woo hoo!
	echo "### Testing Finished ###"
	echo "### Logs: $LogDir"

	echo "### Errors:"
	if [ -s "$LogDir/deferred_errors.log" ]; then
	cat "$LogDir/deferred_errors.log"
	exit 1
	else
	echo "None."
	fi

	exit 0

File Metadata

Mime Type: application/octet-stream
Expires: Thu, May 30, 5:26 AM (1 d, 23 h)
Storage Engine: chunks
Storage Format: Chunks
Storage Handle: mcSHh_lvar4J
Default Alt Text: (6 MB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions